In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import random



In [30]:
# Load the preprocessed Coursera course details. The bare name on the last
# line makes the notebook render the frame as this cell's output.
courses_csv_path = '../../HO1/data/coursera_course_details_preprocessed.csv'
df = pd.read_csv(courses_csv_path)
df

Unnamed: 0,Title,Partner,Subject,Description,Rating,Level,Duration (in hour),Skills,URL
0,microsoft business analyst professional certif...,Microsoft,Business Strategy,launch career business analyst build jobready ...,4.6,Beginner level,120.0,business process modeling pivot table chart bu...,https://www.coursera.org/professional-certific...
1,data engineering foundation specialization,IBM,Data Management,build foundation data engineering career devel...,4.7,Beginner level,80.0,database design database management system big...,https://www.coursera.org/specializations/data-...
2,psychological first aid,Johns Hopkins University,Psychology,psychological first aid,4.8,Beginner level,6.0,trauma care crisis intervention triage stress ...,https://www.coursera.org/learn/psychological-f...
3,tableau business intelligence analyst professi...,Tableau Learning Partner,Data Analysis,launch career data analytics build indemand sk...,4.7,Beginner level,320.0,spatial data analysis data visualization softw...,https://www.coursera.org/professional-certific...
4,think like cfo specialization,IESE Business School,Finance,think like cfo specialization,4.8,Beginner level,40.0,balance sheet working capital capital budgetin...,https://www.coursera.org/specializations/think...
...,...,...,...,...,...,...,...,...,...
355,ibm business analyst professional certificate,IBM,Business Essentials,get jobready business analyst build essential ...,4.8,Beginner level,120.0,stakeholder engagement business analytics busi...,https://www.coursera.org/professional-certific...
356,adobe content creator professional certificate,Adobe,Marketing,launch career content creator build indemand s...,4.7,Beginner level,160.0,storytelling content marketing social medium i...,https://www.coursera.org/professional-certific...
357,google automation python professional certificate,Google,Support and Operations,learn indemand skill like python git automatio...,4.8,Advanced level,240.0,unit testing programming principle github inte...,https://www.coursera.org/professional-certific...
358,generative ai engineering llm specialization,IBM,Machine Learning,advance ml career gen ai llm master essential ...,4.5,Intermediate level,120.0,text mining large language modeling applied ma...,https://www.coursera.org/specializations/gener...


In [31]:
# Build one text field per course by joining title, description and skills
# with single spaces. Missing values are replaced by empty strings first so
# that a gap in one column does not blank out the whole combined text.
df['CombinedText'] = df['Title'].fillna('').str.cat(
    [df['Description'].fillna(''), df['Skills'].fillna('')],
    sep=' ',
)

In [32]:
# Sanity check: show the combined text of the row labelled 0.
df.loc[0, 'CombinedText']

'microsoft business analyst professional certificate launch career business analyst build jobready skill indemand career business analysis little month prior experience required get started business process modeling pivot table chart business analysis excel formula requirement analysis scrum software development user story requirement elicitation microsoft excel'

In [33]:
# Turn each course's combined text into a TF-IDF vector. English stop words
# are dropped and the vocabulary is capped at MAX_TFIDF_FEATURES terms.
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(
    max_features=MAX_TFIDF_FEATURES,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(df['CombinedText'])

In [34]:
# Expand the TF-IDF matrix into a dense frame whose columns are the fitted
# vocabulary terms, then export it without the row index.
feature_names = vectorizer.get_feature_names_out()
vectorized_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
vectorized_df.to_csv("result/vectorized_courses_1.csv", index=False)

In [35]:
# Render the dense TF-IDF frame (columns are the vectorizer's feature names)
# as this cell's output.
vectorized_df

Unnamed: 0,able,academic,academy,accelerate,accelerated,acceptance,access,accessibility,account,accountability,...,workplace,workspace,world,wrangling,write,writing,xml,yellow,youll,zapier
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Pairwise cosine similarity between the TF-IDF vectors of all courses;
# with a single input the matrix is compared against itself.
similarity_matrix = cosine_similarity(X=tfidf_matrix)

In [37]:
# Label both axes of the similarity matrix with the course titles and
# export it (row labels included) to CSV.
course_titles = df['Title']
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=course_titles,
    columns=course_titles,
)
similarity_df.to_csv("result/similarity_matrix_1.csv")

In [38]:
# Bundle the fitted vectorizer, the TF-IDF matrix and the course frame into
# one artifact and persist it with joblib. The dump call stays the last
# expression so its return value is shown as the cell output.
model = dict(
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    dataframe=df,
)

joblib.dump(model, "../models/course_recommender1.joblib")

['../models/course_recommender1.joblib']

In [39]:
def generate_top_recommendations(df, similarity_matrix, top_n=5):
    """Return, for every row of ``df``, the titles of its most similar rows.

    Rows are addressed by position, so row ``i`` of ``df`` is matched with
    ``similarity_matrix[i]`` regardless of the index labels ``df`` carries.

    Args:
        df: DataFrame with a ``'Title'`` column.
        similarity_matrix: Position-aligned square matrix (anything that
            supports ``similarity_matrix[i][j]``) where entry ``[i][j]`` is
            the similarity between rows ``i`` and ``j`` of ``df``.
        top_n: Maximum number of recommendations per row. Values below 1
            yield empty lists.

    Returns:
        A list with one entry per row of ``df``; each entry is a list of up
        to ``top_n`` titles ordered from most to least similar. A row is
        never recommended to itself, and equal scores keep ascending row
        order.
    """
    titles = df['Title'].tolist()
    limit = max(top_n, 0)
    recommendations = []
    for position in range(len(titles)):
        scores = similarity_matrix[position]
        # Exclude the row itself by position instead of assuming it sorts
        # first: duplicate or all-zero rows tie with the self-score and would
        # otherwise put a course into its own recommendations.
        candidates = [other for other in range(len(scores)) if other != position]
        # list.sort() is stable, so tied scores stay in ascending row order.
        candidates.sort(key=lambda other: scores[other], reverse=True)
        recommendations.append([titles[other] for other in candidates[:limit]])
    return recommendations

In [40]:
# Attach each course's recommendation list to the frame and export the
# title / recommendations pairs.
df['TopRecommendations'] = generate_top_recommendations(
    df,
    similarity_matrix,
    top_n=5,
)
df[['Title', 'TopRecommendations']].to_csv("result/top_recommendations_1.csv", index=False)

# Spot-check one randomly chosen course (no seed is set, so the pick
# changes from run to run).
sample_idx = random.randrange(len(df))
sample_course = df.iloc[sample_idx]
print(f"\nExample course: {sample_course['Title']}")
print("Top recommendations:")
for title in sample_course['TopRecommendations']:
    print(f" - {title}")


Example course: microsoft excel professional certificate
Top recommendations:
 - chatgpt excel aienhanced data analysis insight specialization
 - excel skill business specialization
 - ibm data analytics excel r professional certificate
 - data analysis visualization foundation specialization
 - ibm data analyst professional certificate
