In [1]:
from statistics import harmonic_mean

import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

ModuleNotFoundError: No module named 'numpy'

In [2]:
# Load the course table from the CSV file in the working directory and preview
# its first rows.
df = pd.read_csv('coursea_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,134,(ISC)² Systems Security Certified Practitioner...,(ISC)²,SPECIALIZATION,4.7,Beginner,5.3k
1,743,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,COURSE,4.7,Intermediate,17k
2,874,A Crash Course in Data Science,Johns Hopkins University,COURSE,4.5,Mixed,130k
3,413,A Law Student's Toolkit,Yale University,COURSE,4.7,Mixed,91k
4,635,A Life of Happiness and Fulfillment,Indian School of Business,COURSE,4.8,Mixed,320k


In [3]:
# Remove the leftover 'Unnamed: 0' column and the organization column, then
# preview the trimmed frame.
df = df.drop(columns=['Unnamed: 0', 'course_organization'])
df.head()

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,(ISC)² Systems Security Certified Practitioner...,SPECIALIZATION,4.7,Beginner,5.3k
1,A Crash Course in Causality: Inferring Causal...,COURSE,4.7,Intermediate,17k
2,A Crash Course in Data Science,COURSE,4.5,Mixed,130k
3,A Law Student's Toolkit,COURSE,4.7,Mixed,91k
4,A Life of Happiness and Fulfillment,COURSE,4.8,Mixed,320k


In [4]:
# Count how often each unit suffix (last character, e.g. 'k' or 'm') occurs in
# the enrollment strings. The vectorized .str accessor replaces a row-wise
# Python lambda and yields NaN instead of raising on missing/empty values.
df['course_students_enrolled'].str[-1].value_counts()

k    887
m      4
Name: course_students_enrolled, dtype: int64

In [5]:
# Keep only the rows whose enrollment string ends in 'k'.
# NOTE(review): this discards the rows ending in 'm' (presumably millions of
# students, i.e. the most popular courses) — confirm that is intended.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
df = df[df.course_students_enrolled.str.endswith('k')].copy()

In [6]:
# Turn enrollment strings such as '5.3k' into numbers (5300.0): strip the
# one-character suffix, parse the remainder as a float and scale by 1000.
# Float parsing replaces the original eval(): the strings come from a data file
# and eval() would execute arbitrary expressions found there.
# assign() returns a new frame, so no chained-assignment warning is raised even
# when `df` is a filtered slice.
df = df.assign(
    course_students_enrolled=df['course_students_enrolled'].str[:-1].astype(float) * 1000
)
df

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,(ISC)² Systems Security Certified Practitioner...,SPECIALIZATION,4.7,Beginner,5300.0
1,A Crash Course in Causality: Inferring Causal...,COURSE,4.7,Intermediate,17000.0
2,A Crash Course in Data Science,COURSE,4.5,Mixed,130000.0
3,A Law Student's Toolkit,COURSE,4.7,Mixed,91000.0
4,A Life of Happiness and Fulfillment,COURSE,4.8,Mixed,320000.0
...,...,...,...,...,...
886,Программирование на Python,SPECIALIZATION,4.5,Intermediate,52000.0
887,Психолингвистика (Psycholinguistics),COURSE,4.8,Mixed,21000.0
888,Разработка интерфейсов: вёрстка и JavaScript,SPECIALIZATION,4.5,Intermediate,30000.0
889,Русский как иностранный,SPECIALIZATION,4.6,Intermediate,9800.0


In [7]:
# Fit a min-max scaler on the rating and enrollment columns, then transform the
# same two columns; `scaled_ratings` holds them in that column order.
minmax_scaler = MinMaxScaler().fit(df[['course_rating','course_students_enrolled']])
scaled_ratings = minmax_scaler.transform(df[['course_rating','course_students_enrolled']])

In [8]:
# Replace both source columns with their scaled counterparts (column 0 is the
# rating, column 1 the enrollment), then combine them per row with a harmonic
# mean into a single 'overall_rating' score.
df[['course_rating', 'course_students_enrolled']] = scaled_ratings
df['overall_rating'] = df[['course_rating','course_students_enrolled']].apply(harmonic_mean, axis=1)

In [9]:
# Display the frame with the scaled columns and the derived 'overall_rating'.
df

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled,overall_rating
0,(ISC)² Systems Security Certified Practitioner...,SPECIALIZATION,0.823529,Beginner,0.004587,0.009122
1,A Crash Course in Causality: Inferring Causal...,COURSE,0.823529,Intermediate,0.018709,0.036586
2,A Crash Course in Data Science,COURSE,0.705882,Mixed,0.155100,0.254319
3,A Law Student's Toolkit,COURSE,0.823529,Mixed,0.108027,0.190999
4,A Life of Happiness and Fulfillment,COURSE,0.882353,Mixed,0.384430,0.535534
...,...,...,...,...,...,...
886,Программирование на Python,SPECIALIZATION,0.705882,Intermediate,0.060954,0.112217
887,Психолингвистика (Psycholinguistics),COURSE,0.882353,Mixed,0.023537,0.045850
888,Разработка интерфейсов: вёрстка и JavaScript,SPECIALIZATION,0.705882,Intermediate,0.034400,0.065602
889,Русский как иностранный,SPECIALIZATION,0.764706,Intermediate,0.010018,0.019777


In [10]:
# langdetect is non-deterministic unless seeded; fixing the seed makes the set
# of rows kept by this filter reproducible from run to run.
DetectorFactory.seed = 0

# Keep only the courses whose title is detected as English ('en').
df = df[df.course_title.apply(lambda title : detect(title) == 'en')]

In [11]:
# Build the TF-IDF vocabulary from the remaining course titles (English stop
# words removed) and keep one TF-IDF row vector per course in `vectors`.
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(df['course_title'])

In [12]:
# def recommend_by_course_title (title, recomm_count=10) : 
#     title_vector = vectorizer.transform([title])
#     cosine_sim = cosine_similarity(vectors, title_vector)
#     idx = np.argsort(np.array(cosine_sim[:,0]))[-recomm_count:]
#     sdf = df.iloc[idx].sort_values(by='overall_rating', ascending=False)
#     return sdf

In [49]:
%%writefile util.py

def recommend_by_course_title_1(title, recomm_count=5):
    """Recommend the courses whose titles are most similar to ``title``.

    The query is vectorized with ``vectorizer`` and compared by cosine
    similarity against every row of ``vectors``. The ``recomm_count`` most
    similar rows are looked up in ``df`` and ordered by ``overall_rating``,
    best first.

    The function reads these names from the enclosing module/notebook
    namespace: ``vectorizer``, ``vectors``, ``df``, ``np``,
    ``cosine_similarity`` and ``MinMaxScaler``.

    Args:
        title: Free-text course title (or keyword) to match against.
        recomm_count: Maximum number of courses to return. ``0`` yields an
            empty result.

    Returns:
        DataFrame with the columns ``course_title``,
        ``course_Certificate_type``, ``course_difficulty`` and
        ``overall_rating``. ``overall_rating`` is min-max rescaled to 0-5
        relative to the returned rows only: the best returned course gets 5.0
        and the worst gets 0.0, so values are not comparable between calls.

    Raises:
        ValueError: If ``recomm_count`` is negative.

    Side effects:
        Overwrites ``recommendations.json`` in the current working directory
        with the returned rows (``orient='records'``).
    """
    if recomm_count < 0:
        raise ValueError("recomm_count must be non-negative")

    # Transform input title into a vector
    title_vector = vectorizer.transform([title])

    # Cosine similarity between the query and every course title
    similarity = np.array(cosine_similarity(vectors, title_vector)[:, 0])

    # Indices sorted by ascending similarity; the best matches are at the end.
    # An explicit start offset is used because ranked[-recomm_count:] returns
    # *all* rows when recomm_count == 0 (-0 == 0).
    ranked = np.argsort(similarity)
    idx = ranked[max(len(ranked) - recomm_count, 0):]

    # Select the recommended courses and order them by their overall rating
    sdf = df.iloc[idx].sort_values(by='overall_rating', ascending=False).reset_index(drop=True)

    # Stretch 'overall_rating' of the selected rows onto 0-5 (relative to this
    # result set). Skipped for an empty selection, which the scaler rejects.
    if len(sdf) > 0:
        scaler = MinMaxScaler(feature_range=(0, 5))
        sdf['overall_rating'] = scaler.fit_transform(sdf[['overall_rating']]).ravel()

    sdf_result = sdf[['course_title', 'course_Certificate_type', 'course_difficulty', 'overall_rating']]
    # Persist the recommendations as JSON records
    sdf_result.to_json('recommendations.json', orient='records')

    return sdf_result

Writing util.py


## Recommendations

In [42]:
# List the distinct course titles that can be used as queries.
df.course_title.unique()

array(['(ISC)² Systems Security Certified Practitioner (SSCP)',
       'A Crash Course in Causality:  Inferring Causal Effects from Observational Data',
       'A Crash Course in Data Science', "A Law Student's Toolkit",
       'A Life of Happiness and Fulfillment',
       'ADHD: Everyday Strategies for Elementary Students',
       'AI For Everyone', 'AI For Medical Treatment',
       'AI Foundations for Everyone', 'AI for Medicine',
       'AWS Fundamentals: Addressing Security Risk',
       'AWS Fundamentals: Building Serverless Applications',
       'AWS Fundamentals: Going Cloud-Native',
       'AWS Fundamentals: Migrating to the Cloud',
       'Aboriginal Worldviews and Education', 'Academic English: Writing',
       'Accelerated Computer Science Fundamentals', 'Access Controls',
       'Accounting Analytics', 'Accounting for Decision Making',
       'Achieving Personal and Professional Success',
       'Addiction Treatment: Clinical Skills for Healthcare Providers',
       'Advan

In [50]:
# Example: query with a title that appears in the dataset.
recommend_by_course_title_1(title="A Crash Course in Data Science")

Unnamed: 0,course_title,course_Certificate_type,course_difficulty,overall_rating
0,Introduction to Data Science,SPECIALIZATION,Beginner,5.0
1,What is Data Science?,COURSE,Beginner,4.479985
2,A Crash Course in Data Science,COURSE,Mixed,2.344963
3,Crash Course on Python,COURSE,Beginner,1.470129
4,A Crash Course in Causality: Inferring Causal...,COURSE,Intermediate,0.0


In [51]:
# Example: query with a Russian title.
# NOTE(review): the vectorizer was fitted on titles detected as English, so
# presumably only the token 'Python' contributes to the match — confirm.
recommend_by_course_title_1(title='Программирование на Python')

Unnamed: 0,course_title,course_Certificate_type,course_difficulty,overall_rating
0,Introduction to Data Science in Python,COURSE,Intermediate,5.0
1,Machine Learning with Python,COURSE,Intermediate,1.729496
2,Python Basics,COURSE,Beginner,1.569335
3,Data Analysis with Python,COURSE,Beginner,1.547999
4,Statistics with Python,SPECIALIZATION,Beginner,0.0


In [38]:
# Example: another exact title from the dataset.
recommend_by_course_title_1(title="A Law Student's Toolkit")

Unnamed: 0,course_title,course_Certificate_type,course_difficulty,overall_rating
0,A Law Student's Toolkit,COURSE,Mixed,5.0
1,The Manager's Toolkit: A Practical Guide to Ma...,COURSE,Mixed,2.676802
2,An Introduction to American Law,COURSE,Beginner,1.310533
3,Corporate & Commercial Law I: Contracts & Empl...,COURSE,Intermediate,0.519654
4,Healthcare Law,SPECIALIZATION,Intermediate,0.0


In [40]:
# Example: title sharing the generic word 'Introduction' with many courses.
recommend_by_course_title_1(title='Introduction to Sustainability')

Unnamed: 0,course_title,course_Certificate_type,course_difficulty,overall_rating
0,Introduction to Programming in C,SPECIALIZATION,Beginner,5.0
1,Introduction to Sustainability,COURSE,Mixed,4.283035
2,Strategy and Sustainability,COURSE,Beginner,0.601123
3,Corporate Sustainability. Understanding and Se...,COURSE,Mixed,0.532857
4,Beyond the Sustainable Development Goals (SDGs...,COURSE,Mixed,0.0


In [52]:
# Example: single-keyword query instead of a full title.
recommend_by_course_title_1(title='psychology')

Unnamed: 0,course_title,course_Certificate_type,course_difficulty,overall_rating
0,Introduction to Psychology,COURSE,Beginner,5.0
1,Social Psychology,COURSE,Beginner,3.952864
2,Positive Psychology,COURSE,Mixed,3.218164
3,Foundations of Positive Psychology,SPECIALIZATION,Beginner,1.280402
4,Positive Psychology: Resilience Skills,COURSE,Beginner,0.0
