In [15]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from statistics import harmonic_mean
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
df = pd.read_csv('coursea_data.csv')  # Replace with the actual path if it's in a subfolder

# Drop the irrelevant columns
df.drop(['Unnamed: 0', 'course_organization'], axis=1, inplace=True)

# Display the DataFrame to check if it's loaded correctly
df.head()

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,(ISC)² Systems Security Certified Practitioner...,SPECIALIZATION,4.7,Beginner,5.3k
1,A Crash Course in Causality: Inferring Causal...,COURSE,4.7,Intermediate,17k
2,A Crash Course in Data Science,COURSE,4.5,Mixed,130k
3,A Law Student's Toolkit,COURSE,4.7,Mixed,91k
4,A Life of Happiness and Fulfillment,COURSE,4.8,Mixed,320k


In [17]:
df


Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,(ISC)² Systems Security Certified Practitioner...,SPECIALIZATION,4.7,Beginner,5.3k
1,A Crash Course in Causality: Inferring Causal...,COURSE,4.7,Intermediate,17k
2,A Crash Course in Data Science,COURSE,4.5,Mixed,130k
3,A Law Student's Toolkit,COURSE,4.7,Mixed,91k
4,A Life of Happiness and Fulfillment,COURSE,4.8,Mixed,320k
...,...,...,...,...,...
886,Программирование на Python,SPECIALIZATION,4.5,Intermediate,52k
887,Психолингвистика (Psycholinguistics),COURSE,4.8,Mixed,21k
888,Разработка интерфейсов: вёрстка и JavaScript,SPECIALIZATION,4.5,Intermediate,30k
889,Русский как иностранный,SPECIALIZATION,4.6,Intermediate,9.8k


In [18]:
df.course_students_enrolled.apply(lambda count : count[-1]).value_counts()

course_students_enrolled
k    887
m      4
Name: count, dtype: int64

In [19]:
df = df[df.course_students_enrolled.str.endswith('k')]


In [20]:
df['course_students_enrolled'] = df['course_students_enrolled'].apply(lambda enrolled : eval(enrolled[:-1]) * 1000)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['course_students_enrolled'] = df['course_students_enrolled'].apply(lambda enrolled : eval(enrolled[:-1]) * 1000)


Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,(ISC)² Systems Security Certified Practitioner...,SPECIALIZATION,4.7,Beginner,5300.0
1,A Crash Course in Causality: Inferring Causal...,COURSE,4.7,Intermediate,17000.0
2,A Crash Course in Data Science,COURSE,4.5,Mixed,130000.0
3,A Law Student's Toolkit,COURSE,4.7,Mixed,91000.0
4,A Life of Happiness and Fulfillment,COURSE,4.8,Mixed,320000.0
...,...,...,...,...,...
886,Программирование на Python,SPECIALIZATION,4.5,Intermediate,52000.0
887,Психолингвистика (Psycholinguistics),COURSE,4.8,Mixed,21000.0
888,Разработка интерфейсов: вёрстка и JavaScript,SPECIALIZATION,4.5,Intermediate,30000.0
889,Русский как иностранный,SPECIALIZATION,4.6,Intermediate,9800.0


In [22]:
minmax_scaler = MinMaxScaler()
scaled_ratings = minmax_scaler.fit_transform(df[['course_rating','course_students_enrolled']])

In [23]:
df['course_rating'] = scaled_ratings[:,0]
df['course_students_enrolled'] = scaled_ratings[:,1]
df['overall_rating'] = df[['course_rating','course_students_enrolled']].apply(lambda row : harmonic_mean(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['course_rating'] = scaled_ratings[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['course_students_enrolled'] = scaled_ratings[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['overall_rating'] = df[['course_rating','course_students_enrolled']].apply(lambda row : harmonic_mean(row),

In [24]:
df

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled,overall_rating
0,(ISC)² Systems Security Certified Practitioner...,SPECIALIZATION,0.823529,Beginner,0.004587,0.009122
1,A Crash Course in Causality: Inferring Causal...,COURSE,0.823529,Intermediate,0.018709,0.036586
2,A Crash Course in Data Science,COURSE,0.705882,Mixed,0.155100,0.254319
3,A Law Student's Toolkit,COURSE,0.823529,Mixed,0.108027,0.190999
4,A Life of Happiness and Fulfillment,COURSE,0.882353,Mixed,0.384430,0.535534
...,...,...,...,...,...,...
886,Программирование на Python,SPECIALIZATION,0.705882,Intermediate,0.060954,0.112217
887,Психолингвистика (Psycholinguistics),COURSE,0.882353,Mixed,0.023537,0.045850
888,Разработка интерфейсов: вёрстка и JavaScript,SPECIALIZATION,0.705882,Intermediate,0.034400,0.065602
889,Русский как иностранный,SPECIALIZATION,0.764706,Intermediate,0.010018,0.019777


In [25]:
df = df[df.course_title.apply(lambda title : detect(title) == 'en')]

In [26]:
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(df.course_title)

In [27]:
def recommend_by_course_title (title, recomm_count=10) : 
    title_vector = vectorizer.transform([title])
    cosine_sim = cosine_similarity(vectors, title_vector)
    idx = np.argsort(np.array(cosine_sim[:,0]))[-recomm_count:]
    sdf = df.iloc[idx].sort_values(by='overall_rating', ascending=False)
    return sdf

In [28]:
recommend_by_course_title('A Crash Course in Data Science')

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled,overall_rating
487,Introduction to Data Science in Python,COURSE,0.705882,Intermediate,0.46892,0.563503
486,Introduction to Data Science,SPECIALIZATION,0.764706,Beginner,0.37236,0.500843
864,What is Data Science?,COURSE,0.823529,Beginner,0.31201,0.452559
54,Applied Data Science,SPECIALIZATION,0.764706,Beginner,0.26373,0.392199
711,SQL for Data Science,COURSE,0.764706,Beginner,0.19131,0.306053
2,A Crash Course in Data Science,COURSE,0.705882,Mixed,0.1551,0.254319
825,Tools for Data Science,COURSE,0.764706,Beginner,0.14303,0.240986
171,Crash Course on Python,COURSE,0.882353,Beginner,0.095957,0.173089
1,A Crash Course in Causality: Inferring Causal...,COURSE,0.823529,Intermediate,0.018709,0.036586
594,Mathematics for Data Science,SPECIALIZATION,0.705882,Beginner,0.012674,0.0249


In [29]:
recommend_by_course_title('machine learning')

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled,overall_rating
563,Machine Learning,SPECIALIZATION,0.764706,Intermediate,0.34822,0.478533
200,Data Science: Statistics and Machine Learning,SPECIALIZATION,0.647059,Intermediate,0.25166,0.36238
28,Advanced Machine Learning,SPECIALIZATION,0.705882,Advanced,0.22752,0.344122
57,Applied Machine Learning in Python,COURSE,0.764706,Intermediate,0.17924,0.29041
595,Mathematics for Machine Learning,SPECIALIZATION,0.764706,Beginner,0.17924,0.29041
570,Machine Learning with Python,COURSE,0.823529,Intermediate,0.14303,0.243729
568,Machine Learning for Business Professionals,COURSE,0.764706,Intermediate,0.068196,0.125224
566,Machine Learning and Reinforcement Learning in...,SPECIALIZATION,0.235294,Intermediate,0.033193,0.058178
567,Machine Learning for All,COURSE,0.764706,Beginner,0.021123,0.04111
569,Machine Learning for Trading,SPECIALIZATION,0.352941,Intermediate,0.016295,0.031151


In [30]:
recommend_by_course_title('english')

Unnamed: 0,course_title,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled,overall_rating
261,English for Career Development,COURSE,0.882353,Mixed,0.91551,0.898626
18,Academic English: Writing,SPECIALIZATION,0.823529,Beginner,0.64997,0.726528
260,English for Business and Entrepreneurship,COURSE,0.882353,Beginner,0.2758,0.420243
259,English Composition I,COURSE,0.764706,Beginner,0.23959,0.364864
262,English for Journalism,COURSE,0.882353,Mixed,0.1551,0.263824
111,Business English Communication Skills,SPECIALIZATION,0.823529,Intermediate,0.14303,0.243729
112,Business English: Networking,COURSE,0.823529,Intermediate,0.091129,0.164099
110,Business English,SPECIALIZATION,0.823529,Intermediate,0.044056,0.083637
548,Learn English,SPECIALIZATION,0.588235,Beginner,0.024744,0.047489
810,The Pronunciation of American English,SPECIALIZATION,0.764706,Beginner,0.000241,0.000483
