## COURSE RECOMMENDER SYSTEM

In [None]:
 # Importing Basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing the course dataset
courses = pd.read_csv("Course_info.csv")

In [None]:
# Importing the comment dataset
comments = pd.read_csv("Comments.csv")

In [None]:
# Previewing the first two rows of the courses dataset
courses.head(2)

In [None]:
# Prefix the Udemy domain to every course_url.
# Only rows that do not already start with the domain are touched, so running
# this cell more than once does not prepend the prefix a second time.
UDEMY_BASE_URL = 'https://www.udemy.com'
# na=True treats missing URLs as "already done", leaving them as NaN.
needs_prefix = ~courses['course_url'].str.startswith(UDEMY_BASE_URL, na=True).astype(bool)
courses.loc[needs_prefix, 'course_url'] = UDEMY_BASE_URL + courses.loc[needs_prefix, 'course_url']

In [None]:
# Previewing the first two rows of the comments dataset
comments.head(2)

In [None]:
# Checking the column types
courses.info()

In [None]:
# Checking the descriptive statistics of numerical columns
courses.describe()

In [None]:
# Checking the descriptive statistics of categorical columns
courses.describe(include='O')

In [None]:
# Checking the column types
comments.info()

In [None]:
# Checking the missing values in the dataset
courses.isnull().sum()

In [None]:
# Checking the missing values in the dataset
comments.isnull().sum()

In [None]:
# Checking for duplicate rows in the dataset
courses.duplicated().sum()

# EXPLORATORY DATA ANALYSIS

In [None]:
total_courses = len(courses)

In [None]:
paid_courses = courses[courses['is_paid']==True]
free_courses = courses[courses['is_paid']==False]

In [None]:
total_enrollments = courses['num_subscribers'].sum()
paid_enrollments = paid_courses['num_subscribers'].sum()
free_enrollments = free_courses['num_subscribers'].sum()

In [None]:
# Headline counts for the catalogue, one "label  value" line each.
summary_rows = (
    ('Total Courses: ', total_courses),
    ('Total Enrollments: ', total_enrollments),
    ('Total Paid Courses: ', len(paid_courses)),
    ('Total Free Courses: ', len(free_courses)),
    ('Total Enrollments in Paid Courses: ', paid_enrollments),
    ('Total Enrollments in free Courses: ', free_enrollments),
)
for label, value in summary_rows:
    print(label, value)

In [None]:
# Distribution of free and paid courses
plt.figure(figsize=(5,4))
sns.countplot(x='is_paid', data=courses)

#ax = sns.countplot(x='is_paid', data=courses)
#for p in ax.patches: ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width()/2. + p.get_height()), ha='center', va='baselaine')

plt.title('Distribution of Paid and Free Courses')
plt.xlabel('Price (True: Paid, False: Free)')
plt.ylabel('Number of Courses')
plt.show()

In [None]:
# Price distribution for Paid Courses
plt.figure(figsize=(12,4))
sns.histplot(paid_courses['price'], bins=100, kde=True)
plt.title('Distribution of Prices of Paid Courses')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Courses with Highest Price
courses[courses['price'] == courses.price.max()][['title','price','instructor_name','language']]

In [None]:
# Distribution of Number of Subscribers based on Paid and Free Courses
plt.figure(figsize=(8,4))
sns.barplot(x= 'is_paid', y='num_subscribers', data= courses, estimator='sum', errorbar=None)

plt.title('Distribution of Number of Subscribers based on Paid and Free Courses')
plt.xlabel('Price (True: Paid, False: Free)')
plt.ylabel('Number of Subscribers')
plt.show()

In [None]:
# Distribution of the average course rating.
# histplot is axes-level, so it draws into the figure created on the next
# line. The figure-level displot opens its own figure, which would leave this
# 10x6 figure empty and ignore the requested size.
plt.figure(figsize=(10,6))
sns.histplot(courses['avg_rating'])
plt.xlabel('Average Rating Distribution')
plt.title('Distribution of Average Ratings')
plt.show()

In [None]:
# Top Five Courses with Maximum Subscriber
courses_sorted = courses.sort_values(by = 'num_subscribers', ascending = False)
top_5_courses = courses_sorted.head()

plt.figure(figsize=(5,4))
plt.barh(top_5_courses['title'], top_5_courses['num_subscribers'])
plt.xlabel('Number of Subscribers')
plt.ylabel('Courses Title')
plt.title('Top Five Courses with Maximum Subscriber')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Top Five Courses with Maximum Reviews
courses_sorted = courses.sort_values(by = 'num_reviews', ascending = False)
top_5_courses = courses_sorted.head()

plt.figure(figsize=(5,4))
plt.barh(top_5_courses['title'], top_5_courses['num_reviews'], color ='red')
plt.xlabel('Number of Reviews')
plt.ylabel('Courses Title')
plt.title('Top Five Courses with Maximum Reviews')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Top Five Courses with Maximum Comments
courses_sorted = courses.sort_values(by = 'num_comments', ascending = False)
top_5_courses = courses_sorted.head()

plt.figure(figsize=(5,4))
plt.barh(top_5_courses['title'], top_5_courses['num_comments'], color = 'red')
plt.xlabel('Number of Comments')
plt.ylabel('Courses Title')
plt.title('Top Five Courses with Maximum Comments')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Top Five Courses with Maximum Lectures
courses_sorted = courses.sort_values(by = 'num_lectures', ascending = False)
top_5_courses = courses_sorted.head()

plt.figure(figsize=(5,4))
plt.barh(top_5_courses['title'], top_5_courses['num_lectures'], color = 'red')
plt.xlabel('Number of Lectures')
plt.ylabel('Courses Title')
plt.title('Top Five Courses with Maximum Lectures')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Top Five Courses with Highest Content Length
courses_sorted = courses.sort_values(by = 'content_length_min', ascending = False)
top_5_courses = courses_sorted.head()

plt.figure(figsize=(5,4))
plt.barh(top_5_courses['title'], top_5_courses['content_length_min'], color = 'orange')
plt.xlabel('Content Length in Minutes')
plt.ylabel('Courses Title')
plt.title('Top Five Courses with Highest Content Length')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Summary statistics for Average Rating
rating_stats = courses['avg_rating'].describe()
print(rating_stats)

In [None]:
courses['avg_rating'].max()

In [None]:
courses_with_5_rating = courses[courses['avg_rating'] == 5]
courses_with_5_rating.sample(5)

In [None]:
print(courses_with_5_rating.shape)

In [None]:
courses[courses['avg_rating'] == courses.avg_rating.max()][['title','instructor_name','language']]

In [None]:
courses['category'].unique()

In [None]:
courses['subcategory'].unique()

In [None]:
categories_counts = courses['category'].value_counts()
print("Course Categories Distribution: ", categories_counts)

# Add a pie plot

In [None]:
subcategories_counts = courses.groupby(['category','subcategory']).size()
subcategories_counts

# Add a pie plot

In [None]:
# Count plot for the number of courses in different languages
plt.figure(figsize=(14, 6))
sns.countplot(data=courses, x='language', order=courses['language'].value_counts().index)
plt.xlabel("")
plt.ylabel("Number of courses in every language")
plt.title("Udemy Courses offered in different languages")
plt.xticks(rotation=90);

In [None]:
courses.language.value_counts()

In [None]:
courses.groupby(['language'])[['avg_rating', 'num_reviews', 'num_comments']].agg('mean').style.background_gradient(cmap='Wistia')

In [None]:
plt.figure(figsize = (14,6))
sns.countplot(x='instructor_name', data = courses,
              order = courses['instructor_name'].value_counts().iloc[:10].index)
plt.title('Top 10 Instructor with maximum Courses')
plt.xlabel('Instructor Name')
plt.ylabel('Number of Courses')
plt.xticks(rotation = 45)
plt.show()

In [None]:
courses['published_time'] = pd.to_datetime(courses['published_time'].str[0:10])
courses['published_year'] = courses['published_time'].dt.year

In [None]:
# Group the year column and perform operations for data visualization
courses_year_count = courses['published_year'].value_counts()

# One groupby object is shared by the per-year totals below.
by_year = courses.groupby('published_year')
courses_num_subs = by_year['num_subscribers'].sum()
courses_num_reviews = by_year['num_reviews'].sum()
courses_num_comments = by_year['num_comments'].sum()
courses_num_lectures = by_year['num_lectures'].sum()

# 'id' is counted per year (number of courses); the other columns are summed.
# The string name 'sum' selects pandas' own reduction; recent pandas versions
# emit a FutureWarning when np.sum is passed as the aggregator instead.
group_courses_year = pd.pivot_table(
    courses,
    index='published_year',
    values=['id', 'num_subscribers', 'num_reviews', 'num_comments', 'num_lectures'],
    aggfunc={
        'id': len,
        'num_subscribers': 'sum',
        'num_reviews': 'sum',
        'num_comments': 'sum',
        'num_lectures': 'sum',
    },
)

print('Year-wise count and sum values for different features: \n')
group_courses_year

In [None]:
# Visualization of year-wise feature data
# Maps each aggregated column to the label used on its chart.
dict_cols = {
    'id': 'Number of courses published',
    'num_subscribers': 'Total subscribers',
    'num_comments': 'Total comments',
    'num_reviews': 'Total reviews',
    'num_lectures': 'Total lectures',
}

df = group_courses_year
# One bar chart per aggregated column, with the year on the x axis.
for key in dict_cols:
    val = dict_cols[key]
    fig, ax = plt.subplots(figsize=(10,4))
    sns.barplot(data=df, x=df.index, y=df[key], palette='hls', ax=ax)
    ax.set(xlabel="Year", ylabel=val, title=f'{val} per year')
    plt.show()

In [None]:
# Dropping rows with any column having NaN.
comments = comments.dropna(axis=0).reset_index(drop=True)

In [None]:
# Mapping the course title from the courses dataframe to comments dataframe
dict_id = dict(zip(courses['id'], courses['title']))
comments['title'] = comments['course_id'].map(dict_id)

In [None]:
# New column 'review' based on 'rate' column.
# The bin edges 0 / 2.5 / 3.0 / 5.0 are paired with the labels
# Negative / Neutral / Positive, in that order.
# NOTE(review): pd.cut uses right-closed bins by default and does not include
# the lowest edge, so a rate of exactly 0 would get NaN here — confirm whether
# 0 can occur in the data.
comments['review'] = pd.cut(comments['rate'], bins=[0, 2.5, 3.0, 5.0], labels=['Negative', 'Neutral', 'Positive'])
# Show five randomly chosen rows as a spot check of the new column.
comments.sample(5)

In [None]:
# Share of positive and negative reviews
def autopct_format(values):
    """Build an ``autopct`` callback that labels pie wedges with whole percentages.

    Args:
        values: The wedge sizes given to the pie chart. Kept so existing calls
            keep working; the label shows only the percentage, so the sizes
            are not needed to produce it.

    Returns:
        A function that takes a wedge's percentage and returns it formatted
        with zero decimals and a trailing ``%`` (for example ``'33%'``).
    """
    def my_format(pct):
        return '{:.0f}%'.format(pct)
    return my_format

reviews_counts = comments['review'].value_counts()
explode= [0,0.1,0.1]
plt.pie(reviews_counts,labels = reviews_counts.index, explode=explode, autopct=autopct_format(reviews_counts))
plt.show()
print(f"\nNumber of Positive, Negative and Neutral Reviews:\n{reviews_counts}")

## POPULARITY BASED COURSE RECOMMENDER SYSTEM

In [None]:
# Dropping rows with any column having NaN
courses = courses.dropna(axis=0).reset_index(drop=True)

In [None]:
# Filtering courses with an average rating of 4.5 and above
popular_courses = courses.loc[courses['avg_rating']>=4.5]

In [None]:
# Keeping only English-language courses
languages_to_keep = ['English']
popular_courses = popular_courses.loc[popular_courses['language'].isin(languages_to_keep)].reset_index()

In [None]:
popular_50_courses = popular_courses.sort_values('num_subscribers', ascending=False).head(50)

In [None]:
popular_50_courses = popular_50_courses[['id', 'title', 'price', 'course_url', 'num_subscribers', 'avg_rating', 'content_length_min', 'published_time','instructor_name']]

In [None]:
popular_50_courses.head()

## CONTENT BASED COURSE RECOMMENDER SYSTEM

In [None]:
popular_courses.head()

In [None]:
popular_courses['tags'] = popular_courses['title'] + ' ' + popular_courses['headline'] + ' ' + popular_courses['instructor_name'] + ' ' + popular_courses['topic']

In [None]:
popular_courses = popular_courses.head(5000)

In [None]:
# Keep only the columns used by the content-based recommender.
# .copy() makes courses_crs an independent frame, so the later assignments to
# its 'tags' column modify courses_crs itself rather than a slice of
# popular_courses.
courses_crs = popular_courses[['id', 'title', 'tags']].copy()
courses_crs.sample(10)

In [None]:
# Normalise the tags: lower-case first, then strip punctuation.
courses_crs['tags'] = courses_crs['tags'].str.lower()
# regex=True is required: since pandas 2.0, str.replace treats the pattern as
# a literal string by default, which would leave every punctuation character
# in place. The raw string keeps \w and \s from being read as string escapes.
courses_crs['tags'] = courses_crs['tags'].str.replace(r"[^\w\s]", "", regex=True)

In [None]:
ps = PorterStemmer()

In [None]:
def stem(tags):
    """Return ``tags`` with each whitespace-separated token passed through ``ps.stem``.

    The stemmed tokens are joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in tags.split())

In [None]:
courses_crs['tags'] = courses_crs['tags'].apply(stem)

In [None]:
cv = CountVectorizer(max_features=500, stop_words='english')
vectors = cv.fit_transform(courses_crs['tags']).toarray()

In [None]:
vectors.shape

In [None]:
cv.get_feature_names_out()

In [None]:
similarity = cosine_similarity(vectors)

In [None]:
def recommend(course, top_n=6):
    """Return the courses most similar to ``course`` according to ``similarity``.

    Args:
        course: Exact title of a course present in ``courses_crs``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``[title, instructor_name, course_url]`` lists taken from
        ``popular_courses``, most similar first. The queried course itself is
        never part of the result.

    Raises:
        IndexError: If no row of ``courses_crs`` has the title ``course``.
    """
    # Use row positions throughout: ``similarity`` and ``.iloc`` are both
    # positional, so a label-based index would only work by coincidence.
    positions = (courses_crs['title'] == course).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f'course title not found: {course!r}')
    course_pos = int(positions[0])

    scores = similarity[course_pos]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Exclude the query by position instead of dropping the first hit: another
    # course with an identical score can sort ahead of it.
    neighbours = [pos for pos, _ in ranked if pos != course_pos][:top_n]

    data = []
    for pos in neighbours:
        # Read the row directly; filtering by title would return several rows
        # for duplicated titles and produce items of inconsistent length.
        row = popular_courses.iloc[pos]
        data.append([row['title'], row['instructor_name'], row['course_url']])

    return data

In [None]:
recommend('How to Train a Puppy')

In [None]:
import pickle

# Write the top-50 popularity table to disk. The with-block closes the file
# as soon as the dump finishes, so the data is flushed and the handle is
# released.
with open('popular_50_courses.pkl', 'wb') as fh:
    pickle.dump(popular_50_courses, fh)

In [None]:
# Write the content-based recommender's data to disk, one file per object.
# Each file is opened in a with-block so it is closed (and flushed) right
# after its dump.
for filename, artefact in (
    ('popular_courses.pkl', popular_courses),
    ('courses_crs.pkl', courses_crs),
    ('vectors.pkl', vectors),
):
    with open(filename, 'wb') as fh:
        pickle.dump(artefact, fh)

In [None]:
def searchterm(keyword, df, limit=6):
    """Return the most-subscribed courses whose title contains ``keyword``.

    The match is a case-insensitive, literal substring test: the keyword is
    not interpreted as a regular expression, so input such as ``"c++"`` is
    safe. Rows with a missing title never match.

    Args:
        keyword: Text to look for in the ``title`` column.
        df: DataFrame with at least ``title`` and ``num_subscribers`` columns.
        limit: Maximum number of rows to return.

    Returns:
        The matching rows of ``df`` sorted by ``num_subscribers`` in
        descending order, truncated to ``limit`` rows.
    """
    needle = str(keyword).lower()
    matches = df['title'].str.lower().str.contains(needle, regex=False, na=False)
    return df[matches].sort_values(by='num_subscribers', ascending=False).head(limit)

In [None]:
user_input = 'Python'
user_input = user_input.lower()
resultdf = searchterm(user_input, popular_courses)

# One [title, instructor_name, course_url] entry per distinct title, in the
# order of resultdf. Duplicate titles are collapsed to their first row, so
# every entry has exactly three fields.
unique_results = resultdf.drop_duplicates(subset='title')
data1 = [
    [row.title, row.instructor_name, row.course_url]
    for row in unique_results[['title', 'instructor_name', 'course_url']].itertuples(index=False)
]
    

In [None]:
data1