# Importing Dependencies

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load the dataset

In [2]:
data = pd.read_csv("../data/courses.csv")
data.head(5)

Unnamed: 0,course_title,course_organization,course_certificate_type,course_time,course_rating,course_reviews_num,course_difficulty,course_url,course_students_enrolled,course_skills,course_summary,course_description
0,(ISC)² Systems Security Certified Practitioner...,ISC2,Specialization,3 - 6 Months,4.7,492.0,Beginner,https://www.coursera.org/specializations/sscp-...,6958.0,"['Risk Management', 'Access Control', 'Asset',...",[],Pursue better IT security job opportunities an...
1,.NET FullStack Developer,Board Infinity,Specialization,1 - 3 Months,4.3,51.0,Intermediate,https://www.coursera.org/specializations/dot-n...,2531.0,"['Web API', 'Web Development', 'Cascading Styl...",['Master .NET full stack web dev: from .NET co...,Develop the proficiency required to design and...
2,21st Century Energy Transition: how do we make...,University of Alberta,Course,1 - 3 Months,4.8,62.0,Beginner,https://www.coursera.org/learn/21st-century-en...,4377.0,[],['Understand the complexity of systems supplyi...,"Affordable, abundant and reliable energy is fu..."
3,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,Course,1 - 3 Months,4.7,517.0,Intermediate,https://www.coursera.org/learn/crash-course-in...,39004.0,"['Instrumental Variable', 'Propensity Score Ma...",[],We have all heard the phrase “correlation does...
4,A life with ADHD,University of Geneva,Course,1 - 3 Months,,,Beginner,https://www.coursera.org/learn/life-with-adhd,,"['differential diagnosis and comorbidities', '...",[' Understand what ADHD is and the challenges ...,What is ADHD and what are the challenges that ...


## Understanding the dataset

In [3]:
data.shape

(1000, 12)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   course_title              1000 non-null   object 
 1   course_organization       1000 non-null   object 
 2   course_certificate_type   1000 non-null   object 
 3   course_time               1000 non-null   object 
 4   course_rating             994 non-null    float64
 5   course_reviews_num        994 non-null    object 
 6   course_difficulty         1000 non-null   object 
 7   course_url                1000 non-null   object 
 8   course_students_enrolled  959 non-null    object 
 9   course_skills             1000 non-null   object 
 10  course_summary            1000 non-null   object 
 11  course_description        999 non-null    object 
dtypes: float64(1), object(11)
memory usage: 93.9+ KB


In [5]:
data.isnull().sum()

course_title                 0
course_organization          0
course_certificate_type      0
course_time                  0
course_rating                6
course_reviews_num           6
course_difficulty            0
course_url                   0
course_students_enrolled    41
course_skills                0
course_summary               0
course_description           1
dtype: int64

In [6]:
data["course_difficulty"].value_counts()

course_difficulty
Beginner        685
Intermediate    199
Mixed            80
Advanced         36
Name: count, dtype: int64

In [7]:
data["course_rating"].value_counts()

course_rating
4.7    291
4.8    289
4.6    173
4.9     86
4.5     81
4.4     34
4.3     14
4.2     10
3.9      4
4.0      3
5.0      3
3.8      1
3.0      1
2.8      1
2.7      1
3.7      1
3.3      1
Name: count, dtype: int64

In [8]:
data["course_organization"].value_counts()

course_organization
Google                                             96
IBM                                                78
University of Pennsylvania                         55
DeepLearning.AI                                    43
Google Cloud                                       35
                                                   ..
Google - Spectrum Sharing                           1
UNSW Sydney (The University of New South Wales)     1
University of Arizona                               1
Politecnico di Milano                               1
University of Houston                               1
Name: count, Length: 159, dtype: int64

In [9]:
data["course_title"]

0      (ISC)² Systems Security Certified Practitioner...
1                               .NET FullStack Developer
2      21st Century Energy Transition: how do we make...
3      A Crash Course in Causality:  Inferring Causal...
4                                       A life with ADHD
                             ...                        
995    Étudier en France: French Intermediate course ...
996    Цифровий маркетинг і електронна комерція від G...
997                           تحليلات البيانات من Google
998    用 Python 做商管程式設計（一）(Programming for Business C...
999    用 Python 做商管程式設計（二）(Programming for Business C...
Name: course_title, Length: 1000, dtype: object

# Required Columns for System

In [10]:
df = data[["course_title", "course_difficulty", "course_description", "course_skills"]]

In [11]:
data.head(5)

Unnamed: 0,course_title,course_organization,course_certificate_type,course_time,course_rating,course_reviews_num,course_difficulty,course_url,course_students_enrolled,course_skills,course_summary,course_description
0,(ISC)² Systems Security Certified Practitioner...,ISC2,Specialization,3 - 6 Months,4.7,492.0,Beginner,https://www.coursera.org/specializations/sscp-...,6958.0,"['Risk Management', 'Access Control', 'Asset',...",[],Pursue better IT security job opportunities an...
1,.NET FullStack Developer,Board Infinity,Specialization,1 - 3 Months,4.3,51.0,Intermediate,https://www.coursera.org/specializations/dot-n...,2531.0,"['Web API', 'Web Development', 'Cascading Styl...",['Master .NET full stack web dev: from .NET co...,Develop the proficiency required to design and...
2,21st Century Energy Transition: how do we make...,University of Alberta,Course,1 - 3 Months,4.8,62.0,Beginner,https://www.coursera.org/learn/21st-century-en...,4377.0,[],['Understand the complexity of systems supplyi...,"Affordable, abundant and reliable energy is fu..."
3,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,Course,1 - 3 Months,4.7,517.0,Intermediate,https://www.coursera.org/learn/crash-course-in...,39004.0,"['Instrumental Variable', 'Propensity Score Ma...",[],We have all heard the phrase “correlation does...
4,A life with ADHD,University of Geneva,Course,1 - 3 Months,,,Beginner,https://www.coursera.org/learn/life-with-adhd,,"['differential diagnosis and comorbidities', '...",[' Understand what ADHD is and the challenges ...,What is ADHD and what are the challenges that ...


## Data Preprocessing

In [12]:
# Filling NA values in course_description column to an empty string
data["course_description"].fillna(" ", inplace=True)

In [13]:
# Removing spaces and certain characters from the 'course_title' column
data.loc[:, "course_title"] = data["course_title"].apply(
    lambda x: x.replace(" ", "").replace(":", "")
)

# Removing spaces and certain characters from the 'course_description' column
data.loc[:, "course_description"] = data["course_description"].apply(
    lambda x: x.replace(" ", "")
    .replace("_", "")
    .replace(":", "")
    .replace("(", "")
    .replace(")", "")
)

# Removing parenthesis from the 'course_skills' column
data.loc[:, "course_skills"] = data["course_skills"].apply(
    lambda x: x.replace("(", "").replace(")", "")
)

In [14]:
data.head(5)

Unnamed: 0,course_title,course_organization,course_certificate_type,course_time,course_rating,course_reviews_num,course_difficulty,course_url,course_students_enrolled,course_skills,course_summary,course_description
0,(ISC)²SystemsSecurityCertifiedPractitioner(SSCP),ISC2,Specialization,3 - 6 Months,4.7,492.0,Beginner,https://www.coursera.org/specializations/sscp-...,6958.0,"['Risk Management', 'Access Control', 'Asset',...",[],PursuebetterITsecurityjobopportunitiesandprove...
1,.NETFullStackDeveloper,Board Infinity,Specialization,1 - 3 Months,4.3,51.0,Intermediate,https://www.coursera.org/specializations/dot-n...,2531.0,"['Web API', 'Web Development', 'Cascading Styl...",['Master .NET full stack web dev: from .NET co...,Developtheproficiencyrequiredtodesignanddevelo...
2,21stCenturyEnergyTransitionhowdowemakeitwork?,University of Alberta,Course,1 - 3 Months,4.8,62.0,Beginner,https://www.coursera.org/learn/21st-century-en...,4377.0,[],['Understand the complexity of systems supplyi...,"Affordable,abundantandreliableenergyisfundamen..."
3,ACrashCourseinCausalityInferringCausalEffectsf...,University of Pennsylvania,Course,1 - 3 Months,4.7,517.0,Intermediate,https://www.coursera.org/learn/crash-course-in...,39004.0,"['Instrumental Variable', 'Propensity Score Ma...",[],Wehaveallheardthephrase“correlationdoesnotequa...
4,AlifewithADHD,University of Geneva,Course,1 - 3 Months,,,Beginner,https://www.coursera.org/learn/life-with-adhd,,"['differential diagnosis and comorbidities', '...",[' Understand what ADHD is and the challenges ...,WhatisADHDandwhatarethechallengesthatcomewithi...


In [15]:
# Creating a new column 'tags' by concatenating values from different columns
data["tags"] = (
    data["course_title"]
    + data["course_difficulty"]
    + data["course_description"]
    + data["course_skills"]
)

data["tags"].iloc[1]

".NETFullStackDeveloperIntermediateDeveloptheproficiencyrequiredtodesignanddevelopcomprehensive,scalable,andhigh-performingapplicationswiththe.NETframeworkviathisin-depthspecialization.Thecurriculumissegmentedintothreeintensivecourses\r\n.NETFullStackFoundation\r\nUnderstandthebasicsof.NETanditssignificantcomponents.\r\nHarnessingtheC#programminglanguage,masteringeverythingfrombasicsyntaxtocomplexstructures.\r\nConstructingbothwebanddesktopapplicationswithanunparalleledunderstandingof.NET'scapacities.\r\nFrontendDevelopmentusingReact\r\nDesigningwebpagesemployingthecoreprinciplesofHTMLandCSS.\r\nUtilizingJavaScriptfordynamicandinteractivefunctionalities,coveringeverythingfrombasicvariablestocomplexfunctions.\r\nConstructingdynamicandinteractiveUIsusingReact'scoreconceptssuchascomponents,state,props,andJSX.\r\nBackendDevelopmentusingASP.NET\r\nMastertheASP.NETCoreframework,designedformodernwebapplications.\r\nExploreASP.NETMVCandcraftwebapplicationsfollowingMVCpatterns.\r\nDelveintoASP.

In [16]:
# Creating a new DataFrame 'new_df' based on existing data
new_df = data[["course_title", "tags"]].copy()

# Replacing commas in the 'tags' column with spaces
new_df["tags"] = data["tags"].str.replace(",", " ")

# Replacing commas in the 'course_title' column with spaces
new_df["course_title"] = data["course_title"].str.replace(",", " ")

# Renaming the 'course_title' column to 'course_name'
new_df.rename(columns={"course_title": "course_name"}, inplace=True)

# Converting the 'tags' column to string type
new_df["tags"] = new_df["tags"].astype(str)

# Lowercasing the values in the 'tags' column
new_df["tags"] = new_df["tags"].apply(lambda x: x.lower())

# Text Vectorization

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
cv = CountVectorizer(max_features=5000, stop_words="english")

In [19]:
vectors = cv.fit_transform(new_df["tags"]).toarray()

# Stemming Process

In [20]:
import nltk
from nltk.stem.porter import PorterStemmer

# Initialize the Porter stemmer
ps = PorterStemmer()


# Define the stemming function
def stem(text):
    stemmed_words = []

    # Tokenize the text and stem each word
    for word in text.split():
        stemmed_word = ps.stem(word)
        stemmed_words.append(stemmed_word)

    # Join the stemmed words back into a single string
    return " ".join(stemmed_words)


# Apply stemming to the 'tags' column
new_df["tags"] = new_df["tags"].apply(stem)

# Similarity Measure 

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
similarity = cosine_similarity(vectors)

# Recommendation Function

In [23]:
import re


def preprocess_text(text):
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and punctuation
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        # Remove extra whitespaces
        text = re.sub(r"\s+", " ", text).strip()
    return text


def recommend_course(input_string, df, cv, vectors):
    # Preprocess input string
    input_string = preprocess_text(input_string)
    input_string = stem(input_string)
    input_vector = cv.transform([input_string]).toarray()

    # Calculate similarity with input string
    input_similarity = cosine_similarity(input_vector, vectors)

    # Find top similar courses
    similar_courses = []
    for i in range(len(input_similarity[0])):
        similar_courses.append((df.iloc[i], input_similarity[0][i]))

    # Sort and return top recommendations
    similar_courses = sorted(similar_courses, reverse=True, key=lambda x: x[1])
    return similar_courses[:3]

In [24]:
recommended_courses = recommend_course("Python", data, cv, vectors)
for course_details, similarity_score in recommended_courses:
    print(course_details)
    print()

course_title                                    LearntoProgramTheFundamentals
course_organization                                     University of Toronto
course_certificate_type                                                Course
course_time                                                      1 - 3 Months
course_rating                                                             4.7
course_reviews_num                                                       6.2k
course_difficulty                                                    Beginner
course_url                    https://www.coursera.org/learn/learn-to-program
course_students_enrolled                                              421,737
course_skills               ['Python Syntax And Semantics', 'Computer Prog...
course_summary                                                             []
course_description          Behindeverymouseclickandtouch-screentap,therei...
tags                        LearntoProgramTheFundamentalsBeginne