# Course Recommender

## Import Libraries

In [9]:
# data cleaning and EDA
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# build a model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

# text analysis
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

import math

## Bring in Clean Data

In [3]:
df = pd.read_csv('clean_data.csv')

In [4]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,name,instructor,level,category,subcategory,no_of_students,rating,no_of_rating,about,syllabus,language
0,0,Meditation: A way to achieve your goals in you...,Duck-Joo Lee,Beginner Level,Arts and Humanities,Philosophy,78489,4.6,843,Do we truly think that we have lived for ourse...,['Self- reflection is the methodology of medit...,English


## Data Cleaning/Wrangling

- Explain cleaning and wrangling procedures

In [5]:
STOPWORDS = set(stopwords.words('english')+ ['homework','quiz','week','version','want','introduction','use', 'learn'
             'what','have','well'])
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s")  # matches `'s` from text  
PATTERN_RN = re.compile("\\r\\n") #matches `\r` and `\n`
PATTERN_PUNC = re.compile(r"[^\w\s0-9]") # matches all non A-z whitespace

def clean_text(text):
    """
    Series of cleaning. String to lower case, remove non words characters and numbers.
        text (str): input text
    return (str): modified initial text
    """
    text = text.lower()  # lowercase text
    text = re.sub(PATTERN_S, ' ', text)
    text = re.sub(PATTERN_RN, ' ', text)
    text = re.sub(PATTERN_PUNC, ' ', text)
    return text

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=False):
    """
    Lemmatize, tokenize, crop and remove stop words.
    """
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    else:
        tokens = [w.lower() for w in word_tokenize(sentence)]
    tokens = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    return tokens    


def clean_syllabus(df):
    """
    Remove irrelavant characters (in new column clean_sentence).
    Lemmatize, tokenize words into list of words (in new column tok_lem_sentence).
    """
    df['clean_syllabus'] = df['syllabus'].apply(clean_text)
    df['clean_syl'] = df['clean_syllabus'].apply(
        lambda x: tokenizer(x, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True))
    return df
    
df = clean_syllabus(df)

In [6]:
len(STOPWORDS)

188

In [14]:
df['clean_syl'][0]

['reflection',
 'methodology',
 'meditation',
 'growth',
 'happiness',
 'human',
 'completion',
 'reflection',
 'academic',
 'research',
 'reflection',
 'graph',
 'ultimate',
 'purpose',
 'reflection',
 'learn',
 'principle',
 'reflection',
 'practice',
 'reflection',
 'subject',
 'methodology',
 'reflection',
 'practice',
 'reflection',
 'essential',
 'subject',
 'barrier',
 'relationship',
 'inferiority',
 'childhood',
 'emotion',
 'hatred',
 'since',
 'childhood',
 'worry',
 'response',
 'problem',
 'current',
 'recognize',
 'copied',
 'world',
 'world',
 'think',
 'living',
 'principle',
 'formation',
 'human',
 'recognition',
 'observed',
 'philosopher',
 'neuroscientist',
 'understanding',
 'reason',
 'human',
 'original',
 'learn',
 'practice',
 'methodology',
 'cleansing',
 'change',
 'methodology',
 'cleansing',
 'repetition',
 'reflection',
 'essential',
 'subject',
 'potential',
 'positive',
 'learn',
 'practice',
 'methodology',
 'reflection',
 'cleansing',
 'everyday',
 's

In [10]:
# tekrar düşün

df['abs_rating'] = round(df['rating'] * (df['no_of_rating'].apply(lambda x: math.log(x))),2)

In [213]:
def extract_best_indices(m, topk, mask=None):
    """
    Use sum of the cosine distance over all tokens.
    m (np.array): cos matrix of shape (nb_in_tokens, nb_dict_tokens)
    topk (int): number of indices to return (from high to lowest in order)
    """
    # return the sum on all tokens of cosinus for each sentence
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis=0) 
    else: 
        cos_sim = m
    index = np.argsort(cos_sim)[::-1] # from highest idx to smallest score 
    if mask is not None:
        assert mask.shape == m.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
    mask = np.logical_or(cos_sim[index] != 0, mask) #eliminate 0 cosine distance
    best_index = index[mask][:topk]
    if cos_sim.max() < 0.2:
        return [], cos_sim
    return best_index, cos_sim

In [224]:
# Adapt stop words
token_stop = tokenizer(' '.join(STOPWORDS), lemmatize=True)

# Fit TFIDF
vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer) 
tfidf_mat = vectorizer.fit_transform(df['syllabus'].values) # -> (num_sentences, num_vocabulary)
tfidf_mat.shape

(911, 36848)

In [225]:
tfidf_mat[0].data

array([0.03206104, 0.09206371, 0.01924251, 0.01262559, 0.02378162,
       0.02325072, 0.03399636, 0.02506207, 0.02962282, 0.06876045,
       0.01268014, 0.01382127, 0.02091406, 0.01632968, 0.01461547,
       0.02875258, 0.02867158, 0.02875258, 0.01358341, 0.0223388 ,
       0.08033139, 0.01946456, 0.10095621, 0.02033479, 0.01957958,
       0.01946456, 0.01604378, 0.03399636, 0.01551874, 0.03399636,
       0.01892769, 0.01216081, 0.00820885, 0.03064778, 0.03378374,
       0.0319772 , 0.0223388 , 0.04232537, 0.03468152, 0.08886845,
       0.07757829, 0.04182812, 0.03314282, 0.03648244, 0.07633235,
       0.08625773, 0.01203936, 0.09413599, 0.02859428, 0.03584785,
       0.02991309, 0.06799273, 0.05475889, 0.02028835, 0.06137581,
       0.04013911, 0.01994226, 0.03568467, 0.02965154, 0.11206721,
       0.0479665 , 0.19236623, 0.04279976, 0.0352112 , 0.02058311,
       0.06799273, 0.05363451, 0.06901769, 0.07134485, 0.03653001,
       0.01738006, 0.01461547, 0.04129973, 0.03399636, 0.03206

In [226]:
def get_recommendations_tfidf(sentence, tfidf_mat):
    
    """
    Return the database sentences in order of highest cosine similarity relatively to each 
    token of the target sentence. 
    """
    # Embed the query sentence
    tokens = [str(tok) for tok in word_tokenize(sentence)]
    vec = vectorizer.transform(tokens)
    # Create list with similarity between query and dataset
    mat = cosine_similarity(vec, tfidf_mat)
    # Best cosine distance for each token independantly
    #print(mat.shape)
    best_index, cos_sim = extract_best_indices(mat, topk=5)
    return best_index, cos_sim

query_sentence = 'Introduction to Structured Query Language'
best_index, cos_sim = get_recommendations_tfidf(query_sentence, tfidf_mat)
display(df[['name', 'subcategory','abs_rating']].iloc[best_index])

Unnamed: 0,name,subcategory,abs_rating


In [227]:
def recommender():
    response = input( "\033[1m" + """
                        Welcome to the Coursera course recommender! (It contains only free courses.)
                        Are you new here?(yes/no): """).lower().replace(" ", '')
    while response == 'yes':
        query_sentence = input("\033[1m" + """
                        I am glad you prefer studying today!
                        What do you want to learn? Give me some keywords that are important for you: 
                        """).lower()
        best_index = get_recommendations_tfidf(query_sentence, tfidf_mat)[0]
        if len(best_index)==0:
            response = input("\033[1m" + """
                        Sorry, couldnt find a good match. Do you want to try again? (yes/no)
                        """)
            if response == 'yes':
                print("\033[1m"+ """
                        This time please give more details.
                        """)
            if response == 'no':
                print("\033[1m"+ """
                        Please don't quit on studying. See you later!
                        """)
            
        elif len(best_index)!=0:
            print("\033[1m"+"""
                        We have three courses for you, choose according to your level: """)
            for i in list(best_index):
                course_name = df['name'].iloc[i]
                instructor = df['instructor'].iloc[i]
                level = df['level'].iloc[i]
                print("\033[1m"+ """ 
                            {}: 
                            {} 
                            by {} """.format(level,"\u0332".join(course_name),instructor))
                    
            response = 'no'
        

In [230]:
recommender()


                        Welcome to the Coursera course recommender! (It contains only free courses.)
                        Are you new here?(yes/no): yes

                        I am glad you prefer studying today!
                        What do you want to learn? Give me some keywords that are important for you: 
                        data science
[1m
                        We have three courses for you, choose according to your level: 
[1m 
                            Beginner Level: 
                            S̲o̲c̲i̲a̲l̲ ̲S̲c̲i̲e̲n̲c̲e̲ ̲A̲p̲p̲r̲o̲a̲c̲h̲e̲s̲ ̲t̲o̲ ̲t̲h̲e̲ ̲S̲t̲u̲d̲y̲ ̲o̲f̲ ̲C̲h̲i̲n̲e̲s̲e̲ ̲S̲o̲c̲i̲e̲t̲y̲ ̲P̲a̲r̲t̲ ̲1 
                            by Cameron Campbell 
[1m 
                            Beginner Level: 
                            P̲h̲i̲l̲o̲s̲o̲p̲h̲y̲,̲ ̲S̲c̲i̲e̲n̲c̲e̲ ̲a̲n̲d̲ ̲R̲e̲l̲i̲g̲i̲o̲n̲:̲ ̲S̲c̲i̲e̲n̲c̲e̲ ̲a̲n̲d̲ ̲P̲h̲i̲l̲o̲s̲o̲p̲h̲y 
                            by Dr J Adam Carter 
[1m 
                            Beginner Level: 
