### Content-based, collaborative and hybrid recommender for courses at Aalto University
This notebook shows how a simple content-based, collaborative and hybrid recommender can be built on real course data. 
We use course descriptions and the courses taken by students in the past.
The output is a file with the K most similar courses for each of the D courses. Hence, it is a DxK matrix.

**Note: this notebook doesn't input any data, since this is private data. It only shows how to make the model.**



### Load packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

### Define variables
Set these to match your datasets and file structure.

In [None]:
# Path of the course description CSV file and the delimiter used inside it.
file_path_courses = ""
delimiter_courses = ""

In [None]:
# Column names in the course description file; they are renamed to
# "code", "name" and "description" after loading.
coursecode_col = ""
coursename_col = ""
coursedescript_col = ""

In [None]:
# Path of the historical data CSV file and the delimiter used inside it.
file_path_hist = ""
delimiter_hist = ""

In [None]:
# Column names in the historical data file; they are renamed to "ID" and "code" after loading.
hist_studentid_col = ""
hist_coursecode_col = ""

In [None]:
# File the course-by-student matrix is written to.
file_path_hist_bin = "../Data/history_binary.csv"
# Location used when saving the result CSV files.
path_results = "../Data/Results"

### Load course data

In [None]:
# Load the course description data, rename the configured columns to the fixed
# names "code", "name" and "description", and index the rows by course code.
# The "code" column is kept as a regular column too (drop=False).
df_courses = (
    pd.read_csv(file_path_courses, delimiter=delimiter_courses)
    .rename(
        columns={
            coursecode_col: "code",
            coursename_col: "name",
            coursedescript_col: "description",
        }
    )
    .set_index("code", drop=False)
)

In [None]:
# Lookup table holding the code and the name of every course in the course description data.
df_codename = df_courses.loc[:, ["code", "name"]]

In [None]:
# #read the course description data
# df_courses.set_index('Code',inplace=True)
# df_courses.index.name='code'

In [None]:
# Report how many courses the course description data contains.
print("Number of courses in course description data:", len(df_codename))

### Load historical data

In [None]:
# Read the historical data from the configured CSV file.
df_hist = pd.read_csv(
    file_path_hist,
    delimiter=delimiter_hist,
)

In [None]:
# Report the number of rows in the historical data.
print("Number of courses taken by all students:", df_hist.shape[0])

In [None]:
# Keep only the student id and course code columns and give them fixed names.
df_hist = df_hist.loc[:, [hist_studentid_col, hist_coursecode_col]]
df_hist.columns = ["ID", "code"]

### Create matrix indicating which students have taken which courses
Create the DxN dataframe, where D is the number of courses and N the number of students.
The entries are binary: 1 indicates the student has taken that course, 0 indicates the student has not taken that course.

In [None]:
#Note: this takes rather long to create, so execute once, and then use the cell below to load the file

#merge dataframes such that only courses are left which are both in the historical and course description data
df_hist_both = pd.merge(df_hist, df_codename, how='inner', on=['code'])

#crosstab counts occurrences, so a student with several records for the same course
#(or a course code listed more than once in the description data) would give entries above 1.
#Clip at 1 so the matrix really is binary.
#crashes when doing with too large dataset
df_hist_bin=pd.crosstab(df_hist_both["code"], df_hist_both["ID"]).clip(upper=1)
df_hist_bin.to_csv(file_path_hist_bin)

In [None]:
# #read the csv history_dataframe
# #this is a DxN matrix where D is number of courses and N number of users
# #binary entries, 1 indicating user has token that course, 0 indicating student has not taken that course.
# df_history=pd.read_csv(file_path_hist_bin,index_col="code")

### Define similarity measures
For content, collaborative and hybrid

In [None]:
def get_sim_measure_content (df,input_columns,sim='cos',stopwords='english',smooth=True,sublin=False,tokenize=None):
    """
    Calculate the content similarity between every pair of rows of a dataframe.

    The text of the selected columns is joined per row, turned into TF-IDF vectors
    and compared with the chosen similarity measure.

    df: the dataframe. It is only read, never modified.
    input_columns: the columns of df whose text we want to consider
    sim: the similarity measure. Now only implemented for cosine similarity ('cos').
    stopwords: passed to TfidfVectorizer as stop_words
    smooth: whether to use smooth idf
    sublin: whether to use sublinear TF
    tokenize: passed to TfidfVectorizer as tokenizer

    Returns a dataframe with the similarity scores, with df.index as both index and columns.
    Raises ValueError if sim is not a supported similarity measure.
    """

    #fail early with a clear message instead of an UnboundLocalError at the return statement
    if sim!='cos':
        raise ValueError("Unsupported similarity measure: %r (only 'cos' is implemented)" % (sim,))

    #build one string per row out of all the columns we are interested in.
    #this is kept in a local variable: the input dataframe (and any global one) stays untouched.
    #missing values are treated as empty text so that the join does not fail on NaN.
    combined=df[input_columns].fillna('').astype(str).apply(lambda row: ' '.join(row), axis=1)

    #define the tf-idf vectorizer
    tfidf_all = TfidfVectorizer(stop_words=stopwords,smooth_idf=smooth,sublinear_tf=sublin,tokenizer=tokenize)
    #get the tf-idf score for each word in the combined text of each row
    tfidf_matrix_all = tfidf_all.fit_transform(combined)

    #calculate the cosine similarity
    sim_matrix=cosine_similarity(tfidf_matrix_all,tfidf_matrix_all)

    return pd.DataFrame(data=sim_matrix, index= df.index, columns= df.index)

In [None]:
def get_sim_measure_collaberative (df, norm=True,sim='cos'):
    """
    Calculate the similarity between every pair of rows of a dataframe.

    df: the dataframe. The similarity is computed between its rows, and its columns
        are what gets normalized when norm is True.
    norm: whether to normalize the dataframe
    sim: the similarity measure. Now only implemented for cosine similarity ('cos')
         and cooccurence ('cooccur').

    Returns a dataframe with the similarity scores, with df.index as both index and columns.
    Raises ValueError if sim is not a supported similarity measure.
    """

    #fail early with a clear message instead of an UnboundLocalError at the return statement
    if sim not in ('cos','cooccur'):
        raise ValueError("Unsupported similarity measure: %r (use 'cos' or 'cooccur')" % (sim,))

    if norm:
        #normalize for the fact that some users have taken more courses than others.
        #the resulting dataframe is not binary, but contains values between 0 and 1.
        #the magnitude of a column is sqrt(sum of squares); taking the square of the sum
        #first would just give back the plain column sum.
        magnitude=np.sqrt(np.square(df).sum(axis=0))
        #a column with only zeros has magnitude 0; divide it by 1 so it stays zero instead of NaN
        magnitude=magnitude.where(magnitude>0,1.0)
        df=df.divide(magnitude)

    #it is a rather large matrix, so much faster when converting it into a sparse matrix
    mat_df=sparse.csr_matrix(df.values)
    if sim=='cos':
        sim_matrix=cosine_similarity(mat_df,mat_df)
    else:
        #toarray gives a plain ndarray rather than the np.matrix returned by todense
        sim_matrix=(mat_df@mat_df.T).toarray()

    #df.index is here the course codes
    return pd.DataFrame(data=sim_matrix, index= df.index, columns= df.index)

In [None]:
def get_sim_measure_hybrid (df_content,df_collab, input_columns,norm=True,sim_con='cos',sim_col='cos',stopwords='english',smooth=True,sublin=False,tokenize=None):
    """
    Combine the content and the collaborative similarity into one hybrid similarity.

    The two similarity tables are stacked and the rows with the same index label are summed.

    df_content: dataframe used for the content similarity
    df_collab: dataframe used for the collaborative similarity
    input_columns, sim_con, stopwords, smooth, sublin, tokenize: forwarded to get_sim_measure_content
    norm, sim_col: forwarded to get_sim_measure_collaberative
    """
    #the concat does not cope well with differing indices and columns,
    #so the content part is limited to the rows whose index also occurs in the collaborative data
    in_collab=df_content.index.isin(df_collab.index)
    content_shared=df_content[in_collab]

    sim_content=get_sim_measure_content(
        content_shared,
        input_columns=input_columns,
        sim=sim_con,
        stopwords=stopwords,
        smooth=smooth,
        sublin=sublin,
        tokenize=tokenize,
    )
    sim_collab=get_sim_measure_collaberative(df_collab,norm=norm,sim=sim_col)

    #stack both tables and add up the rows that carry the same label
    stacked=pd.concat([sim_content,sim_collab],join='outer')
    return stacked.groupby(level=0).sum()

### Extract most similar courses for each course

In [None]:
def get_mostsimilar(user_input,df):
    """
    Get the courses sorted on similarity to the input course
    user_input: the course code of the course we are interested in
    df: dataframe with similarity scores

    Returns (codes, scores): the labels of the row df.loc[user_input] ordered by
    descending score, and the list of the matching scores.
    """

    #look the row up once and sort the courses by descending score
    courses_sorted = df.loc[user_input].sort_values(ascending=False)

    return courses_sorted.index,list(courses_sorted)

In [None]:
def mostsim_allcourses(df_sim,indices,number_courses=3):
    """
    Create a dataframe which has the most similar courses to each course in the input dataframe
    df_sim: dataframe with similarity between all courses
    indices: dataframe which has the code and name of all courses of df_sim, indexed by course code
    number_courses: the amount of most similar courses we want to return for each course.

    Returns a copy of indices without the 'code' column and with the extra columns
    sim1..simK, holding the names of the K most similar courses (most similar first).
    When fewer than K other courses are available the remaining columns are NaN.
    """

    k=number_courses
    #one column for every similar course
    columns_sim=['sim'+str(i) for i in range(1,k+1)]
    names=indices['name']

    #collect the names of the most similar courses, one list per course, in the order of indices
    rows=[]
    for code in indices.index:
        #get the similar courses
        sim_codes,_=get_mostsimilar(code,df_sim)
        #remove the course itself by label: with tied scores or an all-zero row
        #it is not guaranteed to be the first entry of the sorted result
        top_codes=sim_codes[sim_codes!=code][:k]
        top_names=list(names.loc[top_codes].values)
        #pad when there are not enough other courses
        top_names.extend([np.nan]*(k-len(top_names)))
        rows.append(top_names)

    #use the indices dataframe as basis and add the similarity columns as whole columns,
    #which avoids writing strings into float NaN cells one row at a time
    df_mostsim=indices.copy()
    for position,column in enumerate(columns_sim):
        df_mostsim[column]=[row[position] for row in rows]

    return df_mostsim.drop(columns=['code'])

In [None]:
# Collaborative recommendations, for the described courses whose code occurs in the historical data.
df_simall_collaberative = mostsim_allcourses(
    df_sim=get_sim_measure_collaberative(df_hist_bin),
    indices=df_codename[df_codename["code"].isin(df_hist["code"])],
)

In [None]:
# Content-based recommendations, using the course name and description text.
df_simall_content = mostsim_allcourses(
    df_sim=get_sim_measure_content(df_courses, ["name", "description"]),
    indices=df_codename,
)

In [None]:
# Hybrid recommendations, for the described courses whose code occurs in the historical data.
df_simall_hybrid = mostsim_allcourses(
    df_sim=get_sim_measure_hybrid(df_courses, df_hist_bin, ["name", "description"]),
    indices=df_codename[df_codename["code"].isin(df_hist["code"])],
)

In [None]:
# Display the (rows, columns) shape of the hybrid result table.
df_simall_hybrid.shape

### Save results to csv

In [None]:
import os

# path_results is a directory: join it with the file names instead of concatenating the
# strings, which would produce "<...>Resultsdf_simcourses_*.csv" when the path has no
# trailing separator. Create the directory first so that to_csv does not fail.
if path_results:
    os.makedirs(path_results, exist_ok=True)

df_simall_hybrid.to_csv(os.path.join(path_results, "df_simcourses_hybrid.csv"))
df_simall_content.to_csv(os.path.join(path_results, "df_simcourses_content.csv"))
df_simall_collaberative.to_csv(os.path.join(path_results, "df_simcourses_collaberative.csv"))