In [64]:
import pandas as pd
import numpy as np

from CF_Helpers import cf_helpers as hlp

import os.path

# User-Based Collaborative Filtering

In [65]:
#load data from previous notebooks
train_df_cluster = pd.read_csv('C:~your_path~/RecSys/obtained_data/predicted_cluster_train.csv')
test_df_cluster = pd.read_csv('C:~your_path~/RecSys/obtained_data/predicted_cluster_test.csv')


In [66]:
train_df_cluster.tail(1)

Unnamed: 0,StudentID,12E050SCIENCES,12E051SCIENCES,5869,5870,71105,71120,71121,71133,74110,...,T208011,T208012,T208013,T208014,T208015,T208016,T208017,T208033,T214006,cluster
479,600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [67]:
# students and their respective cluster
student_cluster = pd.DataFrame({
    'StudentID': test_df_cluster['StudentID'].values,
    'cluster': test_df_cluster['cluster'].values
})

student_cluster.head()

Unnamed: 0,StudentID,cluster
0,2,0
1,6,0
2,9,0
3,16,0
4,17,0


In [68]:
clusters = student_cluster['cluster'].unique().tolist()
clusters # ClusterIDs used 

[0, 1]

### **The first essential step is to merge the test students to their respective training cluster, in order to recommend courses collaboratively.**

In [69]:
def collaborative_filtering_df(train_df, test_df, x): #joining test subjects to their respective cluster with train subjects
    df = hlp.filter_df_cluster(train_df, x)
    df_temp = hlp.filter_df_cluster(test_df, x)
    index_list = df_temp.StudentID.values
    df_concat = pd.concat([df, df_temp], ignore_index=True)
    df_concat = df_concat.set_index('StudentID')
    df_concat = df_concat.replace(0, np.NaN)
    return df_concat, index_list     #x is cluster number

In [70]:
def create_recommendation_input(train_df_cluster, test_df):
    list_cf_df = []
    student_cluster = pd.DataFrame({
        'StudentID': test_df['StudentID'].values,
        'cluster': test_df['cluster'].values})
    unique_clusters = student_cluster['cluster'].unique().tolist()
    for i in unique_clusters:
        df_temp = collaborative_filtering_df(train_df_cluster, test_df, i)
        list_cf_df.append(df_temp)
    return list_cf_df
    
# Output is a list of tuples(dataframe with merged train and test students, list of unique StudentIDs of test students)   
#_________
# Example of indexing:
# list_cf_df[0][0] #dataframe of respective cluster
# list_cf_df[0][1].tolist() # unique StudentID

In [71]:
list_cf_df = create_recommendation_input(train_df_cluster, test_df_cluster)

In [72]:
list_cf_df[1][0]['cluster'].unique().tolist()

[1]

In [73]:
list_cf_df[0][1].tolist()[:10] # StudentIDs

[2, 6, 9, 16, 17, 18, 20, 23, 38, 40]

In [74]:
student_cluster.StudentID.unique()[:10] #obtained IDs in list_cf_df is exactly the same as test subjects

array([ 2,  6,  9, 16, 17, 18, 20, 23, 38, 40], dtype=int64)

### **After Creating the Input for the RecSys Engine, Recommendations can be obtained**

In [75]:
# keeping track of the cluster and the test subject as well
# consider filtering mean_grades with a higher count than a precalculated 
# store two recommendation lists
#     1) highest mean_grades
#     2) Majority Vote
# int: x = number of recommendations to be obtained per student
# list_cf_df = list of tuples with dataframe that corresponds to output of create_recommendation_input()

def get_recommendations(list_cf_df, x):
    rec_mean_grades = pd.DataFrame()
    rec_majority = pd.DataFrame()
    for i in list_cf_df: # iteration over tuple of (df, index_list of test students added )
        df = i[0] # dataframe an der stelle i[0]
        cluster_threshold = int(len(i[0])* 0.1)
        print(cluster_threshold)
        user_id = i[1].tolist()
        for j in user_id: # jth entry of index list in resutling from filtering in i[0]
            student_data = df[df.index == j] # get student data
            courses_not_taken = hlp.filter_T1_courses(student_data) # get all courses not taken by student j, excluding T1
            cf_mean = pd.DataFrame({'mean_cluster_grade': hlp.round_to_nearest_grade(df.mean(skipna = True))})
            cf_count = pd.DataFrame({'count': df.count()})
            mean_grades = pd.concat([courses_not_taken, cf_mean, cf_count],join = 'inner', axis = 1)
            recommendations = pd.DataFrame(mean_grades[mean_grades['count'] >= cluster_threshold]) #df where count is bigger than 5
            rec_mean = recommendations.sort_values(by = 'mean_cluster_grade', ascending = False)[:x]
            iterables = [df['cluster'].unique().tolist(),[j],rec_mean.index.tolist()]
            index = pd.MultiIndex.from_product(iterables, names=["ClusterID","StudentID", "CourseID"])
            rec_mean = rec_mean.set_index(index)
            rec_mean_grades = pd.concat([rec_mean_grades, rec_mean]) 
            rec_maj = recommendations.sort_values(by = 'count', ascending = False)[:x]
            iterables = [df['cluster'].unique().tolist(), [j], rec_maj.index.tolist()]
            index = pd.MultiIndex.from_product(iterables, names=["ClusterID", "StudentID", "CourseID"])
            rec_maj = rec_maj.set_index(index)
            rec_majority = pd.concat([rec_majority, rec_maj])
    return rec_mean_grades, rec_majority


### **Obtain 10 Recommendations per student**

In [76]:
# MultiIndex Dataframe for easier evaluation
rec_mean_grades, rec_majority = get_recommendations(list_cf_df, 10)
rec_mean_grades.head(10)

29
30


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_cluster_grade,count
ClusterID,StudentID,CourseID,Unnamed: 3_level_1,Unnamed: 4_level_1
,2,12E050SCIENCES,5.0,82
,2,12E051SCIENCES,5.0,82
,2,T208012,5.0,64
,2,T208011,5.0,33
,2,T208003,5.0,43
,2,T208000,5.0,46
,2,T207059,5.0,92
,2,T207037,5.0,207
,2,T207036,5.0,190
,2,T207034,5.0,178


In [77]:
rec_majority.index.get_level_values('ClusterID').unique()
# ClusterID 0 was interpreted as empty
# /!\ Cluster ID is now float, needs to be fixed

Float64Index([nan, 1.0], dtype='float64', name='ClusterID')

In [78]:
#Fill nan with 0 in MultiIndex for both recommendation approaches
if isinstance(rec_majority.index, pd.MultiIndex):
    rec_majority.index = pd.MultiIndex.from_frame(
       rec_majority.index.to_frame().fillna(0)
    )
else:
    rec_majority.index = rec_majority.index.fillna(0)

rec_majority.index = rec_majority.index.set_levels(rec_majority.index.levels[0].astype(int), level = 0)
rec_majority.head()

# inspiration of code snippet:
# https://stackoverflow.com/questions/32506689/replace-nan-in-dataframe-index

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_cluster_grade,count
ClusterID,StudentID,CourseID,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,T207037,5.0,207
0,2,T207004,5.0,194
0,2,T207036,5.0,190
0,2,T206060,5.0,184
0,2,T207034,5.0,178


In [79]:
if isinstance(rec_mean_grades.index, pd.MultiIndex):
    rec_mean_grades.index = pd.MultiIndex.from_frame(
       rec_mean_grades.index.to_frame().fillna(0)
    )
else:
    rec_mean_grades.index = rec_mean_grades.index.fillna(0)

rec_mean_grades.index = rec_mean_grades.index.set_levels(rec_mean_grades.index.levels[0].astype(int), level = 0)
rec_mean_grades.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_cluster_grade,count
ClusterID,StudentID,CourseID,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,12E050SCIENCES,5.0,82
0,2,12E051SCIENCES,5.0,82
0,2,T208012,5.0,64
0,2,T208011,5.0,33
0,2,T208003,5.0,43


**Check if the number of students in the test set has the same length as in the recommendations obtained**

In [80]:
rec_mean_grades.index.get_level_values('StudentID').nunique()

119

In [81]:
rec_majority.index.get_level_values('StudentID').nunique()

119

In [82]:
len(student_cluster)

119

In [83]:
rec_mean_grades.to_csv(os.path.join('C:~your_path~/RecSys/obtained_data', 'rec_mean_grades.csv'), index = True)
rec_majority.to_csv(os.path.join('C:~your_path~/RecSys/obtained_data', 'rec_majority.csv'), index = True)

In [84]:
rec_majority.index.get_level_values('ClusterID').unique() # changed index 'ClusterID' to int

Int64Index([0, 1], dtype='int64', name='ClusterID')

In [85]:
rec_mean_grades.index.get_level_values('ClusterID').unique()

Int64Index([0, 1], dtype='int64', name='ClusterID')

# Step by Step Ilustration of interal mechanism of RecSys Implementation
This is a step by step illustration on the output, each step of the recommendation engine creates and further processes

## Step 1: Create Collaborative Filtering Data Frame

In [86]:
test_cf, index_list = collaborative_filtering_df(train_df_cluster, test_df_cluster, 1) # cluster 2, numbers start at 0!

In [87]:
test_cf.info() #169 columns as the whole course catalogue is mapped, dataframe still carries 'cluster column'

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 301 to 599
Columns: 169 entries, 12E050SCIENCES to cluster
dtypes: float64(168), int64(1)
memory usage: 398.4 KB


In [88]:
test_cf.head() # merged dataframe with test students added to the predicted cluster

Unnamed: 0_level_0,12E050SCIENCES,12E051SCIENCES,5869,5870,71105,71120,71121,71133,74110,74112,...,T208011,T208012,T208013,T208014,T208015,T208016,T208017,T208033,T214006,cluster
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
301,,,,,,,,,,,...,,,,,,,,,,1
302,,,,,,5.5,,,5.75,,...,,,,,,,,,,1
303,,,,,,,,,,,...,,,,,,,,,,1
304,,,,,,,,,5.25,,...,,,,,,,,,,1
305,,,,,,,,,5.5,,...,,,,,,,,,,1


In [89]:
len(index_list) 

57

## **Step 2: Get the courses the student took (student data)**

In [90]:
# get test student 
user_id = index_list[10]
user_id

365

In [91]:
student_data = test_cf[test_cf.index == user_id]

In [92]:
student_data # all the courses the student took, NaN for courses the student did not take

Unnamed: 0_level_0,12E050SCIENCES,12E051SCIENCES,5869,5870,71105,71120,71121,71133,74110,74112,...,T208011,T208012,T208013,T208014,T208015,T208016,T208017,T208033,T214006,cluster
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
365,,,,,,,,,,,...,,,,,,,,,,1


## **Step 3: Get the courses the student did not take (courses not taken), filter T1 courses**

In [93]:
courses_not_taken = list(student_data.columns[student_data.isna().all()]) 

In [94]:
print(courses_not_taken[:10]) # T1 courses need to be filtered out!
# at this step, we see that it holds courses from both bachelor programs

['12E050SCIENCES', '12E051SCIENCES', '5869', '5870', '71105', '71120', '71121', '71133', '74110', '74112']


In [95]:
# T1 courses are courses from first year of political science, that need to be filtered out because the student cannot take any T1 courses 
# once all T1 requirements are passed. 
# This type of indication amongst courses does not exist for the bachelore of educational science
prefix = 'T1'
for course in courses_not_taken[:]:
    if course.startswith(prefix):
        courses_not_taken.remove(course)
print(courses_not_taken[:10])


['12E050SCIENCES', '12E051SCIENCES', '5869', '5870', '71105', '71120', '71121', '71133', '74110', '74112']


In [96]:
# all the courses that the student did not take yet with T2 prefix need to be merged to a dataframe with empty index
selection = pd.DataFrame()
selection[''] = courses_not_taken
selection = selection.set_index([''])
selection.head()

## Step 4: Get mean_cluster_grades and total cluster counts of all courses in the cluster

In [97]:
cf_mean = pd.DataFrame({
    'mean_cluster_grade': hlp.round_to_nearest_grade(test_cf.mean(skipna = True))})

In [98]:
cf_mean.head() # NaN for those courses that are not in the study program

Unnamed: 0,mean_cluster_grade
12E050SCIENCES,
12E051SCIENCES,
5869,
5870,
71105,4.75


In [99]:
cf_count = pd.DataFrame({'count': test_cf.count()})
cf_count.head()

Unnamed: 0,count
12E050SCIENCES,0
12E051SCIENCES,0
5869,0
5870,0
71105,10


## Step 4: Merge mean grades and total counts to the course selection of the test student
### **This step is important, based on this recommendations can be derived**

In [100]:
mean_grades = pd.concat([selection, cf_mean, cf_count],join = 'inner', axis=1)

**Recommendations based on mean grades**

In [101]:
recommendations = pd.DataFrame(mean_grades[mean_grades['count'] >= 5]) #at least 5 students should have taken the course 
recommendations = pd.DataFrame(recommendations.sort_values(by = 'mean_cluster_grade', ascending = False))
recommendations.head(6)

Unnamed: 0,mean_cluster_grade,count
7421AG,5.25,17
7422AT,5.25,25
7422AU,5.25,21
7422AA,5.0,29
742380,5.0,124
7422AK,5.0,21


**Recommendations based on majority vote**

In [102]:
recommendations = pd.DataFrame(recommendations.sort_values(by = 'count', ascending = False))
recommendations.head(6)

Unnamed: 0,mean_cluster_grade,count
7422A7,5.0,243
7422A8,5.0,243
7422A5,5.0,243
7417I,5.0,243
742871,5.0,214
742066,5.0,211


## Step 5: Add MultiIndex for Evaluation

In [103]:
iterables = [test_cf['cluster'].unique().tolist(),[index_list[10]],recommendations.index.tolist()]
index = pd.MultiIndex.from_product(iterables, names=["ClusterID", "StudentID", "CourseID"])

In [104]:
recommendations = recommendations.set_index(index)

In [105]:
recommendations.head() 
# The student studies educational science. Luckily, no courses are recommended that belong to the other bachelor program

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_cluster_grade,count
ClusterID,StudentID,CourseID,Unnamed: 3_level_1,Unnamed: 4_level_1
1,365,7422A7,5.0,243
1,365,7422A8,5.0,243
1,365,7422A5,5.0,243
1,365,7417I,5.0,243
1,365,742871,5.0,214
