In [87]:
import pandas as pd
import numpy as np

# Evaluation of CF-based Recommendations

The pre-processed data is initially split into training and test sets, and the last ten courses are removed. The quality of each recommendation approach is assessed offline by examining whether the courses proposed to the user of interest were actually taken. This is achieved by removing entries, obtaining the corresponding recommendations, and then comparing those to the actual course selection. <br> Per RecSys approach and for both study programs, two different ranking methods are applied, **majority vote** and **mean grades**, and are evaluated separately in order to identify whether any statements about the course selection procedure can be derived. Although this approach merely assesses how accurately the Course RecSys can predict the course history, it serves as an initial benchmark for offline evaluation, as it still assesses how accurately course preferences are captured. What cannot be anticipated is whether a different choice would have been made if a student had been confronted with the obtained recommendation list. <br>
In order to evaluate the recommendation quality, key measurement metrics from traditional **Information Retrieval (IR)** are applied. The table below, commonly known as the *Confusion Matrix*, depicts the four possible outcomes of the course recommendation process, from which the key metrics can be derived. <br> The state in which a course that was taken was also recommended is defined as a **True Positive (TP)**; if it was taken but not recommended, it is defined as a **False Negative (FN)**. The scenario in which a course that was not taken by a student was nevertheless recommended is denoted as a **False Positive (FP)**, and as a **True Negative (TN)** if it was not recommended. Given these four building blocks, the evaluation metrics **Recall**, **Precision** and **F1** are derived.<br>
<br>

| |**Recommended** | **Not Recommended**  | 
|---:|:-------------|:-----------|
|**Course Taken**| True Positive (TP)  | False Negative (FN)      | 
| **Course Not Taken**| False Positive (FP)  |True Negative (TN)   | 

<br>

The metric **Recall** is defined as the ratio of the number of successful recommendations to the total number of course choices made and is given by: <br>
\begin{equation}
    \label{eq:Recall}
    Recall = \frac{TP}{TP + FN}
\end{equation}<br>
**Precision** can be understood as a measure of result relevancy: it is the ratio of the recommended courses that were truly taken to all courses that were recommended: <br>
\begin{equation}
    \label{eq:Precision}
    Precision = \frac{TP}{TP + FP}
\end{equation}<br>
The **F-measure** is a metric derived from **Recall** and **Precision** and, when viewed from the probability perspective, can be seen as the number of course recommendations that need to be made before the first failure is detected. The most common F-measure is the harmonic mean of Precision and Recall, denoted as **F1**, and is defined as:<br>
\begin{equation}
    \label{eq:F1}
    F1 = \frac{2 \times (Precision \times Recall)}{(Precision + Recall)}
\end{equation}

In [88]:
# Load the recommendation lists and the held-out test data.
# DATA_DIR is the single place to adapt when the notebook runs on another machine.
DATA_DIR = 'C:~your_path~/RecSys/obtained_data'

## Recommendation lists, one per ranking method; the first three CSV columns
## are read as the levels of a MultiIndex
mean_grades = pd.read_csv(f'{DATA_DIR}/rec_mean_grades.csv', index_col=[0, 1, 2])
maj_vote = pd.read_csv(f'{DATA_DIR}/rec_majority.csv', index_col=[0, 1, 2])

# Validate that the MultiIndex was loaded correctly: the evaluation cells
# select rows through exactly these level names.
for _loaded in (mean_grades, maj_vote):
    assert {'ClusterID', 'StudentID', 'CourseID'} <= set(_loaded.index.names), _loaded.index.names

## Initial data set that is needed for validation
true_test_df = pd.read_csv(f'{DATA_DIR}/test_df.csv')

In [89]:
# Index the test data by StudentID so that one student's rows can be selected
# through the index. The guard keeps the cell idempotent: re-running it after
# StudentID has already become the index would otherwise raise a KeyError.
if 'StudentID' in true_test_df.columns:
    true_test_df = true_test_df.set_index(['StudentID'])
true_test_df.head()

Unnamed: 0_level_0,CourseID,Grades
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1
313,742004,5.25
313,742003,5.75
313,742002,5.5
313,742001,5.0
313,742064,5.25


In [90]:
# show the first row of the mean-grade ranking to inspect the loaded index levels and columns
mean_grades.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_cluster_grade,count
ClusterID,StudentID,CourseID,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,12E050SCIENCES,5.0,82


In [91]:
# show the first row of the majority-vote ranking to inspect the loaded index levels and columns
maj_vote.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_cluster_grade,count
ClusterID,StudentID,CourseID,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,T207037,5.0,207


## Core Function of Evaluation Method

### Calculation of Precision, Recall and F1

In [92]:
def evaluate_rec(true_test_df, rec_df, entries_removed):
    """Calculate Recall, Precision and F1 of the recommendations per student.

    A confusion matrix as a whole is deliberately not built: true negatives
    (a course that is not recommended and not taken) would heavily dilute the
    evaluation performance.

    Parameters
    ----------
    true_test_df : pd.DataFrame
        The true test dataframe, indexed by StudentID, with a 'CourseID' column.
    rec_df : pd.DataFrame
        The recommendation dataframe; its index must contain the levels
        'ClusterID', 'StudentID' and 'CourseID'.
    entries_removed : int
        Number of entries removed from the original test dataframe (here 10).
        The last `entries_removed` rows of each student are the ground truth.

    Returns
    -------
    pd.DataFrame
        One row per student found in `rec_df`, indexed by StudentID, with the
        columns 'ClusterID', 'Recall', 'Precision' and 'F1'. A metric whose
        denominator is zero (e.g. a student without held-out rows) is
        reported as 0.0 instead of raising ZeroDivisionError.
    """
    result_columns = ['ClusterID', 'StudentID', 'Recall', 'Precision', 'F1']
    rec_students = rec_df.index.get_level_values('StudentID')

    # Collect plain records and build the frame once; concatenating inside the
    # loop is quadratic and leaves every column with object dtype.
    records = []
    for student in rec_students.unique().tolist():
        # tail(0) is empty, whereas the slice [-0:] would return every row.
        held_out = true_test_df[true_test_df.index == student].tail(entries_removed)
        # Sets collapse repeated courses, so a course taken twice counts once.
        true = set(held_out['CourseID'].tolist())

        rec_temp = rec_df[rec_students == student]
        rec = set(rec_temp.index.get_level_values('CourseID').tolist())

        tp = len(true & rec)   # recommended and taken
        fn = len(true) - tp    # taken but not recommended
        fp = len(rec) - tp     # recommended but not taken

        recall = tp / (tp + fn) if true else 0.0
        precision = tp / (tp + fp) if rec else 0.0
        if recall > 0 and precision > 0:
            f1 = (2 * (precision * recall)) / (precision + recall)
        else:
            f1 = 0.0

        # The first cluster label found for the student is reported.
        cluster = rec_temp.index.get_level_values('ClusterID').unique().tolist()[0]
        records.append({
            'ClusterID': cluster,
            'StudentID': student,
            'Recall': recall,
            'Precision': precision,
            'F1': f1,
        })

    # Passing the columns explicitly keeps set_index working for empty input.
    evaluation_df = pd.DataFrame(records, columns=result_columns)
    return evaluation_df.set_index(['StudentID'])

In [93]:
# Number of held-out entries per student; it has to match the number of
# entries that were removed when the test set was prepared.
ENTRIES_REMOVED = 10

eval_rec_maj = evaluate_rec(true_test_df, maj_vote, ENTRIES_REMOVED)
eval_rec_mean = evaluate_rec(true_test_df, mean_grades, ENTRIES_REMOVED)

In [94]:
eval_rec_maj.head(10) #evaluation of each student with defined evaluation metric

Unnamed: 0_level_0,ClusterID,Recall,Precision,F1
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0,0.3,0.3,0.3
6,0,0.6,0.6,0.6
9,0,0.4,0.4,0.4
16,0,0.4,0.4,0.4
17,0,0.555556,0.5,0.526316
18,0,0.5,0.5,0.5
20,0,0.4,0.4,0.4
23,0,0.4,0.4,0.4
38,0,0.4,0.4,0.4
40,0,0.555556,0.5,0.526316


### Overview of Evaluation Metrics per Method and Semester

In [95]:
# column-wise means over all students; each list follows the column order of
# the evaluation frame, i.e. [ClusterID, Recall, Precision, F1]
a = eval_rec_maj.mean().tolist()  # majority-vote ranking
b = eval_rec_mean.mean().tolist()  # mean-grade ranking

# position 0 -> majority vote, position 1 -> mean grades
result = [a, b]

In [96]:
# Summarise the mean metrics of both ranking methods in one numeric table.
# Every entry of `result` is ordered [ClusterID, Recall, Precision, F1], so
# positions 1-3 hold the metric means in the same order as the row labels.
df = pd.DataFrame(
    {'Majority Vote': result[0][1:4], 'Mean Grades': result[1][1:4]},
    index=['Recall', 'Precision', 'F1'],
)

df

Unnamed: 0,Majority Vote,Mean Grades
Recall,0.463212,0.309337
Precision,0.460504,0.307563
F1,0.461787,0.308403


### Overview of Evaluation Metrics per Cluster, Method and Semester

In [97]:
# cluster labels to report on (presumably one per study program; TODO confirm)
clusters = [0, 1]
# evaluation frames, listed in the same order as the columns of df_cluster
list_of_df = [eval_rec_maj, eval_rec_mean]
# empty template: one column per ranking method, one row per metric
df_cluster = pd.DataFrame(columns = ['Majority Vote', 'Mean Grades'],
                 index = ['Recall', 'Precision', 'F1'])

def create_metric_df(clusters, list_of_df, df_cluster):
    """Build a table of mean Recall, Precision and F1 per cluster and method.

    Parameters
    ----------
    clusters : list
        Cluster labels to report on.
    list_of_df : list of pd.DataFrame
        Evaluation frames with the columns 'ClusterID', 'Recall', 'Precision'
        and 'F1'. The frame at position k fills column k of `df_cluster`.
    df_cluster : pd.DataFrame
        Template with one column per ranking method and the rows 'Recall',
        'Precision' and 'F1'. It is copied for every cluster and is not
        modified.

    Returns
    -------
    pd.DataFrame
        The per-cluster tables stacked on top of each other, indexed by a
        MultiIndex with the levels 'ClusterID' and 'Metrics'.
    """
    metrics = ['Recall', 'Precision', 'F1']
    per_cluster = []
    for cluster in clusters:
        # Work on a copy so that the caller's template stays untouched.
        cluster_df = df_cluster.copy()
        for position, eval_df in enumerate(list_of_df):
            column = cluster_df.columns[position]
            # Select the metric columns by name instead of relying on the
            # position of each value in `.mean().tolist()`.
            in_cluster = eval_df.loc[eval_df['ClusterID'] == cluster, metrics]
            means = in_cluster.astype(float).mean()
            # A single .loc assignment replaces the chained
            # `df.loc[row][pos] = value`, which writes to a temporary object
            # and is silently lost when pandas Copy-on-Write is active.
            cluster_df.loc[metrics, column] = means.to_numpy()

        iterables = [[cluster], df_cluster.index.tolist()]
        cluster_df.index = pd.MultiIndex.from_product(iterables, names=["ClusterID", 'Metrics'])
        per_cluster.append(cluster_df)

    return pd.concat(per_cluster, axis=0)

In [98]:
# mean metrics per cluster for both ranking methods
df_final = create_metric_df(clusters, list_of_df, df_cluster)
df_final

Unnamed: 0_level_0,Unnamed: 1_level_0,Majority Vote,Mean Grades
ClusterID,Metrics,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Recall,0.521326,0.359857
0,Precision,0.516129,0.356452
0,F1,0.518591,0.358065
1,Recall,0.4,0.254386
1,Precision,0.4,0.254386
1,F1,0.4,0.254386


Recommendation Systems originate from the urge to facilitate user navigation through item catalogues on diverse content platforms, a setting that needs to be clearly distinguished from the learning environment of a university. Students enrolled in university programs need to complete a mandatory course workload, and the number of courses that can be freely chosen varies depending on the program. This situation does not exist on platforms where users merely consume content: there is no such thing as **mandatory content** that has to be completed before movie or music choices can be made. The fact that the performance metrics of cluster 1 are significantly lower than those obtained for cluster 0 indicates that if one can filter out 'mandatory' content, the recommendation performance will improve. Mandatory courses in the Bachelor of Political Science are indicated with the prefix 'T1' and can easily be filtered. In the Bachelor of Educational Science no such separation exists, as there is, most of the time, a *choose-x-out-of-y* scenario. The internal ranking mechanism of Collaborative Filtering will rank courses that can be understood as mandatory at the very top, even though all choices of the mandatory workload were already made.


# Step by Step Illustration of Internal Mechanism Of Rec-Sys Evaluation


In [99]:
# starting with test user 17
# get last 10 entries from test_data, corresponds to 'entries removed' in the preprocessing step
true_test_df[true_test_df.index == 17][-10:] 

Unnamed: 0_level_0,CourseID,Grades
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1
17,T207002,5.25
17,T207034,4.5
17,T207004,4.75
17,T207002,4.0
17,J2P201,4.5
17,12E050SCIENCES,5.5
17,12E051SCIENCES,5.5
17,T206006,5.25
17,T205004,5.75
17,T206001,4.75


In [100]:
# store as set for efficient comparison
true = set(true_test_df[true_test_df.index == 17][-10:]['CourseID'].tolist()) 

In [101]:
# get the recommendations for the same user from recommendation dataframe
df_temp = maj_vote[maj_vote.index.get_level_values('StudentID') == 17]
df_temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_cluster_grade,count
ClusterID,StudentID,CourseID,Unnamed: 3_level_1,Unnamed: 4_level_1
0,17,T207002,5.0,233
0,17,T207035,5.0,214
0,17,T207004,5.0,194
0,17,T207036,5.0,190
0,17,T206060,5.0,184


In [102]:
# first five student IDs that have recommendations in the majority-vote list
maj_vote.index.get_level_values('StudentID').unique().tolist()[:5]

[2, 6, 9, 16, 17]

In [103]:
# recommended courses of student 17 as a set, for comparison with `true`
rec = set(df_temp.index.get_level_values('CourseID').tolist())

In [104]:
# get the intersection of two sets (recommendations and actual courses taken)
intersection = true & rec
intersection

{'12E050SCIENCES', '12E051SCIENCES', 'T207002', 'T207004', 'T207034'}

In [105]:
tp = len(intersection) # true positives: courses that are in both sets (recommended and taken)
fn = len(true) - len(intersection) # false negatives: courses taken but not recommended
fp = len(rec) - tp # false positives: courses recommended but not taken

In [106]:
# share of the held-out courses that were recommended; the denominator is 9, not 10,
# because T207002 occurs twice in the held-out rows and the set keeps it once
recall = tp / (tp + fn)
recall

0.5555555555555556

In [107]:
# share of the recommended courses that were actually taken
precision = tp / (tp + fp)
precision # precision is very bad due to the nature of recommending out of a list

0.5

In [108]:
# harmonic mean of precision and recall; unlike evaluate_rec, this cell has no
# guard for the case precision + recall == 0
f1 = (2*(precision * recall)) / (precision + recall)
f1

0.5263157894736842