In [42]:
#pip install setuptools==58.2.0
#pip install scikit-surprise==1.1.3
#pip install mlxtend


In [43]:
%matplotlib inline

from pathlib import Path

import heapq
from collections import defaultdict

import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from surprise import Dataset, Reader
from surprise.prediction_algorithms import KNNBasic
from surprise.model_selection import train_test_split
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
DATA = Path('dmba')

### 3. We again consider the data in CourseTopics.csv describing course purchases at Statistics.com (see Problem 14.2 and data sample in Table). We want to provide a course recommendation to a student who purchased the Regression and Forecast courses. Apply user-based and item-based collaborative filtering to the data, using both Pearson correlation and Cosine similarity. Pandas.melt() unpivots a DataFrame from wide format to long format and we can use this method to turn the data into triplets such as (student, course, rating). Also review the pandas code in Table 2.3 for data processing. Note you need to create a Student_ID variable for "id_vars=" in the Pandas.melt() function as such a variable is not in the data.

In [46]:
ct_df = pd.read_csv(DATA / 'CourseTopics.csv')
print(ct_df.head(5))

   Intro  DataMining  Survey  Cat Data  Regression  Forecast  DOE  SW
0      1           1       0         0           0         0    0   0
1      0           0       1         0           0         0    0   0
2      0           1       0         1           1         0    0   1
3      1           0       0         0           0         0    0   0
4      1           1       0         0           0         0    0   0


In [47]:
ct_df["Student_id"] = range(0,ct_df.shape[0])
ct_df.head(5)

Unnamed: 0,Intro,DataMining,Survey,Cat Data,Regression,Forecast,DOE,SW,Student_id
0,1,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,1
2,0,1,0,1,1,0,0,1,2
3,1,0,0,0,0,0,0,0,3
4,1,1,0,0,0,0,0,0,4


In [48]:
ct_df_regress = ct_df[ct_df['Regression'] ==1]
ct_df_regress_forcast = ct_df_regress[ct_df_regress['Forecast']==1]
ct_df_regress_forcast

Unnamed: 0,Intro,DataMining,Survey,Cat Data,Regression,Forecast,DOE,SW,Student_id
21,0,0,0,0,1,1,0,0,21
25,0,0,0,0,1,1,0,1,25
32,1,1,1,1,1,1,1,0,32
52,0,0,0,0,1,1,1,0,52
66,1,0,0,1,1,1,0,0,66
74,0,0,0,1,1,1,0,0,74
79,1,0,0,1,1,1,0,0,79
80,0,0,0,0,1,1,0,0,80
107,1,1,1,0,1,1,0,0,107
165,1,1,0,0,1,1,0,0,165


In [49]:
melted_ct_df = ct_df_regress_forcast.melt(id_vars='Student_id', var_name='course', value_name='enrolled')
melted_ct_df 

Unnamed: 0,Student_id,course,enrolled
0,21,Intro,0
1,25,Intro,0
2,32,Intro,1
3,52,Intro,0
4,66,Intro,1
...,...,...,...
107,165,SW,0
108,181,SW,0
109,216,SW,1
110,222,SW,1


In [50]:
reader = Reader(rating_scale=(0,1))
reader

<surprise.reader.Reader at 0x2017d5254e0>

In [51]:
data = Dataset.load_from_df(melted_ct_df[['Student_id', 'course', 'enrolled']],reader= reader)
trainset = data.build_full_trainset()
sim_options_item = {'name':'cosine', 'user_based': False}
sim_options_user = {'name':'cosine', 'user_based': True}
algo_item = KNNBasic(sim_options=sim_options_item)
algo_user = KNNBasic(sim_options=sim_options_user)
algo_item.fit(trainset=trainset)
algo_user.fit(trainset=trainset)
pred = algo_item.predict(str(2), str('DataMining'),  verbose=True)
pred.est

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 2          item: DataMining r_ui = None   est = 0.47   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


0.4732142857142857

In [52]:
def testUser (StudentId):
    print("Using User Based Filtering")
    for i in set(melted_ct_df['course']):
        pred = algo_user.predict(str(StudentId), i ,  verbose=False)
        print("prediction for class " +i + " for Sudent "+ str(StudentId)+ ":")
        print(pred.est)

In [53]:
def testItem(StudentId):
    print("Using Item Based Filtering")
    for i in set(melted_ct_df['course']):
        pred = algo_item.predict(str(StudentId), i ,  verbose=False)
        print("prediction for class " +i + " for Sudent "+ str(StudentId)+ ":")
        print(pred.est)

In [54]:
testUser(4)

Using User Based Filtering
prediction for class Intro for Sudent 4:
0.4732142857142857
prediction for class Regression for Sudent 4:
0.4732142857142857
prediction for class Cat Data for Sudent 4:
0.4732142857142857
prediction for class Forecast for Sudent 4:
0.4732142857142857
prediction for class Survey for Sudent 4:
0.4732142857142857
prediction for class DataMining for Sudent 4:
0.4732142857142857
prediction for class SW for Sudent 4:
0.4732142857142857
prediction for class DOE for Sudent 4:
0.4732142857142857


In [55]:
testItem(4)

Using Item Based Filtering
prediction for class Intro for Sudent 4:
0.4732142857142857
prediction for class Regression for Sudent 4:
0.4732142857142857
prediction for class Cat Data for Sudent 4:
0.4732142857142857
prediction for class Forecast for Sudent 4:
0.4732142857142857
prediction for class Survey for Sudent 4:
0.4732142857142857
prediction for class DataMining for Sudent 4:
0.4732142857142857
prediction for class SW for Sudent 4:
0.4732142857142857
prediction for class DOE for Sudent 4:
0.4732142857142857


### 5. Course Ratings. The Institute for Statistics Education at Statistics.com asks students to rate a variety of aspects of a course as soon as the student completes it. The Institute is contemplating instituting a recommendation system that would provide students with recommendations for additional courses as soon as they submit their rating for a completed course. Consider the excerpt from student ratings of online statistics courses shown in Table 14.17, and the problem of what to recommend to student E.N.

#### a. First consider a user-based collaborative filter. This requires computing correlations between all student pairs. For which students is it possible to compute correlations with E.N.? Compute them.

It is possible to compute correlations with LN and DS as they have some (more than 1) courses in common.

In [56]:
#get avg scores for everyone
avg_EN = (4+4+4+3)/4
avg_LN= (4+3+2+4+2)/5
avg_MH = (3+4+4)/3
avg_JH= (2+2)/2
avg_DU= (4+4)/2
avg_DS= (4+2+4)/3


In [57]:
cor_EN_LN = (
    ((4-avg_EN)*(4-avg_LN))+((4-avg_EN)*(4-avg_LN))+((3-avg_EN)*(2-avg_LN))
)/(
    sqrt(((4-avg_EN)**2)+((4-avg_EN)**2)+((3-avg_EN)**2)) *
    sqrt(((4-avg_LN)**2)+((4-avg_LN)**2)+((2-avg_LN)**2))
)
cor_EN_LN

0.8703882797784892

In [58]:
cor_EN_MH = (
    ((4-avg_EN)*(3-avg_MH))
)/(
    sqrt(((4-avg_EN)**2)) *
    sqrt(((3-avg_MH)**2))
)
cor_EN_MH

-1.0

In [59]:
cor_EN_JH = (
    ((4-avg_EN)*(2-avg_JH))
)/(
    sqrt(((4-avg_EN)**2)) *
    sqrt(((2)**2)) #no diff between avg and this val. 
)
cor_EN_JH

0.0

In [60]:
cor_EN_DU = (
    ((4-avg_EN)*(4-avg_DU))
)/(
    sqrt(((4-avg_EN)**2)) *
    sqrt(((4)**2)) #no diff between avg and this val. 
)
cor_EN_DU

0.0

In [61]:
cor_EN_DS = (
    ((4-avg_EN)*(4-avg_DS))+((4-avg_EN)*(2-avg_DS))+((4-avg_EN)*(4-avg_DS))
)/(
    sqrt(((4-avg_EN)**2)+((4-avg_EN)**2)+((4-avg_EN)**2)) *
    sqrt(((4-avg_DS)**2)+((2-avg_DS)**2)+((4-avg_DS)**2))
)
cor_EN_DS

-1.5700924586837752e-16

#### b. Based on the single nearest student to E.N., which single course should we recommend to E.N.? Explain why.

Based on the closest user LN, we can recomend Python as a new course as it is the highest ranked class EN does not have

#### c. Use scikit-learn function sklearn.metrics.pairwise.cosine_similarity() to compute the cosine similarity between users.

In [62]:
cours_df = pd.read_csv(DATA / "courserating.csv")
cours_df.head(3)

Unnamed: 0.1,Unnamed: 0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
0,LN,4.0,,,,3.0,2.0,4.0,,2.0
1,MH,3.0,4.0,,,4.0,,,,
2,JH,2.0,2.0,,,,,,,


In [63]:
cours_df.set_index('Unnamed: 0', inplace=True)
cours_df

Unnamed: 0_level_0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
LN,4.0,,,,3.0,2.0,4.0,,2.0
MH,3.0,4.0,,,4.0,,,,
JH,2.0,2.0,,,,,,,
EN,4.0,,,4.0,,,4.0,,3.0
DU,4.0,4.0,,,,,,,
FL,,4.0,,,,,,,
GL,,4.0,,,,,,,
AH,,3.0,,,,,,,
SA,,,4.0,,,,,,
RW,,,2.0,,,,,4.0,


In [64]:
def replace_na_with_row_avg(row):
    row_avg = row.mean(skipna=True)  # Calculate row average excluding NaN values
    return row.fillna(row_avg)

In [65]:
# cours_df_filled = cours_df.apply(replace_na_with_row_avg, axis=1)
# cours_df_filled

In [66]:
print(cosine_similarity(cours_df.loc[['LN', 'EN'], ['SQL', 'R Prog', 'Regression']]))
print(cosine_similarity(cours_df.loc[['MH', 'EN'], ['SQL']]))
print(cosine_similarity(cours_df.loc[['JH', 'EN'], ['SQL']]))
print(cosine_similarity(cours_df.loc[['DU', 'EN'], ['SQL']]))
print(cosine_similarity(cours_df.loc[['DS', 'EN'], ['SQL', 'DM in R', 'R Prog']]))
# cos_sim[3] #for EN

[[1.         0.98910049]
 [0.98910049 1.        ]]
[[1. 1.]
 [1. 1.]]
[[1. 1.]
 [1. 1.]]
[[1. 1.]
 [1. 1.]]
[[1.         0.96225045]
 [0.96225045 1.        ]]


#### d. Based on the cosine similarities of the nearest students to E.N., which course should be recommended to E.N.?

Based on the cloeset student LN. The student EN should be recoemended Python. it is the highest rated class that is not shared by them. 

#### e. What is the conceptual difference between using the correlation as opposed to cosine similarities? (Hint: How are the missing values in the matrix handled in each case?)

Corelation looks at the proximinity of the nearest neighbors. It subtracts the mean of the row from the values of the row for each row so it is sort of bound by its own measurments while only comparing the two records along corelated columns. Similarity does not subtract the mean, it only uses raw scores. It ignores correlated cloumn values of zero to zero. It measures how similar the two records are without being bould by its own measurments. 

#### f. With large datasets, it is computationally difficult to compute user-based recommendations in real time, and an item-based approach is used instead. Returning to the rating data (not the binary matrix), let’s now take that approach.

##### i. If the goal is still to find a recommendation for E.N., for which course pairs is it possible and useful to calculate correlations?

if we want to still give a recomendatiuon to EN we can recomend teh following pairs: (SQL, Spatial),(SQL,DM in R), (SQL,Python), (SQL, R Prog), (SQL, Regression),(DM in R, Rprog),(Rprog, regression)

##### ii. Just looking at the data, and without yet calculating course pair correlations, which course would you recommend to E.N., relying on item-based filtering? Calculate two course pair correlations involving your guess and report the results.

I recomend python to EN. 

In [68]:
avg_sql = (4+3+2+4+4+4)/6
avg_reg = (3+2)/2
avg_py = (4+4+4)/3


In [78]:
cor_SQL_py = (
    ((3-avg_py)*(4-avg_sql)) + ((4-avg_py)*(3-avg_sql))
)/(
    sqrt(
        ((3-avg_py)**2)+((4-avg_py)**2)
        ) *
    sqrt(
        ((4-avg_sql)**2)+((3-avg_sql)**2)
        ) #no diff between avg and this val. 
)
cor_SQL_py

-0.9999999999999998

In [79]:
cor_py_Reg = (
    ((3-avg_py)*(2-avg_reg))
)/(
    sqrt(
        ((3-avg_py)**2)
        ) *
    sqrt(
        ((2-avg_reg)**2)
        ) #no diff between avg and this val. 
)
cor_py_Reg

1.0

The suggested class python is strongly inversly corelated to SQL and strongly correlated to Regression. It has strong relationships in opposite direections to two of the highly rated classes EN is already taking. 

#### g. Apply item-based collaborative filtering to this dataset (using Python) and based on the results, recommend a course to E.N.

In [96]:
def get_top_n(predictions, numberOfItems):
    byUser = defaultdict(list)
    for p in predictions:
        byUser[p.uid].append(p)

    for uid, userPredictions in byUser.items():
        byUser[uid] = heapq.nlargest(numberOfItems, userPredictions, key= lambda p: p.est)
    return byUser   

In [82]:
cours_df = pd.read_csv(DATA / "courserating.csv")
cours_df

Unnamed: 0.1,Unnamed: 0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
0,LN,4.0,,,,3.0,2.0,4.0,,2.0
1,MH,3.0,4.0,,,4.0,,,,
2,JH,2.0,2.0,,,,,,,
3,EN,4.0,,,4.0,,,4.0,,3.0
4,DU,4.0,4.0,,,,,,,
5,FL,,4.0,,,,,,,
6,GL,,4.0,,,,,,,
7,AH,,3.0,,,,,,,
8,SA,,,4.0,,,,,,
9,RW,,,2.0,,,,,4.0,


In [88]:
melted_course_df = cours_df.melt(id_vars='Unnamed: 0' , var_name='course', value_name='rating')
melted_course_df.head(2)

Unnamed: 0.1,Unnamed: 0,course,rating
0,LN,SQL,4.0
1,MH,SQL,3.0


In [89]:
reader_course = Reader(rating_scale=(1,4))

In [94]:
data = Dataset.load_from_df(melted_course_df[['Unnamed: 0', 'course', 'rating']],reader= reader_course)
trainset , testset = train_test_split(data, test_size=0.25, random_state=1)
sim_options_item = {'name':'cosine', 'user_based': False}
algo_item = KNNBasic(sim_options=sim_options_item)
algo_item.fit(trainset=trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2016428fca0>

In [97]:
predictions = algo_item.test(testset)
recomend = get_top_n(predictions=predictions, numberOfItems=1)
recomend

defaultdict(list,
            {'MH': [Prediction(uid='MH', iid='SQL', r_ui=3.0, est=4, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})],
             'AF': [Prediction(uid='AF', iid='DM in R', r_ui=nan, est=4, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})],
             'AH': [Prediction(uid='AH', iid='Spatial', r_ui=3.0, est=4, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})],
             'EN': [Prediction(uid='EN', iid='Python', r_ui=nan, est=4, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})],
             'RW': [Prediction(uid='RW', iid='SQL', r_ui=nan, est=4, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})],
             'KG': [Prediction(uid='KG', iid='SQL', r_ui=nan, est=4, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})],
             'MG': [Prediction(uid='MG', iid='Forecast', r_ui=4.0, est=4, details={'was_impossible': True, 'reason': 'Not enough ne

We can see that for EN it recomends Python as teh course to pick up

#### h. Convert all numeric ratings to 1 and all blank (missing values) to 0. Apply user-based and item-based collaborative filtering to this dataset using both Pearson correlation and Cosine similarity and based on the results, recommend a course to E.N.

In [100]:
cours_df = pd.read_csv(DATA / "courserating.csv")
cours_df_filled = cours_df.fillna(0)
cours_df_filled.head(5)

Unnamed: 0.1,Unnamed: 0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
0,LN,4.0,0.0,0.0,0.0,3.0,2.0,4.0,0.0,2.0
1,MH,3.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2,JH,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EN,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,3.0
4,DU,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
melted_course_df = cours_df.melt(id_vars='Unnamed: 0' , var_name='course', value_name='rating')
melted_course_df.head(2)

Unnamed: 0.1,Unnamed: 0,course,rating
0,LN,SQL,4.0
1,MH,SQL,3.0


In [102]:
reader_course = Reader(rating_scale=(0,1))

In [103]:
data = Dataset.load_from_df(melted_course_df[['Unnamed: 0', 'course', 'rating']],reader= reader_course)
trainset , testset = train_test_split(data, test_size=0.25, random_state=1)
sim_options_item = {'name':'cosine', 'user_based': False}
algo_item = KNNBasic(sim_options=sim_options_item)
algo_item.fit(trainset=trainset)
algo_user.fit(trainset=trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x20118376c20>

In [104]:
predictions_user = algo_user.test(testset)
recomend_user = get_top_n(predictions=predictions_user, numberOfItems=1)
predictions_item = algo_item.test(testset)
recomend_item = get_top_n(predictions=predictions_item, numberOfItems=1)

In [106]:
recomend_user['EN']

[Prediction(uid='EN', iid='Python', r_ui=nan, est=1, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})]

In [107]:
recomend_item['EN']

[Prediction(uid='EN', iid='Python', r_ui=nan, est=1, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})]

Both Item and User based collaborative filtering recomend python as the next course for EN to take. 