In [1]:
#Import libraries
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import accuracy
import pickle

In [2]:
#Create average moodtracker for each user as a whole number
df1 = pd.read_csv('moodtracker_rating_values.csv')
df2 = pd.read_csv("moodtracker_rating_instances.csv")

df3 = pd.merge(df1, df2, left_on='id', right_on='rating_id')

df4 =df3.drop(columns=['id_x','id_y', 'date', "rating_id"])
df5=pd.DataFrame(df4.groupby(['app_user_id']).mean())
df6=df5.astype(int)
df6.head()

Unnamed: 0_level_0,anxiety_rating,coping_rating,depression_rating,loneliness_rating
app_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1,2,1,2
3,0,0,0,0
7,4,2,3,3
11,2,3,3,3
13,3,4,3,3


In [3]:
#Create a column containing a string of digits for the average moodtracker
df6['MergedRatings'] = df6[df6.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
df6.head()

Unnamed: 0_level_0,anxiety_rating,coping_rating,depression_rating,loneliness_rating,MergedRatings
app_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,1,2,1,2,1212
3,0,0,0,0,0
7,4,2,3,3,4233
11,2,3,3,3,2333
13,3,4,3,3,3433


In [4]:
#Create user-item(page) dataframe. Every click on page recorded as a rating of 1
df7 = pd.read_csv('content-dwelltime.csv')
df7["Rating"]=1
df8=df7.drop(columns=['id', 'start_time',"end_time"])

df8.head()

Unnamed: 0,app_user_id,content_id,Rating
0,2,57,1
1,2,59,1
2,2,61,1
3,2,58,1
4,2,56,1


In [5]:
#Replace app user id with the moodtracker string and drop duplicates
df9=pd.merge(df8, df6, on='app_user_id')
df10=df9.drop(["app_user_id","anxiety_rating","coping_rating","depression_rating","loneliness_rating"],axis=1)
df11=df10[["MergedRatings","content_id","Rating"]]



In [6]:
#Use surprise library to create a dataset
reader=Reader(rating_scale=(0,1))
data=Dataset.load_from_df(df11[["MergedRatings","content_id","Rating"]],reader)

In [7]:
#Split data into a training and test set
trainset, testset = train_test_split(data, test_size=.25,random_state=1)

User-user approach considers similarities between users.Item-item approach considers similarities between items (pages)

In [8]:
#Find the best algorithm parameters

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])


0.008375634437770415
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}


Best parameters are 10 epochs, a learning rate of 0.005 and 0.6 regularisation

In [9]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo = gs.best_estimator['rmse']
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.0074


0.007372403447338125

In [10]:
# get the list of the page ids
unique_ids = df11['content_id'].unique()
# get the list of the pages that a user with a moodtracker rating of 1,2,1,2 has rated
iids1212 = df11.loc[df11['content_id']=="1,2,1,2", 'content_id']
# remove the rated pages for the recommendations
PagesToPredict = np.setdiff1d(unique_ids,iids1212)

In [11]:
my_recs = []
for iid in PagesToPredict:
    my_recs.append((iid, algo.predict(uid="1,2,1,2",iid=iid).est))
    
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
0,4,1.0
57,61,1.0
37,41,1.0
38,42,1.0
39,43,1.0
1,5,1.0
41,45,1.0
42,46,1.0
44,48,1.0
46,50,1.0


In [12]:
#Pickle the file so that the model can be imported into the app

pickle.dump(algo, open('MoodtrackerCollaborativeFiltering.pkl','wb'))


In [13]:
model = pickle.load(open('MoodtrackerCollaborativeFiltering.pkl','rb'))
print(model.predict("2,2,2,2",iid=iid))

user: 2,2,2,2    item: 117        r_ui = None   est = 1.00   {'was_impossible': False}


## References
https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

https://realpython.com/build-recommendation-engine-collaborative-filtering/

https://surprise.readthedocs.io/en/stable/getting_started.html![image.png](attachment:image.png)

https://www.datacamp.com/community/tutorials/pickle-python-tutorial