In [13]:
import pandas as pd
import numpy as np
import random
from math import sqrt

In [2]:
df = pd.read_csv("take_home_ss_ratings.csv")
df.head()

Unnamed: 0,user_id,quiz_type,quiz_number,question_number,item_id,rating
0,0,random,0,0,45,1
1,0,random,0,1,734,1
2,0,random,0,2,4145,1
3,0,random,0,3,3231,0
4,0,random,0,4,361,1


### Input Selection

- Select a user_id from the dataset as our input user
- The more observation the user has, the better for user profiling in our recommendation system
- The observations with $quiz\_type$ = "random" will be treated as input profile data, and the ones with $quiz\_type$ = "personalized" will be considered as a ground truth to compare with our recommendation output. 

In [4]:
len(df)

1116370

In [10]:
print ("The observation counts for each user_id: ")
print (df.user_id.value_counts())
print ("\n The user_id 13613 has the most observations: \n", df[df.user_id == 13613])

The observation counts for each user_id: 
13613    1040
11786     700
15044     610
6273      580
11460     550
         ... 
16623      10
16622      10
16620      10
16619      10
17295      10
Name: user_id, Length: 20000, dtype: int64

 The user_id 13613 has the most observations: 
         user_id quiz_type  quiz_number  question_number  item_id  rating
753040    13613    random            0                0     2276       1
753041    13613    random            0                1     1471       1
753042    13613    random            0                2     3446       0
753043    13613    random            0                3     1258       1
753044    13613    random            0                4      381       1
...         ...       ...          ...              ...      ...     ...
754075    13613    random          103             1035     1406       0
754076    13613    random          103             1036     4643       1
754077    13613    random          103             1037

In [11]:
# separate the user data into having randomized or personalized quizes. 
# The recommendation input will only consider the randomized ones. 
df_random = df[df.quiz_type == 'random']
len(df_random)

779920

In [12]:
# personalized data can be considered as the basemodel whose performance can be used to compare with our recommendation
df_personalized = df[df.quiz_type == 'personalized']
len(df_personalized)

336450

In [15]:
# build our input observations for recommendation system, only keep columns necessary
inputItems = df[(df.user_id == 13613) & (df.quiz_type == "random")][['item_id', 'rating']]
print ("The input for user profiling is: \n", inputItems)

The input for user profiling is: 
         item_id  rating
753040     2276       1
753041     1471       1
753042     3446       0
753043     1258       1
753044      381       1
...         ...     ...
754075     1406       0
754076     4643       1
754077     1152       0
754078     1582       0
754079      352       0

[800 rows x 2 columns]


In [16]:
# As a single user might answer the same question more than one time, we would like to avoid the situation for 
# easeness of user similiarity calculation. We only keep the "like" answer when duplicates happen.
inputItems = inputItems.groupby(['item_id']).max().reset_index()
print ("The input with no duplications for user profiling is: \n", inputItems)

The input with no duplications for user profiling is: 
      item_id  rating
0         16       1
1         20       1
2         28       1
3         32       1
4         42       1
..       ...     ...
724     4975       0
725     4979       0
726     4981       1
727     4986       1
728     4997       1

[729 rows x 2 columns]


In [17]:
# the original dataset shall exclude the input dataset for the following user similiarity calculation
df = df[df.user_id != 13613]
df.head()
len(df)

1115330

In [18]:
# Adjust our dataset for recommendation system, only keep columns necessary
df = df.drop(['quiz_type', 'quiz_number', 'question_number'], 1)

  df = df.drop(['quiz_type', 'quiz_number', 'question_number'], 1)


### Find Similiar Users in the Dataset

In [19]:
#Filtering out users that have rated items which the input user has rated, too.
userSubset = df[df['item_id'].isin(inputItems['item_id'].tolist())]
print ("Users in dataset that have rated the same item as input user: \n", userSubset.head())

Users in dataset that have rated the same item as input user: 
     user_id  item_id  rating
1         0      734       1
3         0     3231       0
5         0     4005       1
9         0      513       0
10        0     2848       0


In [20]:
# As a single user might answer the same question more than one time, we would like to avoid the situation for 
# easeness of user similiarity calculation. We only keep the "like" answer when duplicates happen.
userSubset = userSubset.groupby(["user_id", "item_id"]).max().reset_index() # fix the above duplicate bug

In [21]:
userSubsetGroup = userSubset.groupby(['user_id'])

In [22]:
# for verification of our unstanding of dataset
print ("The user_id with the most observation in our dataset: \n", userSubset.user_id.value_counts())

The user_id with the most observation in our dataset: 
 11786    99
6273     96
15044    90
16398    84
11810    79
         ..
14825     1
11394     1
4301      1
9783      1
16292     1
Name: user_id, Length: 18884, dtype: int64


In [23]:
for ID in userSubset.user_id.unique():
    if len(userSubsetGroup.get_group(ID)) == 99:
        print ("The user_id = ", ID)
        break

The user_id =  11786


In [24]:
#Sorting it so users with items most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [25]:
userSubsetGroup[0:3]

[(11786,
         user_id  item_id  rating
  93285    11786       16       0
  93286    11786       48       1
  93287    11786       61       1
  93288    11786       89       1
  93289    11786      108       0
  ...        ...      ...     ...
  93379    11786     4800       0
  93380    11786     4857       1
  93381    11786     4889       1
  93382    11786     4921       0
  93383    11786     4939       0
  
  [99 rows x 3 columns]),
 (6273,
         user_id  item_id  rating
  49487     6273      104       1
  49488     6273      106       1
  49489     6273      160       1
  49490     6273      198       0
  49491     6273      204       1
  ...        ...      ...     ...
  49578     6273     4721       1
  49579     6273     4921       1
  49580     6273     4925       0
  49581     6273     4981       0
  49582     6273     4986       1
  
  [96 rows x 3 columns]),
 (15044,
          user_id  item_id  rating
  119253    15044       28       0
  119254    15044       42    

### User Similiarity Calculation
- We will calculate the similiarity between the users in the dataset and the user from the input.
- We're going to find out how similar each user is to the input through the **Pearson Correlation Coefficient**.

In [26]:
#Store the Pearson Correlation in a dictionary, where the key is the user_id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    inputItems = inputItems.sort_values(by='item_id')
    group = group.sort_values(by='item_id')
    
    #Get the N for the formula
    nRatings_group = len(group)

    #Get the review scores for the items that they both have in common
    temp_df = inputItems[inputItems['item_id'].isin(group['item_id'].tolist())]
    
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist() # X
    # for verification, the two numbers shall be the same
    if len(group) != len(temp_df): 
        break
        
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()    # Y
    # for verification, the two numbers shall be the same
    if len(tempRatingList) != len(tempGroupList): 
        break

    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList])/float(nRatings_group) - pow(sum(tempRatingList)/float(nRatings_group), 2)
        
    Syy = sum([i**2 for i in tempGroupList])/float(nRatings_group) - pow(sum(tempGroupList)/float(nRatings_group),2)
    
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList))/float(nRatings_group) - (sum(tempRatingList)/float(nRatings_group))*(sum(tempGroupList)/float(nRatings_group))

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [27]:
pearsonCorrelationDict.items()

dict_items([(11786, 0.10280786170305843), (6273, 0.18782402666365872), (15044, 0.06578683696810747), (16398, -0.0934840022852446), (11810, 0.08459542116087194), (5118, 0.14406548870054156), (74, 0.22158001855077183), (19739, 0.17804334484284684), (5745, 0.16222142113076277), (1258, 0.1967223688411586), (1436, 0.07196969696969681), (10944, 0.13312693498452058), (15867, 0.06558300085401735), (16236, 0.24019223070763107), (18810, 0.0961232974701556), (19288, 0.1503893404502031), (17899, -0.0010783177012186672), (11460, 0.14400244857369446), (19246, 0.08352612997481396), (6985, -0.08492077756084478), (7912, 0.16392133663309602), (17042, 0.09332965506148581), (17531, 0.12769230769230838), (12933, 0.1146340578217826), (19211, 0.2736123023541794), (3528, 0.026856632724128398), (16144, 0.12845499917553616), (17196, 0.2831675361164617), (7326, -0.01511857892036934), (10839, 0.048911598804452165), (14977, 0.22480720075647434), (15003, 0.27668578554643003), (10764, 0.22538272639745238), (11662, 0

In [29]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,user_id
0,0.102808,11786
1,0.187824,6273
2,0.065787,15044
3,-0.093484,16398
4,0.084595,11810


In [30]:
# For potential computation cost, we will get the top 50 users that are most similar to the input.
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print ("A glance of the top 50 user_id: \n", topUsers.head())

A glance of the top 50 user_id: 
        similarityIndex  user_id
15477              1.0     9025
14346              1.0    18694
7745               1.0     8865
13536              1.0    10538
10060              1.0     9798


### Generate Recommendation DataFrame

In [32]:
# merge with our dataset to get the items these similiar users have rated
topUsersRating=topUsers.merge(df, left_on='user_id', right_on='user_id', how='inner')

print ("A glance of the top 50 user_id with items they rated: \n", topUsersRating.head())
print ("\n Number of observations: ", len(topUsersRating))

A glance of the top 50 user_id with items they rated: 
    similarityIndex  user_id  item_id  rating
0              1.0     9025     4688       1
1              1.0     9025     1119       1
2              1.0     9025     1417       0
3              1.0     9025      373       1
4              1.0     9025     3625       0

 Number of observations:  1240


In [34]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']

print ("A glance of the top 50 user_id with weighted rating: \n", topUsersRating.head())

A glance of the top 50 user_id with weighted rating: 
    similarityIndex  user_id  item_id  rating  weightedRating
0              1.0     9025     4688       1             1.0
1              1.0     9025     1119       1             1.0
2              1.0     9025     1417       0             0.0
3              1.0     9025      373       1             1.0
4              1.0     9025     3625       0             0.0


In [51]:
#Applies a sum to the topUsers after grouping it up by item_id
tempTopUsersRating = topUsersRating.groupby('item_id').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']

print ("A glance of the item_id that the input user might like from the most similiar 50 users: \n", tempTopUsersRating.head())
print ("\n Number of observations: ", len(tempTopUsersRating))

A glance of the item_id that the input user might like from the most similiar 50 users: 
          sum_similarityIndex  sum_weightedRating
item_id                                         
5                        1.0                 1.0
12                       1.0                 0.0
13                       2.0                 0.0
14                       1.0                 1.0
26                       1.0                 0.0

 Number of observations:  1076


In [41]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['item_id'] = tempTopUsersRating.index

print ("A glance of the weighted average recommendation scores for these item_id: \n", recommendation_df.head())


A glance of the weighted average recommendation scores for these item_id: 
          weighted average recommendation score  item_id
item_id                                                
5                                          1.0        5
12                                         0.0       12
13                                         0.0       13
14                                         1.0       14
26                                         0.0       26


In [42]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)

print ("A glance of the sorted weighted average recommendation scores for these item_id: \n", recommendation_df.head())

A glance of the sorted weighted average recommendation scores for these item_id: 
          weighted average recommendation score  item_id
item_id                                                
5                                          1.0        5
2910                                       1.0     2910
2829                                       1.0     2829
2848                                       1.0     2848
2854                                       1.0     2854


### Evaluation

In [45]:
# We make recommendation of the item_ids who have the max similiartity score, 1.0
recommendation_df_max = recommendation_df[recommendation_df['weighted average recommendation score'] == np.max(recommendation_df['weighted average recommendation score'])]

print ("A glance of the list of recommendation based on input user's profile: \n", recommendation_df_max.head())
print ("\n Number of observations: ", len(recommendation_df_max))

A glance of the list of recommendation based on input user's profile: 
          weighted average recommendation score  item_id
item_id                                                
5                                          1.0        5
2910                                       1.0     2910
2829                                       1.0     2829
2848                                       1.0     2848
2854                                       1.0     2854

 Number of observations:  565


In [46]:
# Compared our recommendations with the personalized dataset, which are also collected from another 
# recommendation system

overlapped_items = recommendation_df_max[recommendation_df_max.item_id.isin((df_personalized[df_personalized.user_id == 13613].item_id.to_list()))]
print ("The overlapped items betweeen the 2 recommendation systems: \n", overlapped_items)
print ("\n Number of observations: ", len(overlapped_items))

The overlapped items betweeen the 2 recommendation systems: 
          weighted average recommendation score  item_id
item_id                                                
2821                                       1.0     2821
2807                                       1.0     2807
3303                                       1.0     3303
3357                                       1.0     3357
3228                                       1.0     3228
3083                                       1.0     3083
3092                                       1.0     3092
3125                                       1.0     3125
3205                                       1.0     3205
1933                                       1.0     1933
1993                                       1.0     1993
2616                                       1.0     2616
1855                                       1.0     1855
2394                                       1.0     2394
2205                                      

In [47]:
# We'd like to know on these overlapped items, does the input user (user_id = 13613) really like these items or not?
df_compared = df_personalized[df_personalized.user_id == 13613][df_personalized.item_id.isin(overlapped_items.item_id.to_list())]

print ("The overlapped items tested by the input user: \n", df_compared)

The overlapped items tested by the input user: 
         user_id     quiz_type  quiz_number  question_number  item_id  rating
753052    13613  personalized            1               12     3083       1
753059    13613  personalized            1               19     4331       1
753060    13613  personalized            2               20      789       1
753066    13613  personalized            2               26     1600       1
753067    13613  personalized            2               27     3205       1
753094    13613  personalized            5               54     3445       1
753095    13613  personalized            5               55     4962       1
753139    13613  personalized            9               99     1933       1
753180    13613  personalized           14              140     2205       1
753186    13613  personalized           14              146     4673       1
753207    13613  personalized           16              167     2394       1
753357    13613  personaliz

  df_compared = df_personalized[df_personalized.user_id == 13613][df_personalized.item_id.isin(overlapped_items.item_id.to_list())]


In [50]:
# We define a metric call success recommendation rate, which means the ratio the input user really like our recommendations
success_recommendation_rate = np.mean(df_compared.rating)

print ("The success recommendation rate for our system is: {:0.2f} %".format(success_recommendation_rate*100))

The success recommendation rate for our system is: 73.68 %
