In [169]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine, euclidean

### Task 1: Import data

In [170]:
df_data1 = pd.read_csv("DBbook_train_ratings.tsv", sep="\t")
print(df_data1.head())

   userID  itemID  rate
0    6873    3201     4
1    6873    3098     4
2    6873    4198     4
3    6873    5950     4
4    6873     204     4


In [171]:
print("Unique users: %d" % len(df_data1["userID"].unique()))
print("Unique books: %d" % len(df_data1["itemID"].unique()))

Unique users: 6181
Unique books: 6166


### Task 2: Utility matrix

In [172]:
dense_matrix = df_data1.pivot_table(values="rate", index=["userID"], columns=["itemID"])
print("Shape of the user-item matrix: %d x %d" % dense_matrix.shape)
print(dense_matrix.head(10))

Shape of the user-item matrix: 6181 x 6166
itemID  1     2     3     5     7     8     9     11    12    13    ...  8157  \
userID                                                              ...         
1        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
2        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
3        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
4        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
5        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
6        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
7        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
8        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
9        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
10       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN

In [173]:
print("Number of cells in the utility matrix with NaN: %d" % dense_matrix.isna().sum().sum())

Number of cells in the utility matrix with NaN: 38036488


In [174]:
print("Percentage of cells in the utility matrix with NaN: {:.2%}".format(dense_matrix.isna().sum().sum()/(len(dense_matrix.columns)*len(dense_matrix.index))))

Percentage of cells in the utility matrix with NaN: 99.80%


In [175]:
dense_matrix = dense_matrix.fillna(0)
dense_matrix.head()

itemID,1,2,3,5,7,8,9,11,12,13,...,8157,8160,8161,8162,8163,8164,8166,8167,8168,8169
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Task 3: userID 2 - top 5 similar users based on Euclidean distance

In [176]:
def top_k_users(user_number, k):
    df_sim = dense_matrix.loc[dense_matrix.index != user_number] # remove the active user
    df_sim["distance"] = df_sim.apply(lambda x: euclidean(dense_matrix.loc[user_number], x), axis=1)     # calculate the distance for between the given user and each row
    return df_sim.sort_values(by="distance").head(k)["distance"]     # return the top k from the sorted distances

top_k_users(2, 5) # retrieve top five similar users to userID 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


userID
3917    11.401754
6875    12.569805
986     12.806248
1983    12.884099
1156    12.922848
Name: distance, dtype: float64

### Task 4: Comparisons - Euclidean distance

In [177]:
print("The Euclidean distance between itemID 18 and itemID 1: {:.6f}".format(euclidean(dense_matrix[18], dense_matrix[1])))

The Euclidean distance between itemID 18 and itemID 1: 29.189039


In [178]:
print("The Euclidean distance between itemID 36 and itemID 1: {:.6f}".format(euclidean(dense_matrix[36], dense_matrix[1])))

The Euclidean distance between itemID 36 and itemID 1: 40.124805


In [179]:
print("ItemID 18 is more similar to itemID 1 because its Euclidean distance is less than that of itemID 36.")

ItemID 18 is more similar to itemID 1 because its Euclidean distance is less than that of itemID 36.


### Task 5: itemID 8010 - top 5 similar items based on Euclidean distance

In [180]:
def top_k_items(item_number, k):
    df_sim = dense_matrix.transpose() # copy the dense matrix and transpose it so each row represents an item
    df_sim = df_sim.loc[df_sim.index != item_number] # remove the active item 
    df_sim["distance"] = df_sim.apply(lambda x: euclidean(dense_matrix[item_number], x), axis=1) # calculate the distance between the given item for each row (apply the function to each row if axis = 1)
    return df_sim.sort_values(by="distance").head(k)["distance"] # return the top k from the sorted distances    

top_k_items(8010, 5) # retrieve top five similar items to itemID 8010

itemID
3711    127.921851
4559    127.964839
330     129.715072
1311    129.722781
7328    129.761319
Name: distance, dtype: float64

### Task 6: remove books and users with less than 20 rating scores

In [181]:
df_item_fre = df_data1.groupby("itemID").count()
df_user_fre = df_data1.groupby("userID").count()
selected_items = df_item_fre[df_item_fre["userID"]>20].index
dense_matrix = dense_matrix[selected_items]
selected_users = df_user_fre[df_user_fre["itemID"]>20].index
dense_matrix = dense_matrix.loc[selected_users]
print(dense_matrix.shape)

(766, 776)


### Task 7: itemID8010 - value counts

In [182]:
#remove users that haven't rated itemID8010
df_itemID8010 = dense_matrix[dense_matrix[8010] != 0]

In [183]:
#counts of the different rating scores for itemID8010
df_itemID8010[8010].value_counts()

4.0    68
5.0    58
3.0    27
2.0    13
1.0     8
Name: 8010, dtype: int64

In [184]:
#shape of the dataset
print(df_itemID8010.shape)

(174, 776)


### Task 8: Partition data set

In [185]:
# print the dimensions of the training set and testing set

from sklearn.model_selection import train_test_split

df_x = df_itemID8010[[col for col in df_itemID8010.columns if col != 8010]] # create a data frame for the predictors
df_y = df_itemID8010[[8010]] # create a series for the outcome

train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size=0.25, random_state=0)
df_train_x = pd.DataFrame(train_x, columns=df_x.columns)
df_test_x = pd.DataFrame(test_x, columns=df_x.columns)
df_train_y = pd.DataFrame(train_y, columns=[8010])
df_test_y = pd.DataFrame(test_y, columns=[8010])

print("Training set dimensions: train_x is " + str(df_train_x.shape) + " train_y is " + str(df_train_y.shape) + ".")
print("Test set dimensions: test_x is " + str(df_test_x.shape) + " test_y is " + str(df_test_y.shape) + ".")

Training set dimensions: train_x is (130, 775) train_y is (130, 1).
Test set dimensions: test_x is (44, 775) test_y is (44, 1).


In [186]:
print("Training set mean rating for itemID8010: {:.6f}".format(train_y[8010].mean()))

Training set mean rating for itemID8010: 3.915385


In [187]:
print("Testing set mean rating for itemID8010: {:.6f}".format(test_y[8010].mean()))

Testing set mean rating for itemID8010: 3.818182


### Task 9: Predicted rating

In [188]:
# print the userID of the the user in the 5th row (not userID5) in the test dataset
uid = df_test_x.index[4] 
print(uid)

5614


In [189]:
k = 5 # specify the number of similar users to retrieve

def user_based_predict(user_number):
    # retrieve the top k similar users
    # copy from all the training predictors
    df_sim = df_train_x.copy()
    # for each user, calculate the distance between this user and the active user
    df_sim["distance"] = df_sim.apply(lambda x: euclidean(df_test_x.loc[user_number], x), axis=1)
    # create a new data frame to store the top k similar users
    df_sim_users = df_sim.loc[df_sim.sort_values(by="distance").head(k).index]
    # calculate these similar users' rating on itemID 8010, weighted by distance
    df_sim_users["weighed_d"] = list(map(lambda x: df_sim_users.loc[x]["distance"]*df_train_y.loc[x][8010], df_sim_users.index))
    predicted = df_sim_users["weighed_d"].sum()/df_sim_users["distance"].sum()
    return predicted

In [190]:
print("Predicted rating on itemID 8010:", user_based_predict(uid))
print("Actual rating on itemID 8010:     ", df_test_y.loc[uid][8010])

Predicted rating on itemID 8010: 4.013489148846391
Actual rating on itemID 8010:      4.0


### Task 10: Describe filtering methods

#### Use your own language (<= 6 sentences) to briefly describe content-based filtering. 

Content-based filtering:
This type of recommender system generates user preference information. These user preferences are then used to predict user ratings. Finally, the predicted ratings are used to recommend items to users. In this approach, the item feature vectors are already known, but the user feature vectors must be learned. Information on other users is not needed to utilize content-based filtering. 

#### Use your own language (<= 6 sentences) to briefly describe model-based collaborative filtering. 

Model-based collaborative filtering:
This type of recommender system generates user rating information. It takes a utility matrix of user ratings and items and performs singular value decomposition (SVD). Unknown feature vectors and unknown user vectors are initialized to small random values, and gradient descent is used to solve the optimzation problem. 

#### Summarize the differences between content-based filtering and model-based collaborative filtering? (at least two differences). 

In content-based filtering, item features are known, but user features are unknown. In model-based collaborative filtering, both item features and user features are not known.

Conent-based filtering can be used for new items because it doesn't suffer from cold start problems on items. Model-based collaborative filtering cannot be used on new items.
