In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [5]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [6]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [7]:
allRatings

[('u93397390', 'b52690052', 3),
 ('u93952353', 'b52355478', 1),
 ('u09633433', 'b89163374', 0),
 ('u91366781', 'b78391921', 5),
 ('u50251394', 'b80097453', 0),
 ('u98333620', 'b80430178', 3),
 ('u77448169', 'b59839746', 5),
 ('u26756934', 'b97033148', 5),
 ('u05133116', 'b80332084', 4),
 ('u83823213', 'b21044596', 0),
 ('u03796919', 'b06193959', 3),
 ('u27298212', 'b63754598', 4),
 ('u52859955', 'b50701271', 5),
 ('u68110777', 'b71589871', 4),
 ('u87078306', 'b72186649', 4),
 ('u57763741', 'b11312993', 0),
 ('u92956783', 'b35937909', 4),
 ('u16610380', 'b85897452', 2),
 ('u04899495', 'b42348418', 5),
 ('u68380623', 'b93099714', 2),
 ('u31697730', 'b64422042', 5),
 ('u03418373', 'b21086057', 5),
 ('u51020221', 'b76504829', 4),
 ('u49802588', 'b59839746', 5),
 ('u43834241', 'b23777223', 5),
 ('u18989834', 'b47202435', 5),
 ('u38062310', 'b64156373', 3),
 ('u96640180', 'b59171338', 4),
 ('u54345243', 'b10001514', 3),
 ('u45225765', 'b93506529', 4),
 ('u51008934', 'b01624394', 4),
 ('u4176

In [7]:
# Reviews 1 - 190,000 for training 
ratingsTrain = allRatings[:190000]
# Reviews 190,001 - 200,000 for validation 
ratingsValid = allRatings[190000:]
# For all data 
global_ratingsPerUser = defaultdict(list)
global_ratingsPerItem = defaultdict(list)
# For just training data
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)

# For all data including training and validation 
for u,b,r in allRatings:
    global_ratingsPerUser[u].append((b,r))
    global_ratingsPerItem[b].append((u,r))
    
# Used only for training data 
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))

#### Generating Negative Samples

In [8]:
# Data from all the books in the dataset because we want all possible books in order to select a book the user has not read 
all_books_data = set(global_ratingsPerItem.keys())
# Selecting all the readers in the validation dataset
readers_validation = [u[0] for u in ratingsValid]

for u in readers_validation:
    all_books_user_read = set(b[0] for b in global_ratingsPerUser[u])
    # Line below selects one book the user has not read yet from all possible books not read 
    book_user_no_read = random.sample(list(all_books_data - all_books_user_read), 1)[0]
    # Appending the negative sample, so appending the user, the book the user hasn't read, and a score representing what the user hasn't read
    ratingsValid.append((u, book_user_no_read, -1))

In [9]:
# Baseline 
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
threshold = 0.74
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalRead * threshold: break

In [10]:
from collections import defaultdict

book_ratings = defaultdict(list)
for user, book, rating in allRatings:
    book_ratings[book].append(rating)

book_avg_ratings = {}
for book, ratings in book_ratings.items():
    avg_rating = sum(ratings) / len(ratings)
    book_avg_ratings[book] = avg_rating

most_highly_rated_books = sorted(book_avg_ratings.items(), key=lambda x: x[1], reverse=True)

most_highly_rated_books = [book for (book, rating) in most_highly_rated_books if rating > 4]

In [11]:
user_interactions = defaultdict(list)
for user,book,_ in ratingsValid:
    user_interactions[user].append(bookCount[book])


user_threshold = defaultdict(int)
for user, _,_ in ratingsValid:
    interactions = user_interactions[user]
    interactions = sorted(interactions)
    user_threshold[user] = interactions[len(interactions)//2]

### Trying on Test Dataset

In [12]:
import pandas as pd
df_ratings_test = pd.read_csv('predictions_Read.csv')

In [13]:
book_count_test = defaultdict(int)
total_book_count = 0
for index, row in df_ratings_test.iterrows():
    user = row['userID']
    book = row['bookID']
    book_count_test[book] += 1
    total_book_count += 1

In [14]:
book_count_test

defaultdict(int,
            {'b80407575': 20,
             'b22251874': 12,
             'b59334959': 2,
             'b96807645': 1,
             'b93777449': 5,
             'b02250808': 6,
             'b12715496': 6,
             'b74622707': 1,
             'b33229445': 6,
             'b55417152': 1,
             'b06804790': 1,
             'b43975237': 3,
             'b56379602': 3,
             'b51752826': 54,
             'b35509876': 5,
             'b25350833': 19,
             'b32262230': 2,
             'b12138527': 2,
             'b85897452': 6,
             'b21827216': 4,
             'b39380938': 4,
             'b62181041': 4,
             'b65888554': 7,
             'b72571590': 4,
             'b93239672': 13,
             'b33892919': 35,
             'b18929421': 9,
             'b34817039': 2,
             'b48994797': 2,
             'b09990540': 3,
             'b10395308': 2,
             'b29795443': 22,
             'b00852710': 19,
             'b620

In [15]:
user_interactions = defaultdict(list)
for index, row in df_ratings_test.iterrows():
    user = row['userID']
    book = row['bookID']
    user_interactions[user].append(book_count_test[book])


user_threshold = defaultdict(int)
for index, row in df_ratings_test.iterrows():
    user = row['userID']
    interactions = user_interactions[user]
    interactions = sorted(interactions)
    user_threshold[user] = interactions[len(interactions)//2]

In [16]:
user_interactions = defaultdict(list)
for index, row in df_ratings_test.iterrows():
    user = row['userID']
    book = row['bookID']
    user_interactions[user].append(bookCount[book])


user_threshold_2 = defaultdict(int)
for index, row in df_ratings_test.iterrows():
    user = row['userID']
    interactions = user_interactions[user]
    interactions = sorted(interactions)
    user_threshold_2[user] = interactions[len(interactions)//2]

In [62]:
# Test prediction model code
def predictOnTest(user, book):
    sim_threshold = 0.0025
    '''
    ### Jaccard comparing b' to b 
    # users_set_book contains all users of the current book 
    users_set_book = set(u[0] for u in ratingsPerItem[book])
    # user_books_read contains all the books the user has read so we can compare b' to b 
    user_books_read = [b[0] for b in ratingsPerUser[user]]
    max_jacc_sim_books = 0
    avg_jacc_sim_book = []

    for b_prime in user_books_read:
        if b_prime == book:
            continue 
        users_b_prime = set(u[0] for u in ratingsPerItem[b_prime])
        jacc_sim = Jaccard(users_set_book, users_b_prime)
        max_jacc_sim_books = max(max_jacc_sim_books, jacc_sim)
        avg_jacc_sim_book.append(jacc_sim)
    
    if len(avg_jacc_sim_book) != 0:
        avg_jacc_sim_book = sum(avg_jacc_sim_book)/len(avg_jacc_sim_book)
    else:
        avg_jacc_sim_book = 0
        
    ### Jaccard comparing u' to u 
    # books_read_user contains all books read by current user 
    books_read_user = set([u[0] for u in ratingsPerUser[user]])
    # users_read_book contains all the users who read the book so we can compare u' to u 
    users_read_book = [b[0] for b in ratingsPerItem[book]]
    max_jacc_sim_users = 0
    avg_jacc_sim_users = []
    
    for u_prime in users_read_book:
        if u_prime == user:
            continue 
        user_prime_books = set(u[0] for u in ratingsPerUser[u_prime])
        jacc_sim = Jaccard(books_read_user, user_prime_books)
        max_jacc_sim_users = max(max_jacc_sim_users, jacc_sim)
        avg_jacc_sim_users.append(jacc_sim)
    
    if len(avg_jacc_sim_users) != 0:
        avg_jacc_sim_users = sum(avg_jacc_sim_users)/len(avg_jacc_sim_users)
    else:
        avg_jacc_sim_users = 0
   
  
    ### Popularity 
    popularity = 0 
    if book in return1:
        popularity = 1
    else: 
        popularity = 0

    # Top Half Set 
    top_half = int(bookCount[book] >= user_threshold[user])

    # bookCount[book]/totalRead,

    return max_jacc_sim_users > sim_threshold and max_jacc_sim_books > sim_threshold and \
            avg_jacc_sim_users > sim_threshold and avg_jacc_sim_book > sim_threshold and book in return1 \
            and bookCount[book] >= user_threshold[user]
    '''
    
    return ((book in return1) or (book in most_highly_rated_books)) and bookCount[book] >= user_threshold_2[user] 

In [63]:
predictions = open("predictions_Read.csv", 'w')
for l in open("pairs_Read.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    # Calling Model 
    prediction = predictOnTest(u, b)
    print(prediction)
    predictions.write(f"{u},{b},{int(prediction)}\n")

predictions.close()

False
True
False
False
False
True
True
False
True
False
False
True
False
True
False
True
False
False
True
False
False
False
True
True
True
True
True
False
False
False
False
True
True
False
True
True
False
False
True
True
True
False
False
True
False
False
True
True
False
False
False
False
False
True
True
False
True
False
True
False
True
False
True
True
False
True
True
True
False
False
False
True
False
False
False
False
False
False
False
True
False
True
False
False
False
False
True
True
False
True
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
True
False
False
False
False
True
False
True
False
True
False
True
False
False
False
False
True
True
True
True
True
True
False
False
False
False
True
False
False
False
True
False
True
False
False
True
False
True
False
False
False
True
True
True
True
False
True
True
False
False
False
True
False
False
True
True
False
True
False
True
False
False
True
True
True
False
False
True
True
True
False
True
T

# Rating Prediction - latent factor model approach

In [67]:
# Setup for rating prediction
# allRatings contains both train and test samples 
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [68]:
allRatings[0]

('u93397390', 'b52690052', 3)

In [69]:
# Reviews 1 - 190,000 for training 
ratingsTrain = allRatings[:190000]
# Reviews 190,001 - 200,000 for validation 
ratingsValid = allRatings[190000:]
'''
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)

# Only for training data 
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r)) # every user and the list of reviews they gave 
    ratingsPerItem[b].append((u,r)) # every book and the list of users for that book 
'''

'\nratingsPerUser = defaultdict(list)\nratingsPerItem = defaultdict(list)\n\n# Only for training data \nfor u,b,r in ratingsTrain:\n    ratingsPerUser[u].append((b,r)) # every user and the list of reviews they gave \n    ratingsPerItem[b].append((u,r)) # every book and the list of users for that book \n'

In [70]:
# Getting the mean rating so we can substitute when user doesn't have ratings 
total_ratings = [d[2] for d in allRatings]
mean_global_rating = sum(total_ratings)/len(total_ratings)

In [71]:
# Setting up dictionaries for latent factor model 
global_ratingsPerUser = defaultdict(list)
global_ratingsPerItem = defaultdict(list)
global_userItem_rating = defaultdict(float)

for user, book, rating in allRatings:
    global_ratingsPerUser[user].append((book, rating))
    global_ratingsPerItem[book].append((user, rating))
    global_userItem_rating[(user, book)] = rating

In [72]:
# These will be updated in the gradient descent 
userBiases = defaultdict(float)
itemBiases = defaultdict(float)
alpha = mean_global_rating # intializing here because mean is a good starting point 

for user in global_ratingsPerUser:
    userBiases[user] = 0 

for book in global_ratingsPerItem:
    itemBiases[book] = 0

In [73]:
# Defining functions to iteratively adjust params alpha, userBiases, and itemBiases to 
# minimize MSE between the actual rating and the predicted rating 
# regularization term added to prevent overfitting 

# Iteratively adjusting alpha to reduce MSE 
def alpha_iteration():
    global alpha 
    # training on the training data
    numerator = sum(global_userItem_rating[(user, book)] - (userBiases[user] + itemBiases[book]) for user, book, _ in ratingsTrain)
    alpha = numerator/len(ratingsTrain)
    
# Iteratively adjusting itemBiases to reduce MSE 
def itemBiases_iteration(reg_lamb):
    global alpha 
    for book in global_ratingsPerItem:
        numerator = sum(global_userItem_rating[(user, book)] - (userBiases[user] + alpha) for user, rating in global_ratingsPerItem[book])
        itemBiases[book] = numerator/(reg_lamb + len(global_ratingsPerItem[book]))

# Iteratively adjusting userBiases to reduce MSE 
def userBiases_iteration(reg_lamb):
    global alpha 
    for user in global_ratingsPerUser:
        numerator = sum(global_userItem_rating[(user, book)] - (itemBiases[book] + alpha) for book, rating in global_ratingsPerUser[user])
        userBiases[user] = numerator/(reg_lamb + len(global_ratingsPerUser[user]))


In [74]:
# Defining MSE for check on validation set 
def MSE():
    error = sum((rating - predict(user, book))**2 for user, book, rating in ratingsValid)
    return error/len(ratingsValid)

In [75]:
# Iteration logic 
def updating_biases_and_alpha(iters, reg_lamb):
    MSE_min = 0
    for i in range(iters):
        alpha_iteration()
        userBiases_iteration(reg_lamb)
        itemBiases_iteration(reg_lamb)
        MSE_post_update = MSE()
        if i % 10 == 0:
            print("iteration ", i, " MSE: ", MSE_post_update)
        MSE_min = min(MSE_post_update, MSE_min)
    return MSE_post_update

In [77]:
validMSE = updating_biases_and_alpha(40, 4.0)

iteration  0  MSE:  1.11870844647806
iteration  10  MSE:  1.118684416648891
iteration  20  MSE:  1.1186756943648484
iteration  30  MSE:  1.118672460519689


In [78]:
alpha

3.538053157865437

In [79]:
# Prediction function based on defined user/item biases and alpha after iteration for adjusting MSE 
def predict(user, book):
    global alpha 
    # case analysis 
    if user not in global_ratingsPerUser and book not in global_ratingsPerItem:
        # cold start problem happens here 
        # return global mean because we don't have any other information 
        return alpha 
    elif user not in global_ratingsPerUser and book in global_ratingsPerItem:
        return alpha + itemBiases[book]
    elif user in global_ratingsPerUser and book not in global_ratingsPerItem:
        return alpha + userBiases[user]
    else:
        # book and user book exist 
        return alpha + itemBiases[book] + userBiases[user]

In [80]:
#userBiases

In [81]:
#itemBiases

In [82]:
predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"): # header
        predictions.write(l)
        continue
    u,b = l.strip().split(',') # Read the user and item from the "pairs" file and write out your prediction
    # Calling Model 
    prediction_value = predict(u, b)
    predictions.write(f"{u},{b},{prediction_value}\n")
    
predictions.close()