# Data Science Final Project

## Part 1: Data Selection and Preprocessing

In [98]:
#Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#### Loading Amazon Video Game dataset

In [92]:
game_df = pd.read_json('Video_Games_5.json', lines=True)
game_df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,0700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,0700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,0700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,0700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,0700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
497572,4,True,"07 1, 2017",AVECM71LSZLC5,B01HGPUTCA,boris teplitskiy,not OEM but good replacement parts,Four Stars,1498867200,,,
497573,3,True,"08 20, 2018",A1RS06313BL6WN,B01HH6JEOC,Tom Stopsign,Okay stuff.,Three Stars,1534723200,,"{'Edition:': ' Kids Room', 'Platform:': ' PC O...",
497574,3,True,"08 7, 2017",ACIZ77IGIX2JL,B01HH6JEOC,Era,This does add some kids room things that are v...,Only buy on sale.,1502064000,,"{'Edition:': ' Kids Room', 'Platform:': ' PC O...",
497575,4,False,"08 5, 2018",A34GG58TJ1A3SH,B01HIZF7XE,seamonkey10,I think I originally began playing Bioshock se...,"It's Okay, Nothing Profound",1533427200,,"{'Edition:': ' Collection', 'Platform:': ' Xbo...",


In [93]:
game_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497577 entries, 0 to 497576
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   overall         497577 non-null  int64 
 1   verified        497577 non-null  bool  
 2   reviewTime      497577 non-null  object
 3   reviewerID      497577 non-null  object
 4   asin            497577 non-null  object
 5   reviewerName    497501 non-null  object
 6   reviewText      497419 non-null  object
 7   summary         497468 non-null  object
 8   unixReviewTime  497577 non-null  int64 
 9   vote            107793 non-null  object
 10  style           289237 non-null  object
 11  image           3634 non-null    object
dtypes: bool(1), int64(2), object(9)
memory usage: 42.2+ MB


In [None]:
# Creating training/testing datasets (Should run in about 30-40 secs)

train_list = []
test_list = []

for user, user_ratings in game_df.groupby('reviewerID'):
    train = user_ratings.sample(frac=0.8)
    test = user_ratings.drop(train.index)
    train_list.append(train)
    test_list.append(test)
    

train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

In [95]:
# Check for 80/20 output
print(f'train: {round(train_df.size/game_df.size*100,2)}')
print(f'test: {round(test_df.size/game_df.size*100,2)}')

train: 80.54
test: 19.46


## Part 2: Rating Prediction

In [99]:
train_df = train_df[['reviewerID', 'asin', 'overall']]
test_df = test_df[['reviewerID', 'asin', 'overall']]

user_item_matrix = train_df.pivot_table(
    index='reviewerID',
    columns = 'asin',
    values = 'overall'
).fillna(0)

prod_ids = user_item_matrix.columns
user_ids = user_item_matrix.index

In [100]:
user_item_matrix


asin,0700026398,0700026657,0700099867,0804161380,3828770193,6050036071,7293000936,7544256944,8176503290,8565000168,...,B01HD1B76O,B01HD2TECW,B01HDJFJKG,B01HDJFJLK,B01HDJFJOM,B01HFRICLE,B01HGPUTCA,B01HH6JEOC,B01HIZF7XE,B01HIZGKOE
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0059486XI1Z0P98KP35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0220159ZRNBTRKLG08H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0266076X6KPZ6CCHGVS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0277912HT4JSJKVSL3E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02836981FYG9912C66F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZZNK89PXD006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZQCK9ZAKMFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZT1ERHBSNQ8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZTC2OYVNE2Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
prod_sim = cosine_similarity(user_item_matrix.T)
prod_sim = pd.DataFrame(prod_sim, index=prod_ids, columns= prod_ids)

In [111]:
prod_sim

asin,0700026398,0700026657,0700099867,0804161380,3828770193,6050036071,7293000936,7544256944,8176503290,8565000168,...,B01HD1B76O,B01HD2TECW,B01HDJFJKG,B01HDJFJLK,B01HDJFJOM,B01HFRICLE,B01HGPUTCA,B01HH6JEOC,B01HIZF7XE,B01HIZGKOE
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0700026398,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0700026657,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0700099867,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0804161380,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3828770193,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01HFRICLE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
B01HGPUTCA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0
B01HH6JEOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0
B01HIZF7XE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.036468,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
def predict_rating(user, prod, user_item_matrix, prod_sim):

    if prod not in prod_sim.index:
        return np.nan

    user_ratings = user_item_matrix.loc[user]
    
    rated_items = user_ratings[user_ratings > 0].index
    
    numerator = 0
    denominator = 0
    for rated_item in rated_items:
        sim = prod_sim.loc[prod, rated_item]
        numerator += sim * user_ratings[rated_item]
        denominator += abs(sim)

    if denominator == 0:
        return np.nan
    return numerator / denominator

In [107]:
test_df['predicted_rating'] = test_df.apply(lambda row: predict_rating(row['reviewerID'], row['asin'], user_item_matrix, prod_sim),axis=1)

In [108]:
test_df

Unnamed: 0,reviewerID,asin,overall,predicted_rating
0,A0059486XI1Z0P98KP35,B012JMS4W2,5,
1,A0220159ZRNBTRKLG08H,B00A878J5I,5,3.565619
2,A0266076X6KPZ6CCHGVS,B003RDEV8E,5,
3,A0266076X6KPZ6CCHGVS,B0086V5TVU,5,5.000000
4,A0266076X6KPZ6CCHGVS,B00AECBKEY,5,
...,...,...,...,...
96838,AZZQCK9ZAKMFR,B00N4ABT1C,5,5.000000
96839,AZZQCK9ZAKMFR,B00YGHG8J0,5,5.000000
96840,AZZT1ERHBSNQ8,B00KWFCV32,5,5.000000
96841,AZZTC2OYVNE2Q,B001PB9J14,3,4.304989


In [110]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

valid_predictions = test_df.dropna(subset=['predicted_rating'])

actual_ratings = valid_predictions['overall']
predicted_ratings = valid_predictions['predicted_rating']

rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

mae = mean_absolute_error(actual_ratings, predicted_ratings)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


RMSE: 1.088012825642463
MAE: 0.6437052034851589
