# Data Science Final Project

## Part 1: Data Selection and Preprocessing

In [98]:
#Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#### Loading Amazon Video Game dataset

In [92]:
game_df = pd.read_json('Video_Games_5.json', lines=True)
game_df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,0700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,0700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,0700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,0700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,0700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
497572,4,True,"07 1, 2017",AVECM71LSZLC5,B01HGPUTCA,boris teplitskiy,not OEM but good replacement parts,Four Stars,1498867200,,,
497573,3,True,"08 20, 2018",A1RS06313BL6WN,B01HH6JEOC,Tom Stopsign,Okay stuff.,Three Stars,1534723200,,"{'Edition:': ' Kids Room', 'Platform:': ' PC O...",
497574,3,True,"08 7, 2017",ACIZ77IGIX2JL,B01HH6JEOC,Era,This does add some kids room things that are v...,Only buy on sale.,1502064000,,"{'Edition:': ' Kids Room', 'Platform:': ' PC O...",
497575,4,False,"08 5, 2018",A34GG58TJ1A3SH,B01HIZF7XE,seamonkey10,I think I originally began playing Bioshock se...,"It's Okay, Nothing Profound",1533427200,,"{'Edition:': ' Collection', 'Platform:': ' Xbo...",


In [93]:
game_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497577 entries, 0 to 497576
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   overall         497577 non-null  int64 
 1   verified        497577 non-null  bool  
 2   reviewTime      497577 non-null  object
 3   reviewerID      497577 non-null  object
 4   asin            497577 non-null  object
 5   reviewerName    497501 non-null  object
 6   reviewText      497419 non-null  object
 7   summary         497468 non-null  object
 8   unixReviewTime  497577 non-null  int64 
 9   vote            107793 non-null  object
 10  style           289237 non-null  object
 11  image           3634 non-null    object
dtypes: bool(1), int64(2), object(9)
memory usage: 42.2+ MB


In [None]:
# Creating training/testing datasets (Should run in about 30-40 secs)

train_list = []
test_list = []

for user, user_ratings in game_df.groupby('reviewerID'):
    train = user_ratings.sample(frac=0.8)
    test = user_ratings.drop(train.index)
    train_list.append(train)
    test_list.append(test)
    

train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

In [95]:
# Check for 80/20 output
print(f'train: {round(train_df.size/game_df.size*100,2)}')
print(f'test: {round(test_df.size/game_df.size*100,2)}')

train: 80.54
test: 19.46


## Part 2: Rating Prediction

In [None]:
# Create user-item matrix (rows: user, cols: product, values: ratings)
# Should run in 3-4 mins
train_df = train_df[['reviewerID', 'asin', 'overall']]
test_df = test_df[['reviewerID', 'asin', 'overall']]

user_item_matrix = train_df.pivot_table(
    index='reviewerID',
    columns = 'asin',
    values = 'overall'
).fillna(0)

prod_ids = user_item_matrix.columns
user_ids = user_item_matrix.index

In [100]:
user_item_matrix


asin,0700026398,0700026657,0700099867,0804161380,3828770193,6050036071,7293000936,7544256944,8176503290,8565000168,...,B01HD1B76O,B01HD2TECW,B01HDJFJKG,B01HDJFJLK,B01HDJFJOM,B01HFRICLE,B01HGPUTCA,B01HH6JEOC,B01HIZF7XE,B01HIZGKOE
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0059486XI1Z0P98KP35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0220159ZRNBTRKLG08H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0266076X6KPZ6CCHGVS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0277912HT4JSJKVSL3E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02836981FYG9912C66F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZZNK89PXD006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZQCK9ZAKMFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZT1ERHBSNQ8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZTC2OYVNE2Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Create item similarity matrix (calculates pairwise cosine similarity between each product)
# Should run in 3-4 mins
prod_sim = cosine_similarity(user_item_matrix.T)
prod_sim = pd.DataFrame(prod_sim, index=prod_ids, columns= prod_ids)

In [111]:
prod_sim

asin,0700026398,0700026657,0700099867,0804161380,3828770193,6050036071,7293000936,7544256944,8176503290,8565000168,...,B01HD1B76O,B01HD2TECW,B01HDJFJKG,B01HDJFJLK,B01HDJFJOM,B01HFRICLE,B01HGPUTCA,B01HH6JEOC,B01HIZF7XE,B01HIZGKOE
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0700026398,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0700026657,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0700099867,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0804161380,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3828770193,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01HFRICLE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
B01HGPUTCA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0
B01HH6JEOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0
B01HIZF7XE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.036468,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# function to create prediction scores
def predict_rating(user, prod, user_item_matrix, prod_sim):

    if prod not in prod_sim.index:
        return np.nan

    user_ratings = user_item_matrix.loc[user]
    
    rated_items = user_ratings[user_ratings > 0].index
    
    numerator = 0
    denominator = 0
    for rated_item in rated_items:
        sim = prod_sim.loc[prod, rated_item]
        numerator += sim * user_ratings[rated_item]
        denominator += abs(sim)

    if denominator == 0:
        return np.nan
    return numerator / denominator

In [None]:
# creates predicted rating column in test_df using predict_rating function
# should run in 1-2 mins

test_df['predicted_rating'] = test_df.apply(lambda row: predict_rating(row['reviewerID'], row['asin'], user_item_matrix, prod_sim),axis=1)

In [108]:
test_df

Unnamed: 0,reviewerID,asin,overall,predicted_rating
0,A0059486XI1Z0P98KP35,B012JMS4W2,5,
1,A0220159ZRNBTRKLG08H,B00A878J5I,5,3.565619
2,A0266076X6KPZ6CCHGVS,B003RDEV8E,5,
3,A0266076X6KPZ6CCHGVS,B0086V5TVU,5,5.000000
4,A0266076X6KPZ6CCHGVS,B00AECBKEY,5,
...,...,...,...,...
96838,AZZQCK9ZAKMFR,B00N4ABT1C,5,5.000000
96839,AZZQCK9ZAKMFR,B00YGHG8J0,5,5.000000
96840,AZZT1ERHBSNQ8,B00KWFCV32,5,5.000000
96841,AZZTC2OYVNE2Q,B001PB9J14,3,4.304989


In [None]:
# calculates RMSE and MAE for test data based on predictions
from sklearn.metrics import mean_squared_error, mean_absolute_error

valid_predictions = test_df.dropna(subset=['predicted_rating'])

actual_ratings = valid_predictions['overall']
predicted_ratings = valid_predictions['predicted_rating']

rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

mae = mean_absolute_error(actual_ratings, predicted_ratings)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


RMSE: 1.088012825642463
MAE: 0.6437052034851589


## Part 3: Item Recommendation

In [None]:
# creates recommendation list for users (in-progress)
# rec_list is missing about 4000 users (lost when potential_recs is created)
# other issue: rec_list does not provide 10 recommendations for every user (looks like it just takes top 10 predicted ratings from test_df)
    # some users have < 10 products in test_df

user_rated_items = train_df.groupby('reviewerID')['asin'].apply(set).to_dict()

test_df['already_rated'] = test_df.apply(lambda row: row['asin'] in user_rated_items.get(row['reviewerID'], set()), axis=1)

potential_recs = test_df[~test_df['already_rated']]

def top_ten(group):
    return group.sort_values(by='predicted_rating', ascending=False).head(10)['asin'].tolist()

rec_list = (
    potential_recs.groupby('reviewerID')
    .apply(top_ten)
    .to_dict()
)

In [126]:
len(rec_list)

51595

In [122]:
rec_list

{'A0059486XI1Z0P98KP35': ['B012JMS4W2'],
 'A0220159ZRNBTRKLG08H': ['B00A878J5I'],
 'A0266076X6KPZ6CCHGVS': ['B0086V5TVU', 'B003RDEV8E', 'B00AECBKEY'],
 'A0277912HT4JSJKVSL3E': ['B00VU4J8YY', 'B00MU1YEE0'],
 'A02836981FYG9912C66F': ['B00YQ2KCWO'],
 'A0331487QH9BVBTYGNCV': ['B003WY86NE', 'B00MUTAU40'],
 'A03622674FSWUX0PXPAK': ['B00DD0B0BM'],
 'A0380485C177Q6QQNJIX': ['B00BGAA3S2',
  'B00HGLLRV2',
  'B0050SW1WW',
  'B0050SW8AC'],
 'A0385843DE41TCVXH2I1': ['B01AC3ZDCE'],
 'A0429448FT6MHSF724JB': ['B00SVVUOGU'],
 'A0435554Z2P98AIGLNCS': ['B00J48MUS4'],
 'A049248150WLX2UGA57G': ['B00GV4V8XC'],
 'A0667676ELTQ9GS4VZHH': ['B000J3O194'],
 'A0685888WB02Q69S553P': ['B004UDB9SA', 'B000FQBF1M', 'B0050SXX88'],
 'A0695568PX4DBZOQDN8': ['B003R7H5TC', 'B0073ESYZO'],
 'A0734719E2U9PZFCS116': ['B019OB663A'],
 'A0743345UFTOA4V1Z7W': ['B00IAVDPSA', 'B00DB9JYFY'],
 'A07716593573L93RJQ1E': ['B00E1L785E'],
 'A0815906MFVDTIFHGRQR': ['B000ND3XXA'],
 'A08761257GP04TJ0ROBS': ['B003O6E67A', 'B01EZAA2ZI'],
 'A08981

In [136]:
test_df[test_df['reviewerID'] == 'A0059486XI1Z0P98KP35']

Unnamed: 0,reviewerID,asin,overall,predicted_rating,already_rated
0,A0059486XI1Z0P98KP35,B012JMS4W2,5,,False


In [132]:
potential_recs['reviewerID'].nunique()

51595

In [None]:
# functions to calculate precision, recall, and f-measure

def calculate_precision_recall(user, recommendation_list, testing_data):
    testing_items = set(testing_data[testing_data['reviewerID'] == user]['asin'])
    recommended_items = set(recommendation_list.get(user, []))
    
    relevant_items = testing_items & recommended_items
    precision = len(relevant_items) / len(recommended_items) if recommended_items else 0
    recall = len(relevant_items) / len(testing_items) if testing_items else 0
    
    return precision, recall

def calculate_f_measure(precision, recall):
    if (precision + recall) > 0:
        return 2 * precision * recall / (precision + recall)
    else:
        return 0

In [None]:
# calculating precision, recall, and f-measure for data
# runs in about 21 mins

precision_scores = []
recall_scores = []
f_measure_scores = []

for user in test_df['reviewerID'].unique():
    precision, recall = calculate_precision_recall(user, rec_list, test_df)
    f_measure = calculate_f_measure(precision, recall)
   
    precision_scores.append(precision)
    recall_scores.append(recall)
    f_measure_scores.append(f_measure)

avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f_measure = np.mean(f_measure_scores)

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F-Measure: {avg_f_measure:.4f}")

Average Precision: 0.9343
Average Recall: 0.9054
Average F-Measure: 0.9155


In [None]:
# function to calculate ndcg

def calculate_ndcg(user, recommendation_list, testing_data, top_n=10):
    testing_items = set(testing_data[testing_data['reviewerID'] == user]['asin'])
    recommended_items = recommendation_list[user]
    
    dcg = 0
    for i, item in enumerate(recommended_items[:top_n]):
        rel = 1 if item in testing_items else 0
        dcg += rel / np.log2(i + 2)
    
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(testing_items), top_n)))

    if idcg > 0:
        return dcg / idcg
    else:
        return 0

In [None]:
# calculating ndcg (runs into error)

ndcg_scores = []

for user in user_ids:
    ndcg = calculate_ndcg(user, rec_list, test_df)
    ndcg_scores.append(ndcg)

avg_ndcg = np.mean(ndcg_scores)

print(f"Average NDCG: {avg_ndcg}")

KeyError: 'A0950144Z8AAFXJIXH4L'