# Data Science Final Project

## Part 1: Data Selection and Preprocessing

In [20]:
#Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#### Loading Amazon Industrial and Scientific dataset

In [None]:
sci_df = pd.read_json('Industrial_and_Scientific_5.json', lines=True)
sci_df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"11 27, 2017",A1JB7HFWHRYHT7,B0000223SI,{'Size:': ' 1-(Pack)'},Alex W.,This worked really well for what I used it for...,Couldn't have been happier with it's performance,1511740800,,
1,5,True,"11 4, 2017",A2FCLJG5GV8SD6,B0000223SI,{'Size:': ' 1-(Pack)'},Randall Harris,Fast cutting and good adheasive.,Good paper.,1509753600,,
2,5,False,"10 27, 2017",A3IT9B33NWYQSL,B0000223SI,{'Size:': ' 1-(Pack)'},A. C.,Worked great for my lapping bench. I would li...,Handy!,1509062400,,
3,4,True,"01 13, 2018",AUL5LCV4TT73P,B0000223SK,{'Size:': ' 1-Pack'},TnT,As advertised,As advertised,1515801600,,
4,5,True,"10 7, 2017",A1V3I3L5JKO7TM,B0000223SK,{'Size:': ' 1-Pack'},John Jones,seems like a pretty good value as opposed to b...,seems like a pretty good value as opposed to b...,1507334400,,
...,...,...,...,...,...,...,...,...,...,...,...,...
77066,5,True,"12 19, 2017",A1UZ9AVZFWZS1A,B01HCVJ3K2,,Kindle Customer,So far it has worked like a champ. Great solut...,I recommend it.,1513641600,,
77067,5,True,"12 17, 2017",A1PMSQXD43WIS4,B01HCVJ3K2,,H. Arnold,Great quality solid state relay. I used this s...,Great quality solid state relay,1513468800,,[https://images-na.ssl-images-amazon.com/image...
77068,5,True,"09 20, 2018",A225WHD7XZVIXL,B01HEQVQAK,,John A. Schroeder,Came with everything needed to install in my M...,Exactly as described,1537401600,,
77069,5,True,"09 17, 2018",A3T05FOORNQI18,B01HEQVQAK,,Old,Installed a month ago in my Monoprice Maker Se...,Works Great,1537142400,,


In [None]:
sci_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77071 entries, 0 to 77070
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   overall         77071 non-null  int64 
 1   verified        77071 non-null  bool  
 2   reviewTime      77071 non-null  object
 3   reviewerID      77071 non-null  object
 4   asin            77071 non-null  object
 5   style           36037 non-null  object
 6   reviewerName    77044 non-null  object
 7   reviewText      77060 non-null  object
 8   summary         77061 non-null  object
 9   unixReviewTime  77071 non-null  int64 
 10  vote            9620 non-null   object
 11  image           1719 non-null   object
dtypes: bool(1), int64(2), object(9)
memory usage: 6.5+ MB


In [None]:
# Creating training/testing datasets (Should run in about 30-40 secs)

train_list = []
test_list = []

for user, user_ratings in sci_df.groupby('reviewerID'):
    train = user_ratings.sample(frac=0.8)
    test = user_ratings.drop(train.index)
    train_list.append(train)
    test_list.append(test)
    

train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

In [None]:
# Check for 80/20 output
print(f'train: {round(train_df.size/sci_df.size*100,2)}')
print(f'test: {round(test_df.size/sci_df.size*100,2)}')

train: 80.84
test: 19.16


## Part 2: Rating Prediction

In [27]:
# Create user-item matrix (rows: user, cols: product, values: ratings)
# Should run in 2-3 mins
train_df = train_df[['reviewerID', 'asin', 'overall']]
test_df = test_df[['reviewerID', 'asin', 'overall']]

user_item_matrix = train_df.pivot_table(
    index='reviewerID',
    columns = 'asin',
    values = 'overall'
)

prod_ids = user_item_matrix.columns
user_ids = user_item_matrix.index

In [28]:
user_item_matrix

asin,B0000223SI,B0000223SK,B0000223UV,B00002246J,B0000224J0,B0000224MY,B0000225HB,B0000225HD,B0000225IO,B00002N6FE,...,B01H6J5QYC,B01HB6AOFG,B01HBPHSII,B01HBZYFT8,B01HCFJC0Y,B01HCQSHNG,B01HCVJ3K2,B01HDXZR5E,B01HDYEAOW,B01HEQVQAK
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0096681Y127OL1H8W3U,,,,,,,,,,,...,,,,,,,,,,
A0196552RI15HI7JB9PW,,,,,,,,,,,...,,,,,,,,,,
A0289048PRWFY7ZXQKCD,,,,,,,,,,,...,,,,,,,,,,
A0455940O5EUXQDU46QL,,,,,,,,,,,...,,,,,,,,,,
A07936821FOVJO6NP4Q8,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZXS6P5QWNMLC,,,,,,,,,,,...,,,,,,,,,,
AZY0M1ANDSEPL,,,,,,,,,,,...,,,,,,,,,,
AZYIBG912W011,,,,,,,,,,,...,,,,,,,,,,
AZYPAWSYSCISH,,,,,,,,,,,...,,,,,,,,,,


In [29]:
# Create item similarity matrix (calculates pairwise cosine similarity between each product)
# Should run in 3-4 mins
prod_sim = cosine_similarity(user_item_matrix.fillna(0).T)
prod_sim = pd.DataFrame(prod_sim, index=prod_ids, columns= prod_ids)

In [30]:
prod_sim

asin,B0000223SI,B0000223SK,B0000223UV,B00002246J,B0000224J0,B0000224MY,B0000225HB,B0000225HD,B0000225IO,B00002N6FE,...,B01H6J5QYC,B01HB6AOFG,B01HBPHSII,B01HBZYFT8,B01HCFJC0Y,B01HCQSHNG,B01HCVJ3K2,B01HDXZR5E,B01HDYEAOW,B01HEQVQAK
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0000223SI,1.000000,0.317021,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B0000223SK,0.317021,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B0000223UV,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,0.104329,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B00002246J,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
B0000224J0,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01HCQSHNG,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,0.000000,0.0,0.0,0.000000
B01HCVJ3K2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.068752,0.0,0.0,0.0,1.000000,0.0,0.0,0.072373
B01HDXZR5E,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.000000
B01HDYEAOW,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.0,0.000000


In [31]:
# should run in about 1-2 mins

global_mean = user_item_matrix.mean().mean()
print(global_mean)

4.504846159509748


In [32]:
product_means = user_item_matrix.mean(axis=0)
product_means

asin
B0000223SI    5.000000
B0000223SK    4.666667
B0000223UV    4.586207
B00002246J    4.555556
B0000224J0    4.916667
                ...   
B01HCQSHNG    3.500000
B01HCVJ3K2    4.952381
B01HDXZR5E    4.800000
B01HDYEAOW    5.000000
B01HEQVQAK    4.571429
Length: 5330, dtype: float64

In [33]:
# function to create prediction scores
def predict_rating(user, prod, user_item_matrix, prod_sim, global_mean, product_means):

    if prod not in prod_sim.index:
        return global_mean

    user_ratings = user_item_matrix.loc[user]
    
    rated_items = user_ratings[user_ratings > 0].index
    
    numerator = 0
    denominator = 0
    for rated_item in rated_items:
        sim = prod_sim.loc[prod, rated_item]
        numerator += sim * user_ratings[rated_item]
        denominator += abs(sim)

    if denominator == 0:
        # return user_item_matrix[prod].mean()
        return product_means.get(prod, global_mean)
    
    return numerator / denominator

In [34]:
# creates predicted rating column in test_df using predict_rating function
# should run in 1-2 mins

test_df['predicted_rating'] = test_df.apply(lambda row: predict_rating(row['reviewerID'], row['asin'], user_item_matrix, prod_sim, global_mean, product_means),axis=1)

In [35]:
test_df

Unnamed: 0,reviewerID,asin,overall,predicted_rating
0,A0096681Y127OL1H8W3U,B00E8JOCOE,5,4.750000
1,A0196552RI15HI7JB9PW,B00598DJDI,5,5.000000
2,A0289048PRWFY7ZXQKCD,B00LORGK7U,4,4.000000
3,A0289048PRWFY7ZXQKCD,B0094WJIRG,4,5.000000
4,A0455940O5EUXQDU46QL,B001DZDCNS,3,4.666667
...,...,...,...,...
14759,AZY0M1ANDSEPL,B00MB3CV6K,5,5.000000
14760,AZYIBG912W011,B000LWXB44,5,4.681818
14761,AZYPAWSYSCISH,B00F05UI8O,5,4.625000
14762,AZYPAWSYSCISH,B00SIQ1DHM,5,4.333333


In [36]:
# calculates RMSE and MAE for test data based on predictions
from sklearn.metrics import mean_squared_error, mean_absolute_error

valid_predictions = test_df.dropna(subset=['predicted_rating'])

actual_ratings = valid_predictions['overall']
predicted_ratings = valid_predictions['predicted_rating']

rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

mae = mean_absolute_error(actual_ratings, predicted_ratings)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


RMSE: 0.9438679156449279
MAE: 0.5170164861258911


## Part 3: Item Recommendation

In [73]:
import numpy as np

# Convert user-item matrix to a NumPy array
user_item_array = user_item_matrix.fillna(0).values

# Compute SVD manually using NumPy's linear algebra functions
U, sigma, Vt = np.linalg.svd(user_item_array, full_matrices=False)

# Reduce the dimensionality by selecting the top-k singular values
k = 400  # Number of latent factors
U_k = U[:, :k]  # Top-k user features
sigma_k = np.diag(sigma[:k])  # Top-k singular values
Vt_k = Vt[:k, :]  # Top-k item features


In [74]:
# Reconstruct the user-item matrix
reconstructed_ratings = np.dot(np.dot(U_k, sigma_k), Vt_k)

# Convert the reconstructed matrix back to a DataFrame for easy indexing
predicted_ratings_df = pd.DataFrame(
    reconstructed_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)


In [75]:
def get_top_10_recommendations(user, predicted_ratings_df, user_rated_items, full_catalog):
    # Items the user has already rated
    already_rated = user_rated_items.get(user, set())
    
    # Items the user hasn't rated
    unrated_items = list(full_catalog - already_rated)
    
    # Get predicted ratings for unrated items
    user_predictions = predicted_ratings_df.loc[user, unrated_items]
    
    # Rank items by predicted rating and select the top 10
    top_10_items = user_predictions.sort_values(ascending=False).head(10).index.tolist()
    
    return top_10_items


In [76]:
# Full catalog of items
full_catalog = set(user_item_matrix.columns)

# Items rated by each user
user_rated_items = train_df.groupby('reviewerID')['asin'].apply(set).to_dict()

# Generate recommendations for all users
recommendation_list = {}
for user in user_item_matrix.index:
    recommendation_list[user] = get_top_10_recommendations(
        user, 
        predicted_ratings_df, 
        user_rated_items, 
        full_catalog
    )

# Output recommendation list
print(recommendation_list)


{'A0096681Y127OL1H8W3U': ['B000BT32UA', 'B00WW4H8XY', 'B00I51SJOA', 'B01D8E6MJC', 'B00F3IXF9M', 'B006UKIABA', 'B019FSPV88', 'B016SH9ITC', 'B01GDC4GQM', 'B00002ND4I'], 'A0196552RI15HI7JB9PW': ['B00006IC2O', 'B014ONCP88', 'B0050GW73C', 'B00GYS1SXU', 'B006UKICQS', 'B00HWRO744', 'B004MUFHUM', 'B000K7GREQ', 'B0013RVDG4', 'B008VIHFSI'], 'A0289048PRWFY7ZXQKCD': ['B004EXMS4U', 'B007LQC3Q0', 'B002PMSSC8', 'B000HE5DUQ', 'B00OXPE8U6', 'B000BQKDPA', 'B00ZBCLJSY', 'B000UKUHXK', 'B0061OT1A4', 'B00004WCCP'], 'A0455940O5EUXQDU46QL': ['B00YSOZFQI', 'B00YQBBZ8A', 'B00VTVWJOE', 'B01E1HANLS', 'B00J0H4EVU', 'B00EFOQEUM', 'B00J0GQ2OS', 'B00W8FL5BS', 'B00F839VNQ', 'B008DS266M'], 'A07936821FOVJO6NP4Q8': ['B000BODTKI', 'B00E8JPEMS', 'B000LWXB44', 'B00G9IEMJM', 'B0008G1Z32', 'B01A7AVQKA', 'B001NSYSSI', 'B00E8JST6Q', 'B01DPS8DPM', 'B01ETMVQOI'], 'A0996171TZQWTHFU5YKX': ['B00HWROGHW', 'B003ULZGFU', 'B00004TBJD', 'B00O2LLT30', 'B000BO9CJ0', 'B00OAAUAX8', 'B007QAJUUS', 'B016Q6T7Q4', 'B002GQ5AKG', 'B001HETINI'], 'A1

In [71]:
for key in recommendation_list:
    print(f'Key: {key}, Recs: {recommendation_list[key]}')

Key: A0096681Y127OL1H8W3U, Recs: ['B000BT32UA', 'B00WW4H8XY', 'B00I51SJOA', 'B01D8E6MJC', 'B00F3IXF9M', 'B006UKIABA', 'B019FSPV88', 'B016SH9ITC', 'B01GDC4GQM', 'B00002ND4I']
Key: A0196552RI15HI7JB9PW, Recs: ['B00006IC2O', 'B014ONCP88', 'B0050GW73C', 'B00GYS1SXU', 'B006UKICQS', 'B00HWRO744', 'B004MUFHUM', 'B000K7GREQ', 'B0013RVDG4', 'B008VIHFSI']
Key: A0289048PRWFY7ZXQKCD, Recs: ['B004EXMS4U', 'B007LQC3Q0', 'B002PMSSC8', 'B000HE5DUQ', 'B00OXPE8U6', 'B000BQKDPA', 'B00ZBCLJSY', 'B000UKUHXK', 'B0061OT1A4', 'B00004WCCP']
Key: A0455940O5EUXQDU46QL, Recs: ['B00YSOZFQI', 'B00YQBBZ8A', 'B00VTVWJOE', 'B01E1HANLS', 'B00J0H4EVU', 'B00EFOQEUM', 'B00J0GQ2OS', 'B00W8FL5BS', 'B00F839VNQ', 'B008DS266M']
Key: A07936821FOVJO6NP4Q8, Recs: ['B000BODTKI', 'B00E8JPEMS', 'B000LWXB44', 'B00G9IEMJM', 'B0008G1Z32', 'B01A7AVQKA', 'B001NSYSSI', 'B00E8JST6Q', 'B01DPS8DPM', 'B01ETMVQOI']
Key: A0996171TZQWTHFU5YKX, Recs: ['B00HWROGHW', 'B003ULZGFU', 'B00004TBJD', 'B00O2LLT30', 'B000BO9CJ0', 'B00OAAUAX8', 'B007QAJUUS'

In [37]:
# Step 1: Full catalog of items
full_catalog = set(test_df['asin'].unique()).union(set(train_df['asin'].unique()))

# Items rated by each user in training data
user_rated_items = train_df.groupby('reviewerID')['asin'].apply(set).to_dict()

# Precompute predicted ratings from test_df for faster lookups
predicted_ratings = {(row['reviewerID'], row['asin']): row['predicted_rating'] for _, row in test_df.iterrows()}

# Function to predict ratings for unrated items and get top-10 recommendations
def get_top_10_recommendations(user, user_rated_items, full_catalog, predicted_ratings, user_item_matrix, prod_sim, global_mean, product_means):
    # Items the user has already rated
    already_rated = user_rated_items.get(user, set())
    
    # Items the user hasn't rated
    unrated_items = full_catalog - already_rated
    
    # Predict ratings for unrated items
    user_predictions = []
    for item in unrated_items:
        # Check if precomputed rating exists
        if (user, item) in predicted_ratings:
            predicted_rating = predicted_ratings[(user, item)]
        else:
            # Dynamically compute the rating
            predicted_rating = predict_rating(user, item, user_item_matrix, prod_sim, global_mean, product_means)

        # Add to predictions if rating is 5 stars        
        if predicted_rating == 5.0:
            user_predictions.append((item, predicted_rating))
        
        # Stop if 10 items with a rating of 5 are found
        if len(user_predictions) >= 10:
            break

    # If fewer than 10 items rated as 5 stars are found, fill the rest with the top-rated items
    # if len(user_predictions) < 10:
    #     # Continue predicting and adding other items if needed
    #     for item in unrated_items:
    #         if item not in [x[0] for x in user_predictions]:  # Avoid duplicates
    #             if (user, item) in predicted_ratings:
    #                 predicted_rating = predicted_ratings[(user, item)]
    #             else:
    #                 predicted_rating = predict_rating(user, item, user_item_matrix, prod_sim, global_mean, product_means)
    #             user_predictions.append((item, predicted_rating))
    #             if len(user_predictions) >= 10:
    #                 break        
        
    
    # Rank items by predicted ratings
    ranked_items = sorted(user_predictions, key=lambda x: x[1], reverse=True)
    
    # Return the top 10 items
    return [item for item, _ in ranked_items[:10]]


In [38]:
# Step 2: Generate top-10 recommendations for each user
recommendation_list = {}
for user in train_df['reviewerID'].unique():
    recommendation_list[user] = get_top_10_recommendations(user, user_rated_items, full_catalog, predicted_ratings, user_item_matrix, prod_sim, global_mean, product_means)

# recommendation_list is a dictionary: {user_id: [top 10 recommendations]}

In [40]:
for key in recommendation_list:
    print(f'Key: {key}, Recs: {recommendation_list[key]}')

Key: A0096681Y127OL1H8W3U, Recs: ['B00168264S', 'B0095N2SY4', 'B00AFCH5XS', 'B00FXK6O4S', 'B00ITPHXZI', 'B00AAME8W4', 'B01GJKZZYU', 'B0028TV2DA', 'B007YBEHIO', 'B0018MGRQU']
Key: A0196552RI15HI7JB9PW, Recs: ['B00168264S', 'B000MOI9G6', 'B0095N2SY4', 'B00AFCH5XS', 'B00FXK6O4S', 'B00ITPHXZI', 'B00AAME8W4', 'B01GJKZZYU', 'B007YBEHIO', 'B00ZSDDJ4S']
Key: A0289048PRWFY7ZXQKCD, Recs: ['B00168264S', 'B0095N2SY4', 'B00AFCH5XS', 'B00FXK6O4S', 'B00ITPHXZI', 'B00AAME8W4', 'B01GJKZZYU', 'B00N76TDO8', 'B0035FZU2U', 'B00X77L548']
Key: A0455940O5EUXQDU46QL, Recs: ['B00168264S', 'B0095N2SY4', 'B00AFCH5XS', 'B00FXK6O4S', 'B00ITPHXZI', 'B00AAME8W4', 'B01GJKZZYU', 'B00ZSDDJ4S', 'B00YQBDPF6', 'B00N76TDO8']
Key: A07936821FOVJO6NP4Q8, Recs: ['B00168264S', 'B0095N2SY4', 'B00AFCH5XS', 'B00FXK6O4S', 'B00ITPHXZI', 'B00AAME8W4', 'B01GJKZZYU', 'B00DBS0N0A', 'B00ZSDDJ4S', 'B00N76TDO8']
Key: A0996171TZQWTHFU5YKX, Recs: ['B00168264S', 'B004N8OSLC', 'B00JFPF0UQ', 'B000MOI9G6', 'B0013CFNKG', 'B0095N2SY4', 'B00AFCH5XS'

In [22]:
# creates recommendation list for users (in-progress)
# rec_list is missing about 4000 users (lost when potential_recs is created)
# other issue: rec_list does not provide 10 recommendations for every user (looks like it just takes top 10 predicted ratings from test_df)
    # some users have < 10 products in test_df

user_rated_items = train_df.groupby('reviewerID')['asin'].apply(set).to_dict()

test_df['already_rated'] = test_df.apply(lambda row: row['asin'] in user_rated_items.get(row['reviewerID'], set()), axis=1)

potential_recs = test_df[~test_df['already_rated']]

def top_ten(group):
    return group.sort_values(by='predicted_rating', ascending=False).head(10)['asin'].tolist()

rec_list = (
    potential_recs.groupby('reviewerID')
    .apply(top_ten)
    .to_dict()
)

In [47]:
len(recommendation_list)

11041

In [24]:
rec_list

{'A0059486XI1Z0P98KP35': ['B012JMS4W2'],
 'A0220159ZRNBTRKLG08H': ['B00LSBNSJA'],
 'A0266076X6KPZ6CCHGVS': ['B000046S41', 'B0053BG1EU', 'B0073J8BYS'],
 'A0277912HT4JSJKVSL3E': ['B00CX9T598', 'B00KY1HZ2G'],
 'A02836981FYG9912C66F': ['B01GD490UM'],
 'A0331487QH9BVBTYGNCV': ['B01B61X1UC', 'B003WY86NE'],
 'A03622674FSWUX0PXPAK': ['B00KWG4HG0'],
 'A0380485C177Q6QQNJIX': ['B001IKC3QA',
  'B001UWMZZ0',
  'B00BGA9YZK',
  'B000FQ2D5E'],
 'A0385843DE41TCVXH2I1': ['B009157JPO'],
 'A0429448FT6MHSF724JB': ['B00DB9JV5W'],
 'A0435554Z2P98AIGLNCS': ['B00KBZHSVI'],
 'A049248150WLX2UGA57G': ['B004RJJMBI'],
 'A0667676ELTQ9GS4VZHH': ['B01GW8ZA9Y'],
 'A0685888WB02Q69S553P': ['B0002CHIUQ', 'B0050SXLQC', 'B00G6MW4SM'],
 'A0695568PX4DBZOQDN8': ['B0074LJ3CE', 'B00HK74G2E'],
 'A0734719E2U9PZFCS116': ['B00HTK1NCS'],
 'A0743345UFTOA4V1Z7W': ['B00CXCCB64', 'B00DB9JYFY'],
 'A07716593573L93RJQ1E': ['B00KBZHTBC'],
 'A0815906MFVDTIFHGRQR': ['B00ATF5YY8'],
 'A08761257GP04TJ0ROBS': ['B00GJSUUC0', 'B00CQOG8LI'],
 'A08981

In [25]:
test_df[test_df['reviewerID'] == 'A0059486XI1Z0P98KP35']

Unnamed: 0,reviewerID,asin,overall,predicted_rating,already_rated
0,A0059486XI1Z0P98KP35,B012JMS4W2,5,4.714286,False


In [26]:
potential_recs['reviewerID'].nunique()

51672

In [42]:
# functions to calculate precision, recall, and f-measure

def calculate_precision_recall(user, recommendation_list, testing_data):
    testing_items = set(testing_data[testing_data['reviewerID'] == user]['asin'])
    recommended_items = set(recommendation_list.get(user, []))
    
    relevant_items = testing_items & recommended_items
    precision = len(relevant_items) / len(recommended_items) if recommended_items else 0
    recall = len(relevant_items) / len(testing_items) if testing_items else 0
    
    return precision, recall

def calculate_f_measure(precision, recall):
    if (precision + recall) > 0:
        return 2 * precision * recall / (precision + recall)
    else:
        return 0

In [77]:
from sklearn.metrics import ndcg_score

def calculate_metrics(test_data, recommendations, top_n=10):
    """
    Calculate Precision, Recall, F-measure, and NDCG for a recommendation system.
    
    Parameters:
    - test_data: DataFrame with columns ['reviewerID', 'asin'], representing actual purchases or interactions in the testing set.
    - recommendations: Dict with user IDs as keys and lists of recommended items as values.
    - top_n: Number of items in the recommendation list to consider.
    
    Returns:
    - metrics: Dictionary with overall precision, recall, F-measure, and NDCG.
    """
    precision_list = []
    recall_list = []
    ndcg_list = []
    
    # Group test data by user for quick lookup
    test_data_grouped = test_data.groupby('reviewerID')['asin'].apply(set).to_dict()

    for user, recommended_items in recommendations.items():
        # Get the actual items from the test set for this user
        actual_items = test_data_grouped.get(user, set())
        
        if not actual_items:  # Skip users without testing data
            continue
        
        # Convert recommended items to a set for intersection calculation
        recommended_set = set(recommended_items[:top_n])
        
        # Calculate Precision and Recall for this user
        relevant_items = recommended_set.intersection(actual_items)
        precision = len(relevant_items) / top_n
        recall = len(relevant_items) / len(actual_items)
        
        # Calculate NDCG
        # Convert actual_items into relevance scores
        relevance = [1 if item in actual_items else 0 for item in recommended_items[:top_n]]
        ndcg = ndcg_score([relevance], [list(range(len(relevance), 0, -1))])  # Ideal ranking assumed
        
        # Store metrics for this user
        precision_list.append(precision)
        recall_list.append(recall)
        ndcg_list.append(ndcg)
    
    # Calculate F-measure
    avg_precision = np.mean(precision_list) if precision_list else 0
    avg_recall = np.mean(recall_list) if recall_list else 0
    f_measure = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
    avg_ndcg = np.mean(ndcg_list) if ndcg_list else 0
    
    return {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F-measure": f_measure,
        "NDCG": avg_ndcg
    }

# Example Usage
metrics = calculate_metrics(test_df, recommendation_list, top_n=10)

# Display metrics
print("Evaluation Metrics:")
print(f"Precision: {metrics['Precision']:.4f}")
print(f"Recall: {metrics['Recall']:.4f}")
print(f"F-measure: {metrics['F-measure']:.4f}")
print(f"NDCG: {metrics['NDCG']:.4f}")

Evaluation Metrics:
Precision: 0.0101
Recall: 0.0762
F-measure: 0.0179
NDCG: 0.0713


In [46]:
# calculating precision, recall, and f-measure for data
# runs in about 21 mins

precision_scores = []
recall_scores = []
f_measure_scores = []

for user in test_df['reviewerID'].unique():
    precision, recall = calculate_precision_recall(user, recommendation_list, test_df)
    f_measure = calculate_f_measure(precision, recall)
   
    precision_scores.append(precision)
    recall_scores.append(recall)
    f_measure_scores.append(f_measure)

avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f_measure = np.mean(f_measure_scores)

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F-Measure: {avg_f_measure:.4f}")

Average Precision: 0.0003
Average Recall: 0.0020
Average F-Measure: 0.0005


In [57]:
# function to calculate ndcg

def calculate_ndcg(user, recommendation_list, testing_data, top_n=10):
    testing_items = set(testing_data[testing_data['reviewerID'] == user]['asin'])
    recommended_items = recommendation_list[user]
    
    dcg = 0
    for i, item in enumerate(recommended_items[:top_n]):
        rel = 1 if item in testing_items else 0
        dcg += rel / np.log2(i + 2)
    
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(testing_items), top_n)))

    if idcg > 0:
        return dcg / idcg
    else:
        return 0

In [59]:
# calculating ndcg (runs into error)

ndcg_scores = []

for user in user_ids:
    ndcg = calculate_ndcg(user, recommendation_list, test_df)
    ndcg_scores.append(ndcg)

avg_ndcg = np.mean(ndcg_scores)

print(f"Average NDCG: {avg_ndcg}")

Average NDCG: 0.00042477805154424967


In [67]:
# product_means.get(prod, global_mean)
rated_5 = []
for prod, value in product_means.items():
    if value == 5:
        rated_5.append((prod,value))
        # print(prod, value)

len(rated_5)
    

775