In [4]:
import pandas as pd

# This will now work as long as the file is closed in Excel
df = pd.read_csv('amazon_products.csv/products.csv', nrows=1600)

# Cleaning logic (Milestone 1 requirement)
df_clean = df.dropna(subset=['reviews.username', 'reviews.rating'])
df_clean = df_clean.rename(columns={
    'reviews.username': 'userId', 
    'id': 'productId', 
    'reviews.rating': 'rating'
})

# Create and save the matrix so Milestone 2 can use it
user_item_matrix = df_clean.pivot_table(index='userId', columns='productId', values='rating')
user_item_matrix.to_csv('user_item_matrix.csv')

print("SUCCESS: File created. You can now move to Milestone 2 training.")

SUCCESS: File created. You can now move to Milestone 2 training.


In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# 1. Load the matrix (ensuring NaN are 0 for the math to work)
user_item_matrix_filled = pd.read_csv('user_item_matrix.csv', index_col=0).fillna(0)

# 2. Calculate the similarity between users
# This is the "Training" phase of your model
user_sim = cosine_similarity(user_item_matrix_filled)

# 3. Create a clean DataFrame of these similarity scores
user_sim_df = pd.DataFrame(user_sim, 
                           index=user_item_matrix_filled.index, 
                           columns=user_item_matrix_filled.index)

print("Model Training Complete! Similarity Matrix generated.")
user_sim_df.head()

Model Training Complete! Similarity Matrix generated.


userId,1-Apr,1215,1234,1soni,25Firefighter,5bros,7011,A. Dent Aragorn,A. Younan,A.C,...,toeka,ts120,txtech1997,unplug,vishal,wadas1989,wax0pal,william lombardo,wirelesssassyowner,zman
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-Apr,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1215,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1234,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1soni,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
25Firefighter,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [2]:
def get_recommendations(user_id, num_recommendations=5):
    # Find the user most similar to our target user (excluding themselves)
    similar_users = user_sim_df[user_id].sort_values(ascending=False)
    best_match_user = similar_users.index[1] 
    
    # Find products the best match user liked but the target user hasn't rated
    user_ratings = user_item_matrix_filled.loc[user_id]
    best_match_ratings = user_item_matrix_filled.loc[best_match_user]
    
    # Recommend items where target user has 0 but best match has a high rating
    recommendations = best_match_ratings[user_ratings == 0].sort_values(ascending=False)
    
    return recommendations.head(num_recommendations)

# TEST THE MODEL: Pick a random UserID from your matrix to see the results
sample_user = user_item_matrix_filled.index[0]
print(f"Top recommendations for User {sample_user}:")
print(get_recommendations(sample_user))

Top recommendations for User 1-Apr:
AV000tWuGV-KLJ3ac2-b    0.0
AV00l7jV-jtxr-f30lnX    0.0
AV00lzP7GV-KLJ3ac0uk    0.0
AV00lzd5GV-KLJ3ac0ul    0.0
AV1Nik13-jtxr-f31AFO    0.0
Name: 1-Apr, dtype: float64


In [3]:
# Calculate the average similarity score (excluding the 1.0 diagonal)
avg_sim = (user_sim_df.values.sum() - len(user_sim_df)) / (len(user_sim_df)**2 - len(user_sim_df))
print(f"Initial Benchmark - Average User Similarity Score: {avg_sim:.4f}")

Initial Benchmark - Average User Similarity Score: 0.3973
