**Load and Preprocess Data**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = '/content/amazon.csv'
data = pd.read_csv(file_path)

# Inspect the raw data
print(data.head())

# Basic preprocessing.
# Convert 'rating' to numeric BEFORE dropping missing values: errors='coerce'
# turns unparseable ratings into NaN, and those rows have to be removed too.
# (Dropping first would let the freshly created NaN ratings slip through and
# end up as all-zero rows/columns in the interaction matrix.)
# NOTE(review): the printed user_id values look like several comma-separated
# reviewer IDs per row; each distinct string is treated as one "user" below --
# confirm that this is intended.
data = (
    data
    .assign(rating=pd.to_numeric(data['rating'], errors='coerce'))
    .dropna(subset=['user_id', 'product_id', 'rating'])
)

# Encode user_id and product_id to numerical values
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

# .assign builds a new frame instead of writing into the filtered one
data = data.assign(
    user_id=user_encoder.fit_transform(data['user_id']),
    product_id=product_encoder.fit_transform(data['product_id']),
)

# Ensure unique user-product pairs by averaging duplicate ratings
data = data.groupby(['user_id', 'product_id']).rating.mean().reset_index()

# Create user-item interaction matrix (rows: users, columns: products);
# a missing pair is filled with 0, i.e. "not rated"
user_item_matrix = data.pivot(index='user_id', columns='product_id', values='rating').fillna(0)
print(user_item_matrix.head())

   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%  

**Calculate Similarities**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Item-item cosine similarity: transpose so that each product becomes a row,
# then label both axes of the result with the encoded product ids.
item_similarity = cosine_similarity(user_item_matrix.transpose())
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    columns=user_item_matrix.columns,
    index=user_item_matrix.columns,
)

# User-user cosine similarity: rows of the interaction matrix are already
# users, so no transpose is needed; both axes carry the encoded user ids.
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    columns=user_item_matrix.index,
    index=user_item_matrix.index,
)


**Generate Recommendations**

In [None]:
# User-based recommendation
def recommend_items_user_based(user_id, user_similarity_df, user_item_matrix, product_encoder, num_recommendations=5):
    """Recommend unrated products to a user from similar users' ratings.

    Each candidate product is scored with the similarity-weighted average of
    the ratings given by all *other* users, so users who resemble the target
    user contribute more. When every similarity weight is zero (e.g. a user
    without any overlap), the plain average over the other users is used.

    Args:
        user_id: Label of the target user in ``user_similarity_df`` /
            ``user_item_matrix``.
        user_similarity_df: Square user-by-user similarity DataFrame.
        user_item_matrix: User-by-product rating DataFrame; a value of 0 (or
            less) is treated as "not rated".
        product_encoder: Object exposing ``inverse_transform`` that maps the
            matrix's product labels back to the original product ids.
        num_recommendations: Maximum number of products to return.

    Returns:
        ``[]`` when ``user_id`` is unknown; otherwise whatever
        ``product_encoder.inverse_transform`` returns for the selected
        product labels, best score first.
    """
    if user_id not in user_similarity_df.index:
        return []

    # Remove the target user by label. Relying on "first after sorting" is
    # wrong when another user ties on similarity or when the user's own
    # similarity is 0 (all-zero rating row).
    neighbour_weights = user_similarity_df[user_id].drop(user_id)
    neighbour_ratings = user_item_matrix.loc[neighbour_weights.index]

    weight_total = neighbour_weights.abs().sum()
    if weight_total > 0:
        # Similarity-weighted mean rating per product.
        item_scores = neighbour_ratings.mul(neighbour_weights, axis=0).sum(axis=0) / weight_total
    else:
        # No usable similarity signal: fall back to the unweighted mean.
        item_scores = neighbour_ratings.mean(axis=0)

    # With no neighbours at all the mean is NaN; such products carry no score.
    item_scores = item_scores.dropna()

    user_row = user_item_matrix.loc[user_id]
    already_rated = user_row[user_row > 0].index

    # mergesort is stable, so equally scored products keep column order.
    recommendations = (
        item_scores
        .drop(already_rated, errors='ignore')
        .sort_values(ascending=False, kind='mergesort')
        .head(num_recommendations)
    )
    return product_encoder.inverse_transform(recommendations.index)

# Generate recommendations for all users.
# Decode every encoded user id back to its original label in ONE call;
# calling inverse_transform once per user inside the loop repeats the
# encoder's validation work for every row.
decoded_user_ids = user_encoder.inverse_transform(user_item_matrix.index.to_numpy())

recommendations_dict = {'user_id': list(decoded_user_ids), 'recommendations': []}

# user_item_matrix.index is iterated in the same order that was decoded above,
# so the two columns stay aligned row by row.
for user_id in user_item_matrix.index:
    user_recommendations = recommend_items_user_based(user_id, user_similarity_df, user_item_matrix, product_encoder)
    recommendations_dict['recommendations'].append(user_recommendations)

# Convert the recommendations dictionary to a DataFrame
recommendations_df = pd.DataFrame(recommendations_dict)

# Display the recommendations table
print(recommendations_df)


                                                user_id  \
0     AE22Y3KIS7SE6LI3HE2VS6WWPU4Q,AHWEYO2IJ5I5GDWZA...   
1     AE23RS3W7GZO7LHYKJU6KSKVM4MQ,AEQUNEY6GQOTEGUMS...   
2                          AE242TR3GQ6TYC6W4SJ5UYYKBTYQ   
3     AE27UOZENYSWCQVQRRUQIV2ZM7VA,AGMYSLV6NNOAYES25...   
4     AE2JTMRKTUOIVIZWS2WDGTMNTU4Q,AF4QXCB32VC2DVE7O...   
...                                                 ...   
1189  AHZFKWGDBRQKNMNQ4ZPL52OZBRKA,AGBEFVJFOQIRF7C7K...   
1190  AHZJHJWFZLYD64GVP4PXVI2F4LXA,AEUCRZPOISXKHXMCZ...   
1191  AHZNSNBVKQR4OGJAQHE4DCDA4YHA,AFBW6COTZXGHQMWVD...   
1192  AHZWJCVEIEI76H2VGMUSN5D735IQ,AH2DFUHFTG4CKQFVG...   
1193  AHZWXUWE3RGLDH4JJUK3HT3VMBJA,AFWUWJMEO4IQEMHKM...   

                                        recommendations  
0     [B09MT84WV5, B077Z65HSD, B09YV4MW2T, B09ZQK9X8...  
1     [B09MT84WV5, B077Z65HSD, B09YV4MW2T, B08WRBG3X...  
2     [B09MT84WV5, B077Z65HSD, B09YV4MW2T, B08WRBG3X...  
3     [B09MT84WV5, B077Z65HSD, B09YV4MW2T, B08WRBG3X...  
4

In [None]:
# Preview the top five rows of the per-user recommendations table
recommendations_df.head(n=5)


Unnamed: 0,user_id,recommendations
0,"AE22Y3KIS7SE6LI3HE2VS6WWPU4Q,AHWEYO2IJ5I5GDWZA...","[B09MT84WV5, B077Z65HSD, B09YV4MW2T, B09ZQK9X8..."
1,"AE23RS3W7GZO7LHYKJU6KSKVM4MQ,AEQUNEY6GQOTEGUMS...","[B09MT84WV5, B077Z65HSD, B09YV4MW2T, B08WRBG3X..."
2,AE242TR3GQ6TYC6W4SJ5UYYKBTYQ,"[B09MT84WV5, B077Z65HSD, B09YV4MW2T, B08WRBG3X..."
3,"AE27UOZENYSWCQVQRRUQIV2ZM7VA,AGMYSLV6NNOAYES25...","[B09MT84WV5, B077Z65HSD, B09YV4MW2T, B08WRBG3X..."
4,"AE2JTMRKTUOIVIZWS2WDGTMNTU4Q,AF4QXCB32VC2DVE7O...","[B09MT84WV5, B077Z65HSD, B09YV4MW2T, B09ZQK9X8..."


