In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
%matplotlib inline

In [45]:
df = pd.read_csv('Dummy_Detergent_Dataset.csv')

In [46]:
df.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,ProductCategory
0,User916,D021,4,2024-11-07 19:23:00,Heavy-duty
1,User456,D027,1,2024-11-25 22:06:00,Budget
2,User490,D011,4,2024-11-10 20:50:00,Scented
3,User371,D021,3,2024-11-13 13:48:00,Organic
4,User655,D042,1,2024-11-19 16:03:00,Budget


In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder for UserId only
user_encoder = LabelEncoder()

# Fit the encoder to the UserId column
df['UserId'] = user_encoder.fit_transform(df['UserId'])


In [12]:
df.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,ProductCategory
0,583,D021,4,2024-11-07 19:23:00,Heavy-duty
1,297,D027,1,2024-11-25 22:06:00,Budget
2,318,D011,4,2024-11-10 20:50:00,Scented
3,242,D021,3,2024-11-13 13:48:00,Organic
4,426,D042,1,2024-11-19 16:03:00,Budget


In [13]:
# Deleting the row since it is only 3 null value

df.dropna(inplace=True)

In [14]:
def recommend_products(user_id, user_similarity, user_item_matrix):
    # Get the products that the user has not rated yet.
    unrated_products = user_item_matrix.loc[user_id, :].isna()
    unrated_products = unrated_products.index.values

    # Get the top-k similar users.
    k = 5
    similar_users = user_similarity[user_id].argsort()[-k:][::-1]

    # Get the ratings of the unrated products by the similar users.
    similar_user_ratings = user_item_matrix.loc[similar_users, unrated_products]

    # Calculate the weighted average of the ratings.
    recommended_ratings = similar_user_ratings.mean(axis=0)

    # Sort the recommended ratings by descending order.
    recommended_ratings = np.array(recommended_ratings).argsort()[::-1]

    # Return the top-k recommended products.
    return unrated_products[recommended_ratings[:k]]

In [15]:
df= df.sort_values(by = 'UserId', ascending = True)
df.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,ProductCategory
142,0,D019,1,2024-11-27 20:34:00,Natural
134,0,D007,5,2024-11-01 17:15:00,Scented
926,0,D040,3,2024-11-30 12:17:00,Anti-bacterial
916,0,D049,3,2024-11-15 12:42:00,Scented
879,1,D046,3,2024-11-08 17:44:00,Natural


In [16]:
user_item_matrix = df.pivot_table(index='UserId', columns='ProductId', values='Rating', fill_value=0)
user_item_matrix

ProductId,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D041,D042,D043,D044,D045,D046,D047,D048,D049,D050
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,5.0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
1,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
2,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
622,0,0,0,0,0,0,0.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
623,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
624,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,5,0,0,0,0,0,0


In [17]:
len(df['ProductId'].unique())

50

In [18]:
user_item_matrix = df.pivot_table(index='UserId', columns='ProductId', values='Rating', fill_value=0)
user_item_matrix

ProductId,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D041,D042,D043,D044,D045,D046,D047,D048,D049,D050
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,5.0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
1,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
2,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
622,0,0,0,0,0,0,0.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
623,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
624,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,5,0,0,0,0,0,0


In [19]:
%%time
user_similarity = cosine_similarity(user_item_matrix)

CPU times: total: 0 ns
Wall time: 1.32 ms


In [20]:
# Calculate item-item similarity matrix (cosine similarity)
item_sim_matrix = np.dot(user_item_matrix.T, user_item_matrix)

In [21]:
pd.DataFrame(item_sim_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,223.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16.0,2.0,0.0,0.0,0.0,4.0,12.0,0.0,0.0,0.0
1,0.0,163.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,166.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,19.0,0.0,0.0,3.0
3,20.0,4.0,3.0,178.0,0.0,0.0,8.0,0.0,3.0,0.0,...,20.0,0.0,0.0,0.0,4.0,0.0,2.0,9.0,15.0,0.0
4,0.0,0.0,3.0,0.0,131.0,15.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0
5,0.0,0.0,0.0,0.0,15.0,196.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,8.0,0.0,0.0,191.25,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,15.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,222.0,0.0,0.0,...,16.0,20.0,8.0,0.0,0.0,0.0,15.0,0.0,41.0,0.0
8,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,206.0,0.0,...,0.0,20.0,6.0,12.0,4.0,0.0,0.0,3.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,182.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,25.0


In [22]:
pd.DataFrame(user_similarity)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,616,617,618,619,620,621,622,623,624,625
0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.395904,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196116,0.0
3,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0
622,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.0
623,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.0
624,0.0,0.0,0.196116,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0


In [23]:
# Pivot the table to get ratings as columns and count frequency
users_ratings = df.pivot_table(index='UserId', columns='Rating', aggfunc='size', fill_value=0)

# Add the 'Frequency' column
users_ratings['Frequency'] = users_ratings.sum(axis=1)

# Rename the columns
users_ratings.columns = ['Rating ' + str(col) for col in users_ratings.columns]

# Reset the index to make 'UserId' a regular column
users_ratings = users_ratings.reset_index()

users_ratings = users_ratings.sort_values(by = 'Rating Frequency', ascending = False)

# Set the 'UserId' column as the index
users_ratings.set_index('UserId', inplace=True)

users_ratings.head(10)

Unnamed: 0_level_0,Rating 1,Rating 2,Rating 3,Rating 4,Rating 5,Rating Frequency
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
233,3,0,1,3,0,7
150,0,2,1,2,1,6
194,2,2,0,2,0,6
10,1,2,0,2,1,6
99,1,1,0,0,3,5
492,1,2,0,1,1,5
489,3,0,1,1,0,5
410,2,0,2,0,1,5
545,1,1,0,3,0,5
154,4,0,0,0,1,5


In [24]:
df['UserId'].value_counts().idxmax()

233

In [25]:
print(user_similarity.shape)  # This will print the dimensions of the array


(626, 626)


In [26]:
# Get the user ID of the user you want to recommend products to.
user_id = df['UserId'].value_counts().idxmax()


In [27]:
# Recommend products to the user.
recommended_products = recommend_products(int(user_id), user_similarity, user_item_matrix)

# Print the recommended products.
print(recommended_products)


['D001' 'D031' 'D047' 'D034' 'D015']


In [28]:

# Make recommendations for a given user
UserId = df['UserId'].value_counts().idxmax()
top_n = 3

# Retrieve the items not rated by the user
user_items = df.loc[df['UserId'] == user_id, 'ProductId'].tolist()
all_items = df['ProductId'].unique()
items_to_rate = list(set(all_items) - set(user_items))

# Predict ratings for the items not rated by the user
item_predictions = np.dot(user_item_matrix.loc[user_id], item_sim_matrix) / np.sum(item_sim_matrix, axis=1)

# Get top N item recommendations based on predicted ratings
top_item_indices = np.argsort(item_predictions)[::-1][:top_n]
top_item_recommendations = [all_items[i] for i in top_item_indices]

print(f"Top {top_n} Recommendations for User {user_id}: {top_item_recommendations}")

Top 3 Recommendations for User 233: ['D019', 'D029', 'D037']


In [50]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle


# Manually encoding UserId
user_encoder = {user: index for index, user in enumerate(df['UserId'].unique())}
# Save the encoder using pickle
with open('user_encoder.pkl', 'wb') as f:
    pickle.dump(user_encoder, f)

# Create user-item matrix
user_item_matrix = df.pivot_table(index='UserId', columns='ProductId', values='Rating', fill_value=0)

# Function to calculate cosine similarity and recommend products for a given user
def recommend_products(user_id, user_similarity, user_item_matrix):
    user_index = user_encoder[user_id]  # Use the manually encoded UserId
    similarity_scores = user_similarity[user_index]

    # Get top-k similar users
    k = 3
    similar_users = np.argsort(similarity_scores)[-k:][::-1]
    
    # Get the products rated by similar users but not rated by the current user
    unrated_products = user_item_matrix.loc[user_id, :].isna()
    unrated_products = unrated_products.index.values

    # Get similar users' ratings for unrated products
    similar_user_ratings = user_item_matrix.iloc[similar_users, :]

    # Calculate the weighted average of ratings from similar users
    recommended_ratings = similar_user_ratings.loc[:, unrated_products].mean(axis=0)
    
    # Return the top-k recommended products
    recommended_ratings_sorted = recommended_ratings.sort_values(ascending=False)
    return recommended_ratings_sorted.index[:k]

# Create a similarity matrix (cosine similarity)
user_similarity = cosine_similarity(user_item_matrix)

# Save the user similarity matrix using pickle
with open('user_similarity.pkl', 'wb') as f:
    pickle.dump(user_similarity, f)

# Optionally, you can save the user-item matrix or any other objects necessary for your system
with open('user_item_matrix.pkl', 'wb') as f:
    pickle.dump(user_item_matrix, f)

# You can test the saved model by reloading and running the recommendation
# For example, to reload and test:
with open('user_encoder.pkl', 'rb') as f:
    loaded_user_encoder = pickle.load(f)

with open('user_similarity.pkl', 'rb') as f:
    loaded_user_similarity = pickle.load(f)

with open('user_item_matrix.pkl', 'rb') as f:
    loaded_user_item_matrix = pickle.load(f)


In [51]:
# Test the recommendation system again
user_id = 'User916'  # Test for User1
recommended_products = recommend_products(user_id, loaded_user_similarity, loaded_user_item_matrix)
print(f"Recommended products for {user_id}: {recommended_products}")


Recommended products for User916: Index(['D007', 'D049', 'D040'], dtype='object', name='ProductId')
