# *Set Up The Environment*

---



In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
import os
import importlib
warnings.filterwarnings('ignore')
sns.set_theme(color_codes=True)

# *Automatically Check and Install Python Libraries at Runtime*

---




In [5]:
# Helper that imports a library and, when the import fails, installs it with pip at runtime.
def check_and_install_library(library_name, package_name=None):
    """Check that ``library_name`` is importable; install it with pip if it is not.

    Parameters
    ----------
    library_name : str
        Name used in the ``import`` statement (e.g. ``'opendatasets'``).
    package_name : str, optional
        Name handed to ``pip install`` when the distribution name differs from
        the import name. Defaults to ``library_name``.

    Returns
    -------
    None
        The outcome is reported through printed messages only.
    """
    import subprocess
    import sys

    try:
        importlib.import_module(library_name)
        print(f"{library_name} is already installed.")
        return
    except ImportError:
        print(f"{library_name} is not installed. Installing...")

    try:
        # Run pip as a subprocess of the current interpreter: pip has no supported
        # in-process API (pip.main was removed from the public package), and
        # sys.executable guarantees the install lands in this kernel's environment.
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', package_name or library_name]
        )
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
    except (subprocess.CalledProcessError, OSError):
        print("Error: Failed to install the library. Please install it manually.")

# *Load The Dataset*

---



In [6]:
# Fetch the Kaggle dataset only when its folder is absent from the working directory.
dataset_missing = 'amazon-product-reviews' not in os.listdir()
if dataset_missing:
    check_and_install_library('opendatasets')
    import opendatasets as od
    od.download('https://www.kaggle.com/datasets/irvifa/amazon-product-reviews')

opendatasets is not installed. Installing...


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: aishwaryarcse
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/irvifa/amazon-product-reviews
Downloading amazon-product-reviews.zip to ./amazon-product-reviews


100%|██████████| 109M/109M [00:00<00:00, 152MB/s]





In [7]:
# Load the ratings CSV; names= supplies the labels for its four columns.
column_names = ['userId', 'productId', 'rating', 'timestamp']
df = pd.read_csv('amazon-product-reviews/ratings_Electronics.csv', names=column_names)
# Report the frame's dimensions, then its column index.
print(df.shape, df.columns, sep='\n')

(7824482, 4)
Index(['userId', 'productId', 'rating', 'timestamp'], dtype='object')


# *Sampling 20% of a Large Dataset for Efficient Analysis*

---



In [8]:
# Draw a 20% sample of the ratings. frac=0.2 expresses the intent directly instead of a
# hard-coded row count, and the fixed random_state makes Restart & Run All pick the same rows.
electronics_data = df.sample(frac=0.2, random_state=42, ignore_index=True)

In [9]:
# Once the sample exists, delete the full dataframe so the memory it occupies can be reclaimed
del df

In [10]:
# Show the first 5 records of the sampled dataset
electronics_data.head()

Unnamed: 0,userId,productId,rating,timestamp
0,A2TYKAXDDND7OE,B0052SCU8U,5.0,1363737600
1,A11KRSBJMQILU,B002SEKD5I,5.0,1381881600
2,A26SS7BO0QJ25M,B003ES5ZUU,5.0,1356134400
3,AXBZZJ2GUT10L,B0034HECE6,3.0,1400716800
4,A2UZUH4ZY6KFQF,B001U2HBLI,4.0,1395100800


In [11]:
# Print a concise summary of the dataset: column dtypes, non-null counts and memory usage
electronics_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1564896 entries, 0 to 1564895
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1564896 non-null  object 
 1   productId  1564896 non-null  object 
 2   rating     1564896 non-null  float64
 3   timestamp  1564896 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 47.8+ MB


In [12]:
# Drop the timestamp column. Rebinding the name (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run is a no-op, not a KeyError.
electronics_data = electronics_data.drop(columns='timestamp', errors='ignore')

In [13]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
electronics_data.describe()

Unnamed: 0,rating
count,1564896.0
mean,4.00991
std,1.382609
min,1.0
25%,3.0
50%,5.0
75%,5.0
max,5.0


In [14]:
# Count missing values per column (this only reports them; nothing is filled or dropped)
electronics_data.isnull().sum()

Unnamed: 0,0
userId,0
productId,0
rating,0


In [15]:
# Count rows that are exact duplicates of an earlier row (reporting only; nothing is removed).
len(electronics_data.loc[electronics_data.duplicated()])

0

In [16]:
# Preview the first rows again to confirm the remaining columns
electronics_data.head()

Unnamed: 0,userId,productId,rating
0,A2TYKAXDDND7OE,B0052SCU8U,5.0
1,A11KRSBJMQILU,B002SEKD5I,5.0
2,A26SS7BO0QJ25M,B003ES5ZUU,5.0
3,AXBZZJ2GUT10L,B0034HECE6,3.0
4,A2UZUH4ZY6KFQF,B001U2HBLI,4.0


In [17]:
# Headline counts: number of rating rows, distinct users and distinct products.
print('Total rating : ', len(electronics_data))
print('Total unique users : ', electronics_data['userId'].nunique(dropna=False))
print('Total unique products : ', electronics_data['productId'].nunique(dropna=False))

Total rating :  1564896
Total unique users :  1226380
Total unique products :  237545


# *Analyzing the rating by user*

---



In [18]:
# Number of ratings submitted by each user, most active users first.
no_of_rated_products_per_user = (
    electronics_data
    .groupby('userId')['rating']
    .count()
    .sort_values(ascending=False)
)
no_of_rated_products_per_user.head(5)

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
A5JLAU2ARJ0BO,118
A3OXHLG6DIBRW8,104
ADLVFFE4VBT8,103
A680RUE1FDO8B,96
A1ODOGXEYECQQ8,79


# *Popularity Based Recommendation*

---



In [19]:
# Keep only products that received at least 50 ratings in the sample.
# A vectorised per-group count replaces groupby().filter(lambda ...), which calls the
# Python lambda once per product group; the selected rows, their order and index are the same.
ratings_per_product = electronics_data.groupby('productId')['rating'].transform('count')
data = electronics_data[ratings_per_product >= 50]
data.head()

Unnamed: 0,userId,productId,rating
0,A2TYKAXDDND7OE,B0052SCU8U,5.0
2,A26SS7BO0QJ25M,B003ES5ZUU,5.0
6,A1BU741P0BT7UM,B004BJLXAM,5.0
13,A6ME5B3NRT981,B001AAVA08,5.0
14,A1M2MPPL6OK3QJ,B002QDCP3Y,4.0


In [20]:
# Number of ratings received by each retained product, most-rated products first.
no_of_rating_per_product = (
    data
    .groupby('productId')['rating']
    .count()
    .sort_values(ascending=False)
)
no_of_rating_per_product.head(5)

Unnamed: 0_level_0,rating
productId,Unnamed: 1_level_1
B0074BW614,3722
B00DR0PDNE,3289
B007WTAJTO,2897
B0019EHU8G,2552
B006GWO5WK,2483


# *Collaborative filtering (Item-Item recommendation)*

---



In [21]:
# Import the Surprise library for collaborative filtering (the helper tries to install it first if the import fails)
check_and_install_library('surprise')
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


surprise is not installed. Installing...


Output()

In [22]:
# Wrap the filtered ratings in a Surprise dataset; the reader declares the 1-5 rating scale
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(data,reader)
# Split the Surprise dataset into 70% train / 30% test (test_size=0.3) with a fixed seed
trainset, testset = train_test_split(surprise_data, test_size=0.3,random_state=42)
# Use user_based true/false to switch between user-based or item-based collaborative filtering
# (user_based=False here, so similarities are computed between items)
algo = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)
# Make predictions for the held-out test set
test_pred=algo.test(testset)
# Print the RMSE; as the last expression, the returned value is also displayed by the notebook
print("Item-based Model : Test Set")
accuracy.rmse(test_pred ,verbose=True)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Item-based Model : Test Set
RMSE: 1.3118


1.3118189998789758

# *Model-based collaborative filtering system*

---



In [23]:
# Build a user x product matrix from a 20,000-rating subsample. The fixed random_state
# makes the subsample (and every model trained on ratings_matrix) reproducible across runs.
data2 = data.sample(20000, random_state=42)
# Cells without a rating are filled with 0, so 0 means "not rated" in this matrix.
ratings_matrix = data2.pivot_table(values='rating', index='userId', columns='productId', fill_value=0)
ratings_matrix.head()

productId,0972683275,1400532655,140053271X,B00000DM9W,B00000J1V5,B00000JI4F,B00000K135,B00000K2YR,B00001P4XA,B00001P4XH,...,B00HTPHK5W,B00HVLUR86,B00HWMPRKW,B00I2VIR2M,B00I2ZBD1U,B00I94IPTW,B00IGISO9C,B00INNP5VU,B00ISGCAJM,B00IVPU786
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00114642GUWAUJDR3BFN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00744712QEFC7AWV5R09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0140712R69E7ZY58CEW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A027389427NLQ1FLHLQH0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0498079L5Z4YA0JYG18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# *Item-Based Similarity Filtering (Cosine Similarity on Ratings)*

---



In [24]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute the similarity matrix for items (products).
# Transposing puts one product per row, so each product is compared by its vector of user ratings.
item_similarity = cosine_similarity(ratings_matrix.T)

# Convert to a DataFrame labelled by product ID on both axes for easier lookup
item_similarity_df = pd.DataFrame(item_similarity, index=ratings_matrix.columns, columns=ratings_matrix.columns)

# Function to get top N similar products for a given product
def get_similar_products(product_id, top_n=5):
    """Return the ``top_n`` products most similar to ``product_id``.

    Parameters
    ----------
    product_id : label
        A column label of ``item_similarity_df``.
    top_n : int, default 5
        Maximum number of similar products to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by product ID, highest first. The queried
        product itself is never included.

    Raises
    ------
    KeyError
        If ``product_id`` is not a column of ``item_similarity_df``.
    """
    # Remove the product by label instead of skipping position 0 of the sorted result:
    # when another product ties with the self-similarity, the sort does not guarantee
    # that the queried product comes first, and it could be returned as its own neighbour.
    similarities = item_similarity_df[product_id].drop(labels=product_id)
    return similarities.sort_values(ascending=False).head(top_n)

# Example: look up the products most similar to the first product column of ratings_matrix
product_id = ratings_matrix.columns[0]  # Replace with a specific product ID if known
print(f"Top 5 similar products to {product_id}:")
print(get_similar_products(product_id, top_n=5))


Top 5 similar products to 0972683275:
productId
B005PXOGWU    0.0
B005PXMKI2    0.0
B005PUZOYM    0.0
B005PSTV2K    0.0
B005PSQ5TM    0.0
Name: 0972683275, dtype: float64


# *Neural Collaborative Filtering (NCF)*

---



In [25]:
pip install tensorflow




# *1. Define the NCF Model:*

---



In [26]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
# Define the number of users and items (the embedding tables are sized from ratings_matrix)
num_users = ratings_matrix.shape[0]  # Number of rows (users) in ratings_matrix
num_items = ratings_matrix.shape[1]  # Number of columns (products) in ratings_matrix

# Define model inputs: one integer index per example for the user and for the item
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# Embedding layers: each user / item index is mapped to a 50-dimensional vector
user_embedding = Embedding(input_dim=num_users, output_dim=50, name='user_embedding')(user_input)
item_embedding = Embedding(input_dim=num_items, output_dim=50, name='item_embedding')(item_input)

# Flatten embeddings
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

# Concatenate user and item vectors
concat = Concatenate()([user_vec, item_vec])

# Fully connected layers; the single sigmoid unit outputs an interaction score in (0, 1)
dense = Dense(128, activation='relu')(concat)
dense = Dropout(0.2)(dense)
dense = Dense(64, activation='relu')(dense)
output = Dense(1, activation='sigmoid')(dense)

# Build and compile the model as a binary classifier (interaction vs. no interaction)
model = Model([user_input, item_input], output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# *2. Prepare Training Data:*

---



In [27]:
# Prepare the training data

# Positive examples: every (user, item) cell of ratings_matrix that holds a rating.
# np.nonzero walks the boolean mask in row-major order (user by user, item by item),
# which avoids a Python-level pass over every cell of the matrix.
interaction_mask = ratings_matrix.to_numpy() > 0
positive_users, positive_items = np.nonzero(interaction_mask)

# Negative examples: with only label-1 rows, binary cross-entropy is minimised by
# predicting 1 everywhere and the model learns nothing about preferences. For each
# positive pair, sample random items for the same user and keep those the user has
# not rated, labelling them 0.
NEGATIVES_PER_POSITIVE = 4  # tunable ratio of sampled negatives to observed positives
ncf_rng = np.random.default_rng(42)  # seeded so the sampled negatives are reproducible
negative_users = np.repeat(positive_users, NEGATIVES_PER_POSITIVE)
negative_items = ncf_rng.integers(0, interaction_mask.shape[1], size=negative_users.size)
is_unrated = ~interaction_mask[negative_users, negative_items]
negative_users = negative_users[is_unrated]
negative_items = negative_items[is_unrated]

# Combine positives and negatives into the NumPy arrays consumed by model.fit
user_ids = np.concatenate([positive_users, negative_users])  # numerical user index
item_ids = np.concatenate([positive_items, negative_items])  # numerical item index
ratings = np.concatenate([
    np.ones(positive_users.size, dtype=np.int64),   # 1 = observed interaction
    np.zeros(negative_users.size, dtype=np.int64),  # 0 = sampled non-interaction
])

# Shuffle once: Keras takes validation_split from the END of the arrays before its own
# shuffling, so unshuffled data would put only negatives in the validation set.
shuffled_order = ncf_rng.permutation(user_ids.size)
user_ids = user_ids[shuffled_order]
item_ids = item_ids[shuffled_order]
ratings = ratings[shuffled_order]


# *3. Train the Model:*

---



In [28]:
# Fit the NCF model on (user index, item index) pairs; 10% of the data is held out for validation.
history = model.fit(
    x=[user_ids, item_ids],
    y=ratings,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# *4. Predict Recommendations:*

---


In [29]:
# Predict for a specific user
def recommend_items(user_id, top_n=5):
    """Recommend up to ``top_n`` products the user has not rated yet.

    Parameters
    ----------
    user_id : label
        A row label of ``ratings_matrix``.
    top_n : int, default 5
        Maximum number of products to return.

    Returns
    -------
    pandas.Index
        Product IDs ordered from highest to lowest predicted score. Fewer than
        ``top_n`` are returned when the user has fewer unrated products.

    Raises
    ------
    KeyError
        If ``user_id`` is not in ``ratings_matrix.index``.
    """
    user_index = ratings_matrix.index.get_loc(user_id)  # Positional index of the user
    all_item_indices = np.arange(num_items)  # Indices of all items

    # Predict scores for all items for the given user
    predictions = model.predict([np.full(num_items, user_index), all_item_indices])
    scores = np.asarray(predictions).flatten()

    # Products the user already rated are not recommendations: push them to the bottom.
    already_rated = ratings_matrix.iloc[user_index].to_numpy() > 0
    scores = np.where(already_rated, -np.inf, scores)

    # Get the top N item indices, discarding masked entries if top_n exceeds the unrated count
    top_items = np.argsort(-scores)[:top_n]
    top_items = top_items[np.isfinite(scores[top_items])]

    # Map indices back to product IDs
    return ratings_matrix.columns[top_items]

# Example: recommend items for the first user (row) of ratings_matrix
user_id = ratings_matrix.index[0]  # Replace with actual user_id
print(f"Recommended items for user {user_id}:")
print(recommend_items(user_id, top_n=5))


Recommended items for user A00114642GUWAUJDR3BFN:
Index(['B003M0NT1M', 'B003M5IQLU', 'B008G1FFBO', 'B008DBI5RI', 'B0035PBHX6'], dtype='object', name='productId')


# *Implement Autoencoders for Recommendation*

---



a. Model Definition

---



In [30]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Input layer: one feature per product column of ratings_matrix
input_layer = Input(shape=(ratings_matrix.shape[1],))

# Encoder: compress the user's rating vector down to a 128-dimensional code
encoded = Dense(512, activation='relu')(input_layer)
encoded = Dense(256, activation='relu')(encoded)
encoded = Dense(128, activation='relu')(encoded)

# Decoder: expand the code back to one value per product
decoded = Dense(256, activation='relu')(encoded)
decoded = Dense(512, activation='relu')(decoded)
# Linear output: the targets are raw ratings (up to 5 for rated products), which a
# sigmoid, bounded to (0, 1), could never reproduce under an MSE loss.
decoded = Dense(ratings_matrix.shape[1], activation='linear')(decoded)

# Autoencoder Model
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train the autoencoder to reconstruct its own input, holding out 10% of users for validation
history = autoencoder.fit(ratings_matrix.values, ratings_matrix.values,
                          epochs=50, batch_size=128, validation_split=0.1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


b. Get Recommendations

---



In [31]:
# Get the reconstructed user-item matrix: the autoencoder's output for every row of ratings_matrix
reconstructed_matrix = autoencoder.predict(ratings_matrix.values)

# Recommend items for a user
def recommend_items_autoencoder(user_id, top_n=5):
    """Recommend up to ``top_n`` not-yet-rated products from the reconstructed matrix.

    Parameters
    ----------
    user_id : label
        A row label of ``ratings_matrix``.
    top_n : int, default 5
        Maximum number of products to return.

    Returns
    -------
    pandas.Index
        Product IDs ordered from highest to lowest reconstructed score. Fewer
        than ``top_n`` are returned when the user has fewer unrated products.

    Raises
    ------
    KeyError
        If ``user_id`` is not in ``ratings_matrix.index``.
    """
    user_index = ratings_matrix.index.get_loc(user_id)
    # Work on a copy so masking never modifies reconstructed_matrix itself.
    scores = np.array(reconstructed_matrix[user_index], dtype=float)
    # An autoencoder is trained to reproduce its input, so products the user already
    # rated would otherwise dominate the ranking; exclude them.
    scores[ratings_matrix.iloc[user_index].to_numpy() > 0] = -np.inf
    top_items = np.argsort(-scores)[:top_n]
    top_items = top_items[np.isfinite(scores[top_items])]
    return ratings_matrix.columns[top_items]

# Example: autoencoder recommendations for the first user (row) of ratings_matrix
user_id = ratings_matrix.index[0]  # Example user ID
print(recommend_items_autoencoder(user_id))


Index(['B009VN9EQS', 'B002QCGLHQ', 'B002LBQWMG', 'B000VJRUKS', 'B004YADU4A'], dtype='object', name='productId')


# *Implement RNN for Sequential Recommendations*

---



In [40]:

# Assuming 'ratings_matrix' is already defined from the previous code

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential

# Build sliding-window training pairs for the RNN
def create_sequences(user_ratings, sequence_length):
    """Slice ``user_ratings`` into fixed-length windows and their following values.

    Each window holds ``sequence_length`` consecutive entries; its target is the
    entry immediately after the window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, targets)``. Both are empty when ``user_ratings`` has no
        more than ``sequence_length`` entries.
    """
    window_starts = range(len(user_ratings) - sequence_length)
    windows = [user_ratings[start:start + sequence_length] for start in window_starts]
    next_values = [user_ratings[start + sequence_length] for start in window_starts]
    return np.array(windows), np.array(next_values)

# Example usage for a single user (adapt for all users)
user_id = ratings_matrix.index[0]
# NOTE(review): this is the user's row of ratings_matrix, so the "sequence" follows the
# product-column order of the matrix, not the order in which ratings were made
# (the timestamp column was dropped earlier in the notebook).
user_ratings = ratings_matrix.loc[user_id].values  # Get ratings for the user

sequence_length = 5  # Length of the input sequence
sequences, targets = create_sequences(user_ratings, sequence_length)


# RNN model
# NOTE(review): this rebinds the global name `model`, which recommend_items() also reads;
# after this cell runs, recommend_items() would call this LSTM instead of the NCF model.
model = Sequential()
model.add(LSTM(64, input_shape=(sequence_length, 1)))  # One feature per time step
model.add(Dense(1, activation='linear'))   # Single regression output: the next value in the window

model.compile(loss='mse', optimizer='adam')

# Reshape data for LSTM input: (number of windows, time steps, features)
sequences = sequences.reshape(sequences.shape[0], sequence_length, 1)

# Train the model
model.fit(sequences, targets, epochs=10, batch_size=32)


# Predict the next item in the sequence
def predict_next_item(user_id, sequence):
  """Run the RNN on one window and map the result to a product ID.

  `user_id` is accepted but not used by the body.

  NOTE(review): the model ends in Dense(1), so `prediction` holds a single value and
  np.argmax over it is always 0 -- this function therefore always returns
  ratings_matrix.columns[0]. The model predicts a rating value, not an item index;
  the mapping needs a redesign before the result is meaningful.
  """
  # Make sure the input sequence has the correct shape: (1 window, time steps, 1 feature)
  sequence = np.array(sequence).reshape(1, sequence_length, 1)
  prediction = model.predict(sequence)
  # Position of the largest entry in the prediction array (see NOTE above)
  predicted_item_index = np.argmax(prediction)
  # Return the product ID corresponding to the predicted index
  return ratings_matrix.columns[predicted_item_index]

# Example usage: feed the first `sequence_length` entries of the user's rating row
example_sequence = list(user_ratings[:sequence_length]) # Replace with the user's actual past interactions
predicted_item = predict_next_item(user_id, example_sequence)
print("Predicted next item:", predicted_item)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predicted next item: 0972683275


# *Evaluate Using MSE and MAE*

---



In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Prepare test data
# NOTE(review): these windows are rebuilt from the same user_ratings and sequence_length
# that produced the training windows, so the metrics below are training-set errors,
# not an estimate of performance on unseen data.
test_sequences, test_targets = create_sequences(user_ratings, sequence_length)

# Reshape test sequences for LSTM input: (number of windows, time steps, features)
test_sequences = test_sequences.reshape(test_sequences.shape[0], sequence_length, 1)

# Predictions on test data
predictions = model.predict(test_sequences)

# Calculate metrics (RMSE is the square root of the MSE)
mse = mean_squared_error(test_targets, predictions)
mae = mean_absolute_error(test_targets, predictions)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Mean Squared Error (MSE): 0.005702881597978604
Mean Absolute Error (MAE): 0.0011762839622084344
Root Mean Squared Error (RMSE): 0.07551742579020158


# *Evaluate Using Precision@K and Recall@K*

---



In [43]:
def precision_at_k(predictions, targets, k=10, relevance_threshold=0):
    """Fraction of the ``k`` highest-scored positions that are actually relevant.

    Parameters
    ----------
    predictions : array-like
        Predicted scores; flattened before ranking.
    targets : array-like
        True values aligned with ``predictions``.
    k : int, default 10
        Number of top-scored positions to inspect.
    relevance_threshold : float, default 0
        A position is relevant when its target is strictly greater than this.

    Returns
    -------
    float
        ``|top-k predicted ∩ relevant| / k``.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    scores = np.asarray(predictions).ravel()
    actual = np.asarray(targets).ravel()
    # Indices of the top K predictions
    top_k_preds = set(np.argsort(scores)[-k:].tolist())
    # Relevance comes from the target values themselves. Ranking the targets and taking
    # the last k indices would pick arbitrary positions whenever targets tie (e.g. many
    # zeros), counting unrated positions as relevant.
    relevant = set(np.flatnonzero(actual > relevance_threshold).tolist())
    return len(top_k_preds & relevant) / k

def recall_at_k(predictions, targets, k=10, relevance_threshold=0):
    """Fraction of all relevant positions that appear among the ``k`` highest-scored.

    Parameters
    ----------
    predictions : array-like
        Predicted scores; flattened before ranking.
    targets : array-like
        True values aligned with ``predictions``.
    k : int, default 10
        Number of top-scored positions to inspect.
    relevance_threshold : float, default 0
        A position is relevant when its target is strictly greater than this.

    Returns
    -------
    float
        ``|top-k predicted ∩ relevant| / |relevant|``, or 0.0 when nothing is relevant.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    scores = np.asarray(predictions).ravel()
    actual = np.asarray(targets).ravel()
    # Indices of the top K predictions
    top_k_preds = set(np.argsort(scores)[-k:].tolist())
    # The denominator is the number of relevant positions, not k; dividing by the size
    # of a top-k target list makes recall identical to precision by construction.
    relevant = set(np.flatnonzero(actual > relevance_threshold).tolist())
    if not relevant:
        return 0.0
    return len(top_k_preds & relevant) / len(relevant)
# Example usage for Precision@K and Recall@K on the RNN predictions and their targets
precision = precision_at_k(predictions, test_targets, k=10)
recall = recall_at_k(predictions, test_targets, k=10)

print(f"Precision@10: {precision}")
print(f"Recall@10: {recall}")


Precision@10: 0.0
Recall@10: 0.0


# *Evaluate Using F1-Score*

---



In [44]:
def f1_score(precision, recall):
    """Harmonic mean of ``precision`` and ``recall``; 0 when their sum is zero."""
    denominator = precision + recall
    # Guard the division: with both inputs at zero the score is defined as 0.
    if denominator == 0:
        return 0
    numerator = 2 * (precision * recall)
    return numerator / denominator

# Combine the Precision@10 and Recall@10 values computed above into a single F1 score
f1 = f1_score(precision, recall)
print(f"F1-Score@10: {f1}")


F1-Score@10: 0
