In [1]:
# Necessary library for this model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA,NMF,TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the preprocessed interaction data (relative path, resolved against the kernel's working directory).
df = pd.read_csv("Preprocessed_recommendation.csv")
# Seeded preview so that Restart & Run All shows the same three rows every time.
df.sample(3, random_state=42)

Unnamed: 0,UserID,ItemID,Rating,Clicks,Views,TimeSpentOnItem,SessionDuration,DeviceType,Age,Gender,...,Description_198,Description_199,Reviews_Sentiment,Device_Desktop,Device_Mobile,Device_Tablet,Time_Afternoon,Time_Evening,Time_Morning,Time_Night
4090,U02541,Item00203,2,0.684211,0.77551,0.956124,0.778198,1,0.392157,0,...,0.0,0.0,0.166667,1,0,0,0,1,0,0
3059,U02049,Item00358,5,0.210526,0.938776,0.093507,0.309698,0,0.215686,2,...,0.0,0.0,0.115476,0,1,0,0,0,0,1
4704,U04905,Item00941,2,0.684211,0.285714,0.444612,0.598621,1,0.45098,1,...,0.0,0.0,0.0,0,1,0,0,0,1,0


# user-item matrix
First, we create a user-item matrix with UserID as rows and ItemID as columns.
- A user-item matrix organizes the interaction data between users and items in a way that is easy to analyze and to feed into recommendation algorithms. It lets us use mathematical techniques to extract patterns and predict which items a user might enjoy or be interested in.


In [3]:
# Pivot the interaction log into a user x item grid: one row per UserID, one
# column per ItemID, each cell holding the Rating (pivot_table averages repeated
# UserID/ItemID pairs by default). Pairs with no rating are then set to 0.
ratings_wide = df.pivot_table(values='Rating', index='UserID', columns='ItemID')
user_item_matrix = ratings_wide.fillna(0)

In [4]:
# Show the user-item matrix: UserID as rows, ItemID as columns, Rating as values (0 where no rating exists)
user_item_matrix

ItemID,Item00001,Item00002,Item00003,Item00004,Item00005,Item00006,Item00007,Item00008,Item00009,Item00010,...,Item00990,Item00991,Item00992,Item00993,Item00994,Item00995,Item00996,Item00998,Item00999,Item01000
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U04994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U04995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U04997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U04998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Here, 0 means that the user has not interacted with the item (or hasn’t rated it).
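- As an illustration (hypothetical values, not taken from the output above): a row for User 1 holding 4 under Item 1, 3 under Item 3 and 0 under Item 2 means that User 1 has rated Item 1 with a score of 4 and Item 3 with a score of 3, but hasn’t interacted with Item 2.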

# Apply Matrix factorization with NMF

In [5]:
# Number of latent components for the NMF decomposition; adjust to the dataset.
n_nmf_components = 20
# Random initialisation with a fixed seed so the factorization is repeatable.
nmf_model = NMF(
    n_components=n_nmf_components,
    init='random',
    random_state=42,
)

In [6]:
# Fit the NMF model to the user-item matrix.
# fit_transform learns the factorization and returns the user-side factor
# (one row per user, n_nmf_components columns); the item-side factor
# (n_nmf_components rows, one column per item) is stored on the fitted model.
user_features = nmf_model.fit_transform(user_item_matrix)  # user-feature matrix
item_features = nmf_model.components_ # item-feature matrix

# Reconstruct the User-Item matrix

In [7]:
# Multiply the two NMF factors back together to get a predicted score for every (user, item) cell.
reconstructed_matrix = user_features @ item_features

In [8]:
# Reconstruction RMSE over every cell of the matrix. Unrated cells were filled
# with 0 when the matrix was built, so this figure is averaged over those zeros
# as well as over real ratings.
original_matrix = user_item_matrix.values
rmse = np.sqrt(mean_squared_error(original_matrix, reconstructed_matrix))
print(f'Overall Reconstruction RMSE: {rmse}')

# RMSE restricted to cells that hold an actual rating (> 0). This is the figure
# that says how closely the factorization reproduces ratings users really gave.
observed_mask = original_matrix > 0
if observed_mask.any():
    observed_errors = original_matrix[observed_mask] - reconstructed_matrix[observed_mask]
    observed_rmse = np.sqrt(np.mean(observed_errors ** 2))
else:
    # No rated cell at all: there is nothing to average over.
    observed_rmse = float('nan')
print(f'Reconstruction RMSE on rated cells only: {observed_rmse}')

Overall Reconstruction RMSE: 0.12858769292702305


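- The RMSE of 0.1286 is computed over every cell of the user-item matrix, including the unrated cells that were filled with 0. Because most cells appear to be unrated, a low value here mainly shows that the reconstruction stays close to 0 where no rating exists; on its own it does not show that the NMF model predicts actual ratings well (the MSE computed later in this notebook on rated entries only is about 10.13). To assess the recommendation quality, I should also look at ranking metrics like Precision@K, Recall@K, and MAP. These will give a better understanding of how well the model is recommending items.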

- The Overall Reconstruction RMSE of 0.1286 in our code is the Root Mean Squared Error (RMSE) between the original user-item matrix (actual ratings, with 0 for unrated cells) and the reconstructed matrix generated by NMF (Non-negative Matrix Factorization).

# Define Recommendation Function
- Now we can create the function for a collaborative-filtering recommendation system.

In [9]:
def recommend_items(user_id, user_item_matrix, reconstructed_matrix, n_recommendations=7):
    """
    Recommend the top-N not-yet-rated items for one user.

    Parameters:
    - user_id: Label of the user in ``user_item_matrix.index``.
    - user_item_matrix: DataFrame of known ratings (rows: users, columns: items).
      A cell counts as rated when its value is > 0; 0 and NaN count as unrated.
    - reconstructed_matrix: 2-D array of predicted scores whose rows and columns
      are positionally aligned with ``user_item_matrix``.
    - n_recommendations: Number of recommendations to return (fewer are returned
      when the user has fewer unrated items).

    Returns:
    - A DataFrame with columns 'ItemID' and 'PredictedRating', ordered by
      predicted score, highest first. Items with equal scores keep the column
      order of ``user_item_matrix``.

    Raises:
    - KeyError: if ``user_id`` is not in ``user_item_matrix.index``.
    - ValueError: if the user's prediction row does not hold exactly one score
      per item column.
    """
    user_index = user_item_matrix.index.get_loc(user_id)
    user_ratings = np.asarray(reconstructed_matrix[user_index])

    n_items = len(user_item_matrix.columns)
    if user_ratings.shape != (n_items,):
        # Fail loudly instead of pairing scores with the wrong items.
        raise ValueError(
            f"reconstructed_matrix row has shape {user_ratings.shape}, "
            f"expected ({n_items},) to match user_item_matrix columns"
        )

    # One boolean per item column; avoids scanning a list of rated items for every column.
    unrated_mask = ~(user_item_matrix.iloc[user_index] > 0).to_numpy()
    candidate_items = user_item_matrix.columns[unrated_mask]
    candidate_scores = user_ratings[unrated_mask]

    # Stable sort on the negated scores = descending order with ties left in column order.
    top_positions = np.argsort(-candidate_scores, kind='stable')[:n_recommendations]

    return pd.DataFrame(
        {
            'ItemID': candidate_items[top_positions].tolist(),
            'PredictedRating': candidate_scores[top_positions],
        },
        columns=['ItemID', 'PredictedRating'],
    )


In [10]:
# Demo: top-7 NMF-based recommendations for a single user.
user_id = 'U00332'  # must be a label present in user_item_matrix.index

# Uses the user_item_matrix and the NMF reconstructed_matrix built in the cells above.
recommendations = recommend_items(
    user_id,
    user_item_matrix,
    reconstructed_matrix,
    n_recommendations=7,
)

# Show the result
print(recommendations)



      ItemID  PredictedRating
0  Item00238         0.299044
1  Item00665         0.113591
2  Item00807         0.104447
3  Item00301         0.098806
4  Item00433         0.078779
5  Item00858         0.061290
6  Item00809         0.058764


# Split the Data Into Training and Testing Sets
- To define the test set, split the rows (users) of user_item_matrix into training and testing datasets.

In [11]:
# train_test_split is already imported in the first cell of the notebook.
# The split is over rows, i.e. every user lands entirely in train or entirely in test.
train, test = train_test_split(user_item_matrix, test_size=0.2, random_state=42)

# Re-expand both splits to the full user index and item columns. Users that went
# to the other split become all-NaN rows, which keeps the row positions of
# `train` and `test` identical to those of user_item_matrix (and therefore to the
# rows of reconstructed_matrix). The evaluation cells below rely on that alignment.
# NOTE(review): the factorization is fit on the full user_item_matrix, so the
# ratings of the test users are not hidden from the model; treat the metrics
# below as optimistic until an interaction-level hold-out is used.
train = train.reindex(index=user_item_matrix.index, columns=user_item_matrix.columns)
test = test.reindex(index=user_item_matrix.index, columns=user_item_matrix.columns)


# Use the test Dataset in the Metric Calculation
- Now that test is defined, use it to evaluate your recommender system.

In [12]:
def precision_at_k(recommendations, test_data, k=7):
    """
    Calculate Precision@K for a single user.

    Parameters:
    - recommendations: DataFrame with an 'ItemID' column, best recommendation first.
    - test_data: Series of the user's actual ratings, indexed by item ID.
      Items with a value > 0 are relevant; 0 and NaN are not.
    - k: Number of recommendations to consider (default: 7). Must be positive.

    Returns:
    - Share of the top-K slots filled by a relevant item. The denominator is
      always ``k``, even when fewer than ``k`` recommendations are supplied.

    Raises:
    - ValueError: if ``k`` is not positive (the ratio is undefined for k = 0
      and would come out negative for k < 0).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Relevant items: everything the user actually rated in the test data.
    relevant_items = set(test_data[test_data > 0].index)

    # Explicit positional slice of the first K recommended items.
    top_k_items = set(recommendations['ItemID'].iloc[:k])

    return len(top_k_items & relevant_items) / k


In [13]:
k = 7
precision_scores = []

for user_id in test.index:
    held_out = test.loc[user_id]
    # Rows of users that belong to the training split are all-NaN in `test`
    # and sum to 0, so only users with at least one test rating are scored.
    if held_out.sum() > 0:
        recommendations = recommend_items(user_id, train, reconstructed_matrix, n_recommendations=k)
        precision = precision_at_k(recommendations, held_out, k=k)
        precision_scores.append(precision)

# Guard the average: with no scorable user the metric is undefined, not a crash.
avg_precision = sum(precision_scores) / len(precision_scores) if precision_scores else float('nan')
print(f"Precision@{k}: {avg_precision}")


Precision@7: 0.03508771929824561


In [14]:
# Mean Squared Error (MSE) over the rated cells of the test frame.
# Each test user is mapped to its row position in user_item_matrix (the frame the
# factorization was fit on), so predictions are paired with the right user even
# if `test` holds a subset of users or a different row order.
# Assumes `test` has the same item columns, in the same order, as user_item_matrix.
row_positions = user_item_matrix.index.get_indexer(test.index)
if (row_positions < 0).any():
    raise KeyError("test contains users that are missing from user_item_matrix")

test_values = test.to_numpy()
rated_mask = test_values > 0  # NaN compares as False, so missing cells are skipped

# Boolean indexing walks the cells user by user, item by item.
actual = test_values[rated_mask].tolist()
predicted = reconstructed_matrix[row_positions][rated_mask].tolist()

mse = mean_squared_error(actual, predicted)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 10.131396776852757


# Matrix Factorization with SVD (Singular Value Decomposition)

In [15]:


# Step 1: Prepare Data (user_item_matrix comes from the cells above)
# Missing ratings are represented as 0 (assuming explicit feedback).
user_item_matrix_filled = user_item_matrix.fillna(0)

# Step 2: Apply SVD
n_factors = 20  # number of latent features to keep
svd = TruncatedSVD(
    n_components=n_factors,
    random_state=42,
)
U = svd.fit_transform(user_item_matrix_filled)  # user-side factor
Vt = svd.components_  # item-side factor

# Step 3: Reconstruct the Matrix
# Note: this rebinds reconstructed_matrix, so from here on the name refers to the
# SVD reconstruction rather than the NMF one computed earlier.
reconstructed_matrix = U @ Vt




In [16]:
# Step 4: Recommendation Function
def recommend_items_svd(user_id, user_item_matrix, reconstructed_matrix, n_recommendations=7):
    """
    Recommend the top-N not-yet-rated items for one user from a reconstructed matrix.

    Same contract as ``recommend_items``; kept as a separate name for the SVD section.

    Parameters:
    - user_id: Label of the user in ``user_item_matrix.index``.
    - user_item_matrix: DataFrame of known ratings (rows: users, columns: items).
      A cell counts as rated when its value is > 0; 0 and NaN count as unrated.
    - reconstructed_matrix: 2-D array of predicted scores whose rows and columns
      are positionally aligned with ``user_item_matrix``.
    - n_recommendations: Number of recommendations to return.

    Returns:
    - A DataFrame with columns 'ItemID' and 'PredictedRating', highest predicted
      score first; equal scores keep the column order of ``user_item_matrix``.

    Raises:
    - KeyError: if ``user_id`` is not in ``user_item_matrix.index``.
    - ValueError: if the user's prediction row does not hold exactly one score
      per item column.
    """
    user_index = user_item_matrix.index.get_loc(user_id)
    user_ratings = np.asarray(reconstructed_matrix[user_index])

    n_items = len(user_item_matrix.columns)
    if user_ratings.shape != (n_items,):
        # Fail loudly instead of pairing scores with the wrong items.
        raise ValueError(
            f"reconstructed_matrix row has shape {user_ratings.shape}, "
            f"expected ({n_items},) to match user_item_matrix columns"
        )

    # Boolean mask of unrated columns; no per-column scan of a rated-items list.
    unrated_mask = ~(user_item_matrix.iloc[user_index] > 0).to_numpy()
    candidate_items = user_item_matrix.columns[unrated_mask]
    candidate_scores = user_ratings[unrated_mask]

    # Stable sort on the negated scores = descending order with ties left in column order.
    top_positions = np.argsort(-candidate_scores, kind='stable')[:n_recommendations]

    return pd.DataFrame(
        {
            'ItemID': candidate_items[top_positions].tolist(),
            'PredictedRating': candidate_scores[top_positions],
        },
        columns=['ItemID', 'PredictedRating'],
    )

# Demo: top-7 SVD-based recommendations for one user.
user_id = "U00332"
recommendations = recommend_items_svd(
    user_id,
    user_item_matrix,
    reconstructed_matrix,
    n_recommendations=7,
)
print(recommendations)

      ItemID  PredictedRating
0  Item00106         0.259306
1  Item00238         0.248931
2  Item00569         0.158432
3  Item00652         0.123753
4  Item00417         0.119289
5  Item00301         0.114082
6  Item00767         0.108882


In [17]:
def precision_at_k(recommendations, test_data, k=7):
    """
    Calculate Precision@K for a single user.

    This cell re-declares the metric already defined earlier in the notebook;
    whichever definition ran last is the one in effect.

    Parameters:
    - recommendations: DataFrame with an 'ItemID' column, best recommendation first.
    - test_data: Series of the user's actual ratings, indexed by item ID.
      Items with a value > 0 are relevant; 0 and NaN are not.
    - k: Number of recommendations to consider (default: 7). Must be positive.

    Returns:
    - Share of the top-K slots filled by a relevant item (denominator is always ``k``).

    Raises:
    - ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")
    relevant_items = set(test_data[test_data > 0].index)
    top_k_items = set(recommendations['ItemID'].iloc[:k])
    return len(top_k_items & relevant_items) / k

k = 7
precision_scores = []

for user_id in test.index:
    held_out = test.loc[user_id]
    # Only users with at least one positive rating in `test` are scored;
    # all-NaN rows sum to 0 and are skipped.
    if held_out.sum() > 0:
        recommendations = recommend_items_svd(user_id, train, reconstructed_matrix, n_recommendations=k)
        precision = precision_at_k(recommendations, held_out, k=k)
        precision_scores.append(precision)

# Guard the average: with no scorable user the metric is undefined, not a crash.
avg_precision = sum(precision_scores) / len(precision_scores) if precision_scores else float('nan')
print(f"Precision@{k}: {avg_precision}")


Precision@7: 0.049441786283891544


In [18]:
# MSE over the rated cells of the test frame, using the SVD reconstruction.
# Each test user is mapped to its row position in user_item_matrix (the frame the
# factorization was fit on), so predictions are paired with the right user even
# if `test` holds a subset of users or a different row order.
# Assumes `test` has the same item columns, in the same order, as user_item_matrix.
row_positions = user_item_matrix.index.get_indexer(test.index)
if (row_positions < 0).any():
    raise KeyError("test contains users that are missing from user_item_matrix")

test_values = test.to_numpy()
rated_mask = test_values > 0  # NaN compares as False, so missing cells are skipped

# Boolean indexing walks the cells user by user, item by item.
actual = test_values[rated_mask].tolist()
predicted = reconstructed_matrix[row_positions][rated_mask].tolist()

mse = mean_squared_error(actual, predicted)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 10.019389425015301
