### Matrix factorization-based recommendation system for books

* The system will use collaborative filtering with Singular Value Decomposition (SVD) to recommend books based on user preferences.

In [1]:
#importing Librarires

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

In [2]:
# importing data

books = pd.read_csv('Books.csv')
ratings = pd.read_csv('Ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [4]:
# Preprocess data

ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981756 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  981756 non-null  int64
 1   user_id  981756 non-null  int64
 2   rating   981756 non-null  int64
dtypes: int64(3)
memory usage: 22.5 MB


In [5]:
ratings.shape

(981756, 3)

In [7]:
duplicates = ratings[ratings.duplicated(subset=['user_id', 'book_id'], keep=False)]
print(duplicates)

        book_id  user_id  rating
1170         12    40251       4
1171         12    40251       4
2473         25    32635       5
2474         25    32635       5
2717         28     9011       2
...         ...      ...     ...
981044     9993    33994       4
981256     9995    28898       2
981257     9995    28898       2
981368     9996    48576       5
981369     9996    48576       5

[4487 rows x 3 columns]


### Dropiing duplicates

In [8]:
ratings = ratings.drop_duplicates(subset=['user_id', 'book_id'])

In [9]:
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


### Finding the null values

In [10]:
ratings.isnull().sum()

Unnamed: 0,0
book_id,0
user_id,0
rating,0


### Reducing the matrix

* Filter the original ratings DataFrame to only include top books and users

In [11]:
# Reducing the matrix size

# Count ratings per book and select the top 1000 books
top_books = ratings['book_id'].value_counts().head(1000).index

# Count ratings per user and select the top 10000 users
top_users = ratings['user_id'].value_counts().head(10000).index

# Filter the original ratings DataFrame to only include top books and users
filtered_ratings = ratings[(ratings['book_id'].isin(top_books)) & (ratings['user_id'].isin(top_users))]

# Pivot this filtered data to create the reduced user-item matrix
reduced_user_item_matrix = filtered_ratings.pivot(index='user_id', columns='book_id', values='rating').fillna(0)

# Check the new dimensions of the matrix to confirm the reduction
print(reduced_user_item_matrix.shape)


(7205, 1000)


In [19]:
reduced_user_item_matrix.shape

(7205, 1000)

In [17]:
reduced_user_item_matrix.head()

book_id,1,2,3,4,5,6,7,8,9,43,...,9911,9916,9921,9922,9923,9930,9935,9938,9960,9994
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
reduced_user_item_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7205 entries, 7 to 53403
Columns: 1000 entries, 1 to 9994
dtypes: float64(1000)
memory usage: 55.0 MB


In [22]:
non_zero_count = np.count_nonzero(reduced_user_item_matrix.values)

print(f"Count of actual ratings: {non_zero_count}")

Count of actual ratings: 91532


In [23]:
non_zero_count_by_book = (reduced_user_item_matrix != 0).sum(axis=0)
print("Non-zero counts by book:")
print(non_zero_count_by_book)

Non-zero counts by book:
book_id
1       100
2       100
3       100
4       100
5       100
       ... 
9930     35
9935     66
9938     37
9960     32
9994     32
Length: 1000, dtype: int64


### Since the data is very Saprse. Reaplacing the zero value with average rating.

In [24]:
# replace the zero with
# Calculate the mean for each book (column) while ignoring zeros
book_averages = reduced_user_item_matrix.replace(0, np.nan).mean(axis=0)

# Replace zero values with the corresponding book average
# We use apply along with lambda to selectively replace zeros
filled_user_item_matrix = reduced_user_item_matrix.apply(lambda col: col.replace(0, book_averages[col.name]))

# Optionally, if there are books without any ratings (all zeros resulting in NaN averages),
# you might want to fill these NaNs with a global average or another placeholder:
global_average = book_averages.mean()  # Calculate global average from book averages
filled_user_item_matrix.fillna(global_average, inplace=True)

# Check the update
print(filled_user_item_matrix.info())

<class 'pandas.core.frame.DataFrame'>
Index: 7205 entries, 7 to 53403
Columns: 1000 entries, 1 to 9994
dtypes: float64(1000)
memory usage: 55.0 MB
None


In [25]:
filled_user_item_matrix.head()

book_id,1,2,3,4,5,6,7,8,9,43,...,9911,9916,9921,9922,9923,9930,9935,9938,9960,9994
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,4.24,4.21,3.09,4.46,3.89,4.09,3.92,3.83,3.35,4.05,...,3.90566,3.233333,4.214286,3.959184,4.5,3.485714,4.151515,3.864865,3.875,4.375
23,4.24,4.21,3.09,4.46,3.89,4.09,3.92,3.83,3.35,4.05,...,3.90566,3.233333,4.214286,3.959184,4.5,3.485714,4.151515,3.864865,3.875,4.375
35,4.24,4.21,3.09,4.46,3.89,4.09,3.92,3.83,3.35,4.05,...,3.90566,3.233333,4.214286,3.959184,4.5,3.485714,4.151515,3.864865,3.875,4.375
41,4.24,4.21,3.09,4.46,3.89,4.09,3.92,3.83,3.35,4.05,...,3.90566,3.233333,4.214286,3.959184,4.5,3.485714,4.151515,3.864865,3.875,4.375
46,4.24,4.21,3.09,4.46,3.89,4.09,3.92,3.83,3.35,4.05,...,3.90566,3.233333,4.214286,3.959184,4.5,3.485714,4.151515,3.864865,3.875,4.375


### Trainging the model

In [26]:

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# Convert the DataFrame to a sparse matrix format
# This step is optional but can help in handling large datasets more efficiently
matrix = csr_matrix(filled_user_item_matrix.values)

# Initialize the SVD model

n_components = 20
svd_model = TruncatedSVD(n_components=n_components, random_state=42)

# Fit the SVD model to the data
user_features = svd_model.fit_transform(matrix)

# Explained variance can help understand how much of the data variance is captured by the model
explained_variance = svd_model.explained_variance_ratio_.sum()
print(f"Explained Variance by the model: {explained_variance:.2f}")

# Optionally, look at the first few entries of the latent features (user features)
print("Sample of user features matrix:")
print(user_features[:5])


Explained Variance by the model: 0.13
Sample of user features matrix:
[[ 1.21022439e+02  3.57129177e-03 -6.53728023e-03  4.16140161e-03
   1.12754256e-03  5.17599658e-03 -1.22140584e-04 -2.98728526e-03
   8.36633234e-05  5.11877094e-04 -2.90151902e-03  2.62021818e-03
   2.19925297e-03  1.46811552e-03 -1.16909280e-05 -3.79816839e-03
  -4.11753797e-03  3.32965450e-05  1.71012032e-03  1.30628454e-03]
 [ 1.21024871e+02  1.72327470e-03  1.97712491e-03  2.50525071e-03
   1.53662129e-04  1.14520558e-03 -1.34041295e-03 -2.78174730e-04
   2.42234365e-04 -5.66288904e-04  1.07509983e-03  7.91655171e-05
   1.65195813e-03 -1.02857254e-04 -1.04887320e-04 -4.98050617e-04
   5.30990188e-04 -6.71107309e-04 -1.30589737e-05  4.97114813e-04]
 [ 1.19769934e+02 -2.87989097e-01  2.09827770e-01 -5.31491344e-01
   3.58636277e-01 -3.81245090e-01 -8.87831033e-01  6.03329798e-01
   3.72546751e-01  2.51498837e-01  1.59009910e-01 -2.53724374e-01
  -4.63264096e-01  8.25318106e-02 -1.21504739e-01  6.88394113e-01
  -4

### Evaluvation Matrix

* MAE
* RMSE

In [28]:

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Assuming you have the filled_user_item_matrix DataFrame ready
data_matrix = filled_user_item_matrix.values

# Create train and test sets (this is a simplified split without considering time-based splitting)
train_matrix, test_matrix = train_test_split(data_matrix, test_size=0.2, random_state=42)

# Convert train data to sparse matrix for more efficient calculations
train_sparse = csr_matrix(train_matrix)

# Initialize and fit the SVD model on the train data
svd_model = TruncatedSVD(n_components=20, random_state=42)
svd_model.fit(train_sparse)

# Transform the train set and then inverse transform to get the predictions
train_reduced = svd_model.transform(train_sparse)
predicted_train_full = svd_model.inverse_transform(train_reduced)

# Mask to apply only on elements that were originally non-zero in the test set
test_nonzero_mask = (test_matrix != 0)

# Make predictions on the test set
# We need to use the same transformation and inversion process used during training, but correctly map back to the test set dimensions
predicted_test_matrix = np.dot(svd_model.transform(csr_matrix(test_matrix)), svd_model.components_)

# Apply mask to extract the non-zero predicted and actual ratings
predicted_ratings_test = predicted_test_matrix[test_nonzero_mask]
actual_ratings_test = test_matrix[test_nonzero_mask]

# Calculate RMSE and MAE
rmse = np.sqrt(mean_squared_error(actual_ratings_test, predicted_ratings_test))
mae = mean_absolute_error(actual_ratings_test, predicted_ratings_test)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


RMSE: 0.1114
MAE: 0.0192


* RMSE and MAE using Cross Validation

In [31]:
# import numpy as np
# from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
# from scipy.sparse import csr_matrix
# from sklearn.decomposition import TruncatedSVD

# Assuming you have the filled_user_item_matrix DataFrame ready
data_matrix = filled_user_item_matrix.values

# Initialize the KFold cross-validator
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Prepare to collect the metrics
rmses = []
maes = []

# Perform the cross-validation
for train_index, test_index in kf.split(data_matrix):
    # Split data
    train_matrix, test_matrix = data_matrix[train_index], data_matrix[test_index]

    # Convert train data to a sparse matrix
    train_sparse = csr_matrix(train_matrix)

    # Initialize and fit the SVD model
    svd_model = TruncatedSVD(n_components=20, random_state=42)
    svd_model.fit(train_sparse)

    # Transform the train set and get the predictions
    train_reduced = svd_model.transform(train_sparse)
    predicted_train_full = svd_model.inverse_transform(train_reduced)

    # Prepare the test set transformation
    test_sparse = csr_matrix(test_matrix)
    predicted_test_matrix = np.dot(svd_model.transform(test_sparse), svd_model.components_)

    # Mask for non-zero elements in the test set
    test_nonzero_mask = test_matrix != 0

    # Extract the non-zero predicted and actual ratings
    predicted_ratings_test = predicted_test_matrix[test_nonzero_mask]
    actual_ratings_test = test_matrix[test_nonzero_mask]

    # Calculate and collect RMSE and MAE
    rmse = np.sqrt(mean_squared_error(actual_ratings_test, predicted_ratings_test))
    mae = mean_absolute_error(actual_ratings_test, predicted_ratings_test)
    rmses.append(rmse)
    maes.append(mae)

# Calculate the average RMSE and MAE across all folds
average_rmse = np.mean(rmses)
average_mae = np.mean(maes)

print(f"Average RMSE across {num_folds} folds: {average_rmse:.4f}")
print(f"Average MAE across {num_folds} folds: {average_mae:.4f}")


Average RMSE across 5 folds: 0.1063
Average MAE across 5 folds: 0.0179


* Overlapping

In [29]:
# import numpy as np
# from scipy.sparse import csr_matrix
# from sklearn.decomposition import TruncatedSVD

# Assume `data_matrix` is your user-item ratings matrix (filled where 0 was replaced by averages)
data_sparse = csr_matrix(filled_user_item_matrix)
svd_model = TruncatedSVD(n_components=20, random_state=42)
user_features = svd_model.fit_transform(data_sparse)
predicted_ratings = svd_model.inverse_transform(user_features)

# Convert predictions to DataFrame for easier handling
predicted_df = pd.DataFrame(predicted_ratings, index=filled_user_item_matrix.index, columns=filled_user_item_matrix.columns)

def top_k_items(ratings, k=5):
    """Returns the indices of the top k rated items"""
    return set(ratings.nlargest(k).index)

def overlapping(set1, set2):
    """Calculate Jaccard Index as a measure of overlap."""
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union) if union else 0

# Evaluate overlap for each user
jaccard_scores = []
k = 5  # Adjust k based on how many top items you want to consider

for user_id in filled_user_item_matrix.index:
    actual_top_k = top_k_items(filled_user_item_matrix.loc[user_id], k)
    predicted_top_k = top_k_items(predicted_df.loc[user_id], k)
    overlap_score = overlapping(actual_top_k, predicted_top_k)
    jaccard_scores.append(overlap_score)

# Average Jaccard Index across all users
average_jaccard = np.mean(jaccard_scores)
print(f"Average overlapping for top-{k} items: {average_jaccard:.4f}")


Average overlapping for top-5 items: 0.6203


* K-Precision

In [35]:
# import numpy as np
# from sklearn.decomposition import TruncatedSVD
# from scipy.sparse import csr_matrix

# Assuming 'filled_user_item_matrix' is a pandas DataFrame
data_sparse = csr_matrix(filled_user_item_matrix.values)  # Convert DataFrame to sparse matrix
svd_model = TruncatedSVD(n_components=20, random_state=42)
user_features = svd_model.fit_transform(data_sparse)
predicted_ratings = svd_model.inverse_transform(user_features)

def get_top_k_predictions(predictions, k):
    """Returns the indices of the top k predicted ratings for each user."""
    top_k_preds = np.argsort(-predictions, axis=1)[:, :k]
    return top_k_preds

def precision_at_k(actual_ratings, predicted_top_k, k):
    """Calculate precision at k for each user."""
    hits = 0
    total = 0

    # Iterate over each user
    for idx, top_k_items in enumerate(predicted_top_k):
        # Actual items the user has rated 4 or above (change threshold as needed)
        actual_high_rated_items = np.where(actual_ratings[idx] >= 4)[0]  # Threshold for high ratings
        # Check if top-k predicted items are in the user's high rated items
        hits += len(set(top_k_items).intersection(set(actual_high_rated_items)))
        total += k

    return hits / total if total else 0

# Apply the functions
k = 5
predicted_top_k = get_top_k_predictions(predicted_ratings, k)
actual_ratings = filled_user_item_matrix.values  # Convert DataFrame to numpy array for indexing
k_precision = precision_at_k(actual_ratings, predicted_top_k, k)

print(f"Precision at {k}: {k_precision:.4f}")


Precision at 5: 0.9991
