In [1]:
import numpy as np
import pandas as pd

In [3]:
books = pd.read_csv('Books.csv', low_memory=False)


In [5]:
books = books.drop('Image-URL-S', axis=1, errors='ignore')
books = books.drop('Image-URL-M', axis=1, errors='ignore')

books = books.drop('Image-URL-L', axis=1, errors='ignore')

books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [7]:
users = pd.read_csv('Users.csv', low_memory=False)
ratings = pd.read_csv('Ratings.csv', low_memory=False)


In [9]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [11]:
print(ratings.shape)
print(users.shape)
print(books.shape)


(1149780, 3)
(278858, 3)
(271360, 5)


In [13]:
ratings['User-ID'].value_counts()

User-ID
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

In [15]:
x=ratings['User-ID'].value_counts()>200

In [17]:
x[x].shape

(899,)

In [19]:
y=x[x].index

In [21]:
ratings=ratings[ratings['User-ID'].isin(y)]

In [23]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [25]:
ratings.shape

(526356, 3)

In [27]:
ratings_books=ratings.merge(books,on="ISBN")

In [29]:
num_ratings=ratings_books.groupby('Book-Title')['Book-Rating'].count().reset_index()

In [45]:
final_rating=ratings_books.merge(num_ratings,on='Book-Title')

In [47]:
final_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating_x,Book-Title,Book-Author,Year-Of-Publication,Publisher,Book-Rating_y
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,7
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,1
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,1
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,13


In [49]:
final_rating.rename(columns={'Book-Rating_y':'num_of_ratings'},inplace=True)

In [51]:
final_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating_x,Book-Title,Book-Author,Year-Of-Publication,Publisher,num_of_ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,7
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,1
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,1
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,13


In [53]:
final_rating.drop_duplicates(['User-ID','Book-Title'],inplace=True)

In [55]:
final_rating.shape

(483423, 8)

In [59]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
import numpy as np

data = final_rating[['User-ID', 'Book-Title', 'Book-Rating_x']]
reader = Reader(rating_scale=(data['Book-Rating_x'].min(), data['Book-Rating_x'].max()))
surprise_data = Dataset.load_from_df(data[['User-ID', 'Book-Title', 'Book-Rating_x']], reader)

trainset, testset = train_test_split(surprise_data, test_size=0.2)
from surprise.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_factors': [50, 100, 150],
    'reg_all': [0.02, 0.05, 0.1],
    'lr_all': [0.002, 0.005, 0.01]
}

# Perform GridSearchCV
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(surprise_data)

# Best parameters
best_params = grid_search.best_params['rmse']
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
svd = SVD(**best_params)
svd.fit(trainset)


# # Step 3: Train the SVD model
# svd = SVD()
# svd.fit(trainset)

# Step 4: Generate predictions for testing data
all_predictions = []
for uid, iid, true_r in testset:
    pred = svd.predict(uid, iid).est
    all_predictions.append((uid, iid, true_r, pred))

# Step 5: Prepare data for ranking metrics
predictions_df = pd.DataFrame(all_predictions, columns=['User-ID', 'Book-Title', 'True-Rating', 'Predicted-Rating'])

# Sort by user and predicted rating
predictions_df = predictions_df.sort_values(by=['User-ID', 'Predicted-Rating'], ascending=[True, False])

# Step 6: Define evaluation metrics
def mean_average_precision(df, k=10):
    """Calculate MAP@k."""
    users = df['User-ID'].unique()
    map_score = 0
    for user in users:
        user_data = df[df['User-ID'] == user].head(k)
        rel_items = (user_data['True-Rating'] > 0).astype(int)  # Relevant if True-Rating > 0
        precision_at_k = (np.cumsum(rel_items) / (np.arange(len(rel_items)) + 1)).mean()
        map_score += precision_at_k
    return map_score / len(users)

def ndcg(df, k=10):
    """Calculate NDCG@k."""
    users = df['User-ID'].unique()
    ndcg_score = 0
    for user in users:
        user_data = df[df['User-ID'] == user].head(k)
        rel_items = (user_data['True-Rating'] > 0).astype(int)
        dcg = np.sum(rel_items / np.log2(np.arange(2, len(rel_items) + 2)))
        idcg = np.sum(sorted(rel_items, reverse=True) / np.log2(np.arange(2, len(rel_items) + 2)))
        ndcg_score += (dcg / idcg) if idcg > 0 else 0
    return ndcg_score / len(users)

def mrr(df, k=10):
    """Calculate MRR@k."""
    users = df['User-ID'].unique()
    mrr_score = 0
    for user in users:
        user_data = df[df['User-ID'] == user].head(k)
        rel_items = (user_data['True-Rating'] > 0).astype(int)
        ranks = np.where(rel_items == 1)[0]
        mrr_score += 1 / (ranks[0] + 1) if len(ranks) > 0 else 0
    return mrr_score / len(users)

# Step 7: Compute and print metrics
map_score = mean_average_precision(predictions_df, k=10)
ndcg_score = ndcg(predictions_df, k=10)
mrr_score = mrr(predictions_df, k=10)

print(f"MAP@10: {map_score}")
print(f"NDCG@10: {ndcg_score}")
print(f"MRR@10: {mrr_score}")


Best Parameters: {'n_factors': 50, 'reg_all': 0.1, 'lr_all': 0.002}
MAP@10: 0.3436029561064325
NDCG@10: 0.6073587048690595
MRR@10: 0.5384121687236254
