# Book-Oracle: Basic Recommendation System

- Develop a basic Recommendation System
- 26.11.2023
- Janina, Oliwia, Neha, Nina

## Import Libraries

In [47]:
import pandas as pd
import numpy as np

#Modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix, hstack
from sklearn.neighbors import NearestNeighbors

#NLP
import nltk

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.float_format = "{:,.2f}".format

RSEED = 42

import warnings
warnings.filterwarnings('ignore')

## Import Data

In [48]:
df = pd.read_csv('data/kaggle_full_df.csv')
df['country'].fillna('unknown', inplace=True)
df.shape

(1005487, 12)

In [None]:
df.info()

In [49]:
#Only Rating greater than 4
df = df[df['book_rating']>4]

#Only users from US or Canada
df = df[df['country'].str.contains("usa|canada")]

df.shape

(290837, 12)

In [51]:
df.head(5)


Unnamed: 0,book_title,book_author,year_of_publication,publisher,image_url_m,common_identifier,user_id,isbn,book_rating,age,city,country
1,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,1,269782,801319536,7,30,edmonton,canada
2,Pay It Forward: A Novel,Catherine Ryan Hyde,2000,Simon &amp; Schuster,http://images.amazon.com/images/P/0684862719.0...,2392,269782,684862719,8,30,edmonton,canada
3,Watership Down,Richard Adams,1976,Avon,http://images.amazon.com/images/P/0380002930.0...,3172,269782,140039589,10,30,edmonton,canada
5,Writing The Circle: Native Women Of Western Ca...,Jeanne Perreault,1990,Lpg Distribution,http://images.amazon.com/images/P/0920897886.0...,95231,269782,920897886,10,30,edmonton,canada
6,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,2,8,2005018,5,0,timmins,canada


In [52]:
#Add a new column with a total rating count for each book by common identifier
df['rating_count'] = df.groupby(['book_title', 'book_author'])['book_rating'].transform('count')

#Show a list of books that got the highest rating count, group by title and author to show unique books

df.groupby(['book_title', 'book_author', 'rating_count']).size().reset_index(name='Count').sort_values(by='rating_count', ascending=False).head(5)

Unnamed: 0,book_title,book_author,rating_count,Count
83609,The Lovely Bones: A Novel,Alice Sebold,601,601
77011,The Da Vinci Code,Dan Brown,411,411
88787,The Secret Life Of Bees,Sue Monk Kidd,379,379
87707,The Red Tent (Bestselling Backlist),Anita Diamant,341,341
85091,The Nanny Diaries: A Novel,Emma Mclaughlin,330,330


In [53]:
popularity_threshold = 50
df = df[df['rating_count'] >= popularity_threshold]
df.shape

(49104, 13)

<span style="color: White;">Collaborative Filtering - Item based:</span>

In [None]:
!pip install scikit-surprise

- Library used - Surprise

- Model - matrix factorization SVD

- Recommend top 5 books for a user. (Here, user_id is to be given as input)

- Identify books the user hasn't interacted with.
- Make predictions for these books.
- Sort predictions by estimated rating.
- Extract the top N recommendations and return their titles.

## Analyse: ##
1. Understand the code

2. What does RMSE mean in our model and why is it used?
--> measure errors between predicted and actual values
--> RSME is widely used in collaborative filtering 
--> Lower the RSME, better is the Model

3. How is the train/test split done?
--> train/test split is done on the "data"
--> Total number of rows (Number of Ratings) in "data" object on which train/test split is done  :  1005487
--> Number of unique users: 90976
--> Number of items (common_identifiers): 245238
 

4. Try the same with different libraries & include implicit rating as well 
    1. hybrid model from LightFM library 
    2. Alternating Least Squares(ALS) model from implicit's library

#### <span style="color: green;"> Recommend top 5 books to User based on the books NOT interacted with </span>

- Model used - Matrix Factorization SVD
- Library used - Surprise

## Modelling & Evaluation

In [54]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy


In [55]:

# Load data into the Surprise library's format
reader = Reader(rating_scale=(4, 10))
#reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader)


In [56]:

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build and train the matrix factorization model
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2a0ee0610>

In [57]:

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model's accuracy
accuracy.rmse(predictions)


RMSE: 1.3888


1.3887876041957476

In [58]:

# Function to get book recommendations for a user
def get_book_recommendations(user_id, df, model, n=5):
    # Get the unique books the user hasn't interacted with
    books_not_interacted = df[~df['common_identifier'].isin(df[df['user_id'] == user_id]['common_identifier'].tolist())]['common_identifier'].unique()
    #books_not_interacted = df[~df['common_identifier'].isin(df[df['user_id'] == user_id]['common_identifier'].tolist())]['common_identifier'].unique()

    # Make predictions for the books the user hasn't interacted with
    predictions = [model.predict(user_id, book) for book in books_not_interacted]

    # Sort predictions by estimated rating in descending order
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

    # Get the top N recommendations
    top_n_recommendations = sorted_predictions[:n]

    # Extract book titles from recommendations
    recommended_books = [df[df['common_identifier'] == prediction.iid]['book_title'].values[0] for prediction in top_n_recommendations]

    return recommended_books



In [62]:
# Call f(get_book_recommendations) for recommendatios
user_id = 270713
recommendations = get_book_recommendations(user_id, df, model, n=5)
print(f"Top 5 recommendations for user {user_id}: are as below")
for i, title in enumerate(recommendations[:5], start=1):
    print(f"{i}. {title}")

Top 5 recommendations for user 270713: are as below
1. The Stand: Complete And Uncut
2. The Return Of The King (The Lord Of The Rings, Part 3)
3. Lonesome Dove
4. Where The Red Fern Grows
5. The Fellowship Of The Ring (The Lord Of The Rings, Part 1)


In [68]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy


# Load data into the Surprise library's format
reader = Reader(rating_scale=(1, 10))
#reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader)


# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Build and train the matrix factorization model
model = SVD()
model.fit(trainset)


# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model's accuracy
accuracy.rmse(predictions)


# Function to get book recommendations for a user
def get_book_recommendations(user_id, df, model, n=5):
    # Get the unique books the user hasn't interacted with
    books_not_interacted = df[~df['common_identifier'].isin(df[df['user_id'] == user_id]['common_identifier'].tolist())]['common_identifier'].unique()
    #books_not_interacted = df[~df['common_identifier'].isin(df[df['user_id'] == user_id]['common_identifier'].tolist())]['common_identifier'].unique()

    # Make predictions for the books the user hasn't interacted with
    predictions = [model.predict(user_id, book) for book in books_not_interacted]

    # Sort predictions by estimated rating in descending order
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

    # Get the top N recommendations
    top_n_recommendations = sorted_predictions[:n]

    # Extract book titles from recommendations
    recommended_books = [df[df['common_identifier'] == prediction.iid]['book_title'].values[0] for prediction in top_n_recommendations]

    return recommended_books


# Call f(get_book_recommendations) for recommendatios
user_id = 270713
recommendations = get_book_recommendations(user_id, df, model, n=5)
print(f"Top 5 recommendations for user {user_id}: are as below")
for i, title in enumerate(recommendations[:5], start=1):
    print(f"{i}. {title}")


RMSE: 1.3894
Top 5 recommendations for user 270713: are as below
1. Dune (Remembering Tomorrow)
2. The Return Of The King (The Lord Of The Rings, Part 3)
3. The Stand: Complete And Uncut
4. Fast Food Nation: The Dark Side Of The All-American Meal
5. Charlotte'S Web (Trophy Newbery)


-  <span style="color: pink;"> Accessing values from "data" object used in above code 
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader) </span>

In [None]:
#Output Format ----->   'user_id', 'common_identifier', 'book_rating']

# Get the full training set from the data object
full_trainset = data.build_full_trainset()

# Convert the generator to a list and access the first few raw ratings
raw_ratings = list(full_trainset.all_ratings())[:3]

# Get the number of ratings (number of rows in the training set)
num_ratings = full_trainset.n_ratings


# Display the first few raw ratings
print(raw_ratings)

# Display the number of ratings
print("Number of Ratings:", num_ratings)


In [None]:
#determine the number of unique users and items (common_identifiers) 

# Get the full training set from the data object
full_trainset = data.build_full_trainset()

# Get the number of users and items (common_identifiers)
num_users = full_trainset.n_users
num_items = full_trainset.n_items

# Display the number of users and items
print("Number of unique users:", num_users)
print("Number of items (common_identifiers):", num_items)


In [None]:
!pip install lightfm

In [None]:
!pip install implicit

In [None]:
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k

# Create a LightFM dataset
dataset = Dataset()
dataset.fit(users=df['user_id'], items=df['common_identifier'])
(interactions, _) = dataset.build_interactions(((row['user_id'], row['common_identifier']) for index, row in df.iterrows()))

# Split the data into train and test sets
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

# Build the model
model = LightFM(loss='warp')  # You can try different loss functions (e.g., 'warp', 'logistic', 'bpr')

# Train the model
model.fit(train, epochs=30, num_threads=2)

# Evaluate the model
precision = precision_at_k(model, test, k=20).mean()
print(f"Precision at k=20: {precision}")


precision at k=5,The higher the precision, the better the model is at suggesting relevant items within the top-k recommendations.

#### <span style="color: green;">  LightFM model to get  top 5 recommendation of books for a user</span>

**<u><span style="color: red;"> Issue:</span></u>**

Recommends same book 5 times instead of 5 different books

In [None]:
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k
import numpy as np

# Create a LightFM dataset
dataset = Dataset()
dataset.fit(users=df['user_id'], items=df['common_identifier'])
#build user-item interactions where each row represents a user, each column represents an item
(interactions, _) = dataset.build_interactions(((row['user_id'], row['common_identifier']) for index, row in df.iterrows()))

# Split the data into train and test sets
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

# Build the model
model = LightFM(loss='warp')  # You can try different loss functions (e.g., 'warp', 'logistic', 'bpr')

# Train the model
model.fit(train, epochs=30, num_threads=2)

# Recommend top 5 books for a random user
num_users, num_items = interactions.shape

# Generate a random user ID
random_user_id = np.random.randint(0, num_users)

# Use the predict method to get scores for all items for the random user
scores = model.predict(random_user_id, np.arange(num_items))

# Get the indices of the top 5 items
top_item_indices = sorted(range(num_items), key=lambda x: scores[x], reverse=True)[:5]

# Map item indices to book titles
top_item_titles = df.loc[df['common_identifier'].isin(top_item_indices), 'book_title'].tolist()

# Print or use the top 5 book titles as recommendations for the random user
#print(f"Top 5 recommendations for user {random_user_id}: {top_item_titles}")

print(f"Top 5 recommendations for user {random_user_id}: are as below")
for i, title in enumerate(top_item_titles[:5], start=1):
    print(f"{i}. {title}")


####  <span style="color: green;"> Recommend books popular yearly </span>

In [None]:
df['year_of_publication'].unique()

In [None]:
# Group by year and calculate the average rating and number of ratings for each book
yearly_stats = df.groupby(['year_of_publication', 'isbn']).agg({'book_rating': ['mean', 'count']}).reset_index()
yearly_stats.columns = ['year', 'isbn', 'avg_rating', 'num_ratings']

# Sort books within each year by the number of ratings and average rating
yearly_stats = yearly_stats.sort_values(['year', 'num_ratings', 'avg_rating'], ascending=[True, False, False])

# Function to recommend popular books for a given year
def recommend_popular_books(year, top_n=5):
    year_books = yearly_stats[yearly_stats['year'] == year].head(top_n)
    recommended_books = df[df['isbn'].isin(year_books['isbn'])][['book_title', 'book_author']].drop_duplicates()
    return recommended_books

# Example usage
year_to_recommend = 2004  # Replace with the desired year
top_n_recommendations = 5  # Adjust the number of recommendations as needed

print(f"Popular Books in {year_to_recommend}:\n")
popular_books = recommend_popular_books(year_to_recommend, top_n=top_n_recommendations)
print(popular_books)


#### <span style="color: green;">  Recommend books based on Author</span>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Assuming 'df' is your input DataFrame
# Columns: ['book_title', 'book_author', 'year_of_publication', 'publisher', 'image_url_m', 'common_identifier', 'user_id', 'isbn', 'book_rating', 'age', 'city', 'country', 'user', 'item']

# Create a TF-IDF vectorizer for book authors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['book_author'])

# Create a DataFrame to store the mapping between book_author and index in tfidf_matrix
author_mapping = pd.DataFrame({'book_author': df['book_author'].unique(), 'index': range(len(df['book_author'].unique()))})

# Function to recommend books by the same author
def recommend_books_by_author(target_author, df=df, tfidf_matrix=tfidf_matrix, author_mapping=author_mapping):
    # Filter books by the target author
    author_books = df[df['book_author'] == target_author]['book_title'].unique()

    # Get the index of the target author in the mapping
    target_author_index = author_mapping[author_mapping['book_author'] == target_author]['index'].iloc[0]

    # Calculate the similarity between books by the target author and all other books
    target_author_tfidf = tfidf_matrix.getrow(target_author_index)
    similarity_scores = linear_kernel(target_author_tfidf, tfidf_matrix).flatten()

    # Sort books by similarity score in descending order
    recommended_books = pd.DataFrame({'book_title': df['book_title'], 'similarity_score': similarity_scores})
    recommended_books = recommended_books.sort_values(by='similarity_score', ascending=False)

    return recommended_books.head(5)  # Return top 5 recommendations




**<u><span style="color: red;"> Issue:</span></u>**

Some books repeated

In [None]:
# Calling
target_author_to_recommend = df['book_author'].sample().iloc[0]  # Randomly select a book author for recommendation

print(f"Recommendations for books by Author {target_author_to_recommend}:\n")
book_recommendations = recommend_books_by_author(target_author_to_recommend)
print(book_recommendations)

#### <span style="color: green;"> Recommend book based on book title </span>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Create a TF-IDF vectorizer for book titles
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['book_title'])

# Create a DataFrame to store the mapping between book_title and index in tfidf_matrix
title_mapping = pd.DataFrame({'book_title': df['book_title'].unique(), 'index': range(len(df['book_title'].unique()))})

# Function to recommend books by the same title
def recommend_books_by_title(target_title, df=df, tfidf_matrix=tfidf_matrix, title_mapping=title_mapping):
    # Filter books by the target title
    title_books = df[df['book_title'] == target_title]['book_title'].unique()

    # Get the index of the target title in the mapping
    target_title_index = title_mapping[title_mapping['book_title'] == target_title]['index'].iloc[0]

    # Calculate the similarity between books with the target title and all other books
    target_title_tfidf = tfidf_matrix.getrow(target_title_index)
    similarity_scores = linear_kernel(target_title_tfidf, tfidf_matrix).flatten()

    # Sort books by similarity score in descending order
    recommended_books = pd.DataFrame({'book_title': df['book_title'], 'similarity_score': similarity_scores})
    
    recommended_books = recommended_books.sort_values(by='similarity_score', ascending=False)

    return recommended_books.head(5)  # Return top 5 recommendations




**<u><span style="color: red;"> Issue:</span></u>**

Books repeated 

In [None]:
# Calling function
target_title_to_recommend = df['book_title'].sample().iloc[0]  # Randomly select a book title for recommendation

print(f"Recommendations based on Book Title : {target_title_to_recommend}:\n")
book_recommendations = recommend_books_by_title(target_title_to_recommend)
print(book_recommendations)

## Error Analysis

- Model Evaluation: using other metrics than RSME

- Fraction of Concordant Pairs(fcp)
FCP is a ranking-oriented metric that assesses the proportion of concordant pairs (i.e., pairs of user-item interactions where the predicted ranking order matches the actual ranking order).

NOTE: Lower values for MAE and MSE indicate better accuracy, while higher values for FCP indicate better performance.


In [60]:
# Evaluate the model using additional metrics
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.fcp(predictions)

MAE:  1.1191
MSE: 1.9287
FCP:  0.5566


0.556563864282585

- Error Analysis: Checking predictions versus actual ratings for few users

In [None]:
# Print some example predictions vs. actual ratings
for prediction in predictions[:10]:
    print(f"User: {prediction.uid}, Book: {prediction.iid}, Predicted: {prediction.est}, Actual: {prediction.r_ui}")


- Model Optimization: hyperparameter tuning to improve model performance. 
- Used grid search to find optimal hyperparameters

In [None]:
from surprise.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'n_factors': [50, 100, 150], 'reg_all': [0.02, 0.05, 0.1]}

# Create a grid search object
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

# Fit the grid search object on the data
grid_search.fit(data)

# Get the best hyperparameters
best_params = grid_search.best_params['rmse']

# Create a new SVD model with the best hyperparameters
best_model = SVD(n_factors=best_params['n_factors'], reg_all=best_params['reg_all'])

# Fit the best model on the training set
best_model.fit(trainset)

# Make predictions on the test set
best_predictions = best_model.test(testset)

# Evaluate the best model
print("Best hyperparameters:", best_params)
accuracy.rmse(best_predictions)


- Error Analysis Report: actual and predicted book titles, and it indicates whether the prediction is considered accurate based on a threshold difference (in this case, ±2). 

In [None]:
from surprise.model_selection import GridSearchCV
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

# Load data into the Surprise library's format
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {'n_factors': [50, 100, 150], 'reg_all': [0.02, 0.05, 0.1]}

# Create a grid search object
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

# Fit the grid search object on the data
grid_search.fit(data)

# Get the best hyperparameters
best_params = grid_search.best_params['rmse']

# Create a new SVD model with the best hyperparameters
best_model = SVD(n_factors=best_params['n_factors'], reg_all=best_params['reg_all'])

# Fit the best model on the training set
best_model.fit(trainset)

# Make predictions on the test set
best_predictions = best_model.test(testset)

# Evaluate the best model
print("Best hyperparameters:", best_params)
accuracy.rmse(best_predictions)

# Error Analysis Report
print("\nError Analysis Report:")
for i, prediction in enumerate(best_predictions[:10], 1):
    book_title_actual = df[df['common_identifier'] == prediction.iid]['book_title'].values[0]
    book_title_predicted = df[df['common_identifier'] == prediction.iid]['book_title'].values[0]
    
    # Check if the prediction is accurate (within a threshold, e.g., ±1)
    is_accurate = abs(prediction.est - prediction.r_ui) <= 2
    
    print(f"\nPrediction {i}:")
    print(f"User ID: {prediction.uid}")
    print(f"Actual Book Title: {book_title_actual}")
    print(f"Predicted Book Title: {book_title_predicted}")
    print(f"Predicted Rating: {prediction.est:.2f}")
    print(f"Actual Rating: {prediction.r_ui}")
    print(f"Is Accurate: {is_accurate}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Transform book titles into numerical vectors
title_vectors = vectorizer.fit_transform(df['book_title'])

# Initialize NearestNeighbors
nn_model = NearestNeighbors(n_neighbors=4)  # n_neighbors=4 to include the input book itself

# Fit the model using transformed vectors
nn_model.fit(title_vectors)

def recommend_books(input_book_title):
    # Transform the input book title into a numerical vector
    input_vector = vectorizer.transform([input_book_title])

    # Get the indices of the nearest neighbors (including the input book itself)
    _, neighbor_indices = nn_model.kneighbors(input_vector)

    # Extract recommended book indices excluding the input book itself
    neighbor_indices = neighbor_indices[0][1:]

    # Get the recommended books based on the indices
    recommended_books = df.iloc[neighbor_indices][['image_url_m', 'book_title', 'book_author']]

    return recommended_books

# Example usage
input_title = 'Life of Pi'
recommendations = recommend_books(input_title)
print(f"Recommendations for '{input_title}':")
print(recommendations[['image_url_m','book_title', 'book_author']])
