# Book-Oracle: Basic Recommendation System

- Develop a basic Recommendation System
- 26.11.2023
- Janina, Oliwia, Neha, Nina

## Import Libraries

In [22]:
import pandas as pd
import numpy as np
import pickle
#Modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix, hstack
from sklearn.neighbors import NearestNeighbors

#NLP
import nltk

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
# Plot defaults: 8x5 figures on a white background with black axes edges.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.facecolor": "w",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
})
pd.plotting.register_matplotlib_converters()
# Show floats with a thousands separator and two decimals. Only one float formatter can be
# active at a time, so it is set exactly once here.
pd.set_option('display.float_format', "{:,.2f}".format)

RSEED = 42

import warnings
warnings.filterwarnings('ignore')

## Import Data

In [None]:
# Load the ratings table (path is relative to the notebook's working directory).
df = pd.read_csv('data/kaggle_full_df.csv')
# Assign the filled column back instead of calling fillna(inplace=True) on df['country']:
# the in-place call operates on an intermediate object and no longer updates `df`
# once pandas copy-on-write is active.
df['country'] = df['country'].fillna('unknown')
df.shape

In [None]:
df.info()

In [None]:
#Only ratings greater than 0 (presumably 0 marks a missing/implicit rating -- TODO confirm)
df = df[df['book_rating']>0]

#Only users from US or Canada
#NOTE: str.contains interprets the pattern as a regular expression, so this keeps every
#country string that contains "usa" or "canada" anywhere in it
df = df[df['country'].str.contains("usa|canada")]

df.shape

In [None]:
#Add a new column with the total number of ratings per book, where a book is identified by (title, author)
df['rating_count'] = df.groupby(['book_title', 'book_author'])['book_rating'].transform('count')

#Show the books with the highest rating count; grouping by title, author and rating_count
#collapses the per-rating rows to one row per book ('Count' is the size of each group)

df.groupby(['book_title', 'book_author', 'rating_count']).size().reset_index(name='Count').sort_values(by='rating_count', ascending=False).head(5)

In [None]:
# Keep only books that received at least `popularity_threshold` ratings
popularity_threshold = 50
df = df[df['rating_count'] >= popularity_threshold]
df.shape

### Collaborative Filtering – model based (matrix factorization)

In [None]:
!pip install scikit-surprise

- Library used - Surprise

- Model - matrix factorization SVD

- Recommend top 5 books for a user. (Here, user_id is to be given as input)

- Identify books the user hasn't interacted with.
- Make predictions for these books.
- Sort predictions by estimated rating.
- Extract the top N recommendations and return their titles.

#### <span style="color: green;"> Recommend top 5 books to User based on the books NOT interacted with </span>

- Model used - Matrix Factorization SVD
- Library used - Surprise

## Modelling & Evaluation

In [None]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy


In [None]:

# Load data into the Surprise library's format
# rating_scale gives the lowest and highest rating value passed to Surprise
reader = Reader(rating_scale=(1, 10))
# Columns are passed in the order: user id, item id, rating
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader)


In [None]:

# Split the data into training and testing sets (seeded so the split is reproducible)
trainset, testset = train_test_split(data, test_size=0.2, random_state=RSEED)

# Build and train the matrix factorization model.
# SVD starts from randomly initialised factors; seeding it keeps the reported metrics
# and the resulting recommendations stable across kernel restarts.
model = SVD(random_state=RSEED)
model.fit(trainset)


In [None]:

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model's accuracy (RMSE between predicted and actual ratings; lower is better)
accuracy.rmse(predictions)


In [None]:
# Function to get book recommendations for a user
def get_book_recommendations(user_id, df, model, n=5):
    """Return the ``n`` books with the highest predicted rating that the user has no row for in ``df``.

    Parameters
    ----------
    user_id
        Id of the user, as it appears in ``df['user_id']``.
    df : pandas.DataFrame
        Ratings table with at least the columns ``user_id``, ``common_identifier``,
        ``book_title``, ``book_author``, ``book_rating`` and ``image_url_m``.
    model
        Any object with ``predict(user_id, book_id)`` returning a prediction that
        exposes ``.iid`` (the book id) and ``.est`` (the estimated rating).
    n : int, default 5
        Maximum number of recommendations; values <= 0 give an empty list.

    Returns
    -------
    list of dict
        One dict per recommended book, best first. Each dict holds the five book
        columns taken from the first row of ``df`` for that book (so ``book_rating``
        is a single user's stored rating) plus ``predicted_rating``, the model's
        estimate for ``user_id``.
    """
    import heapq

    if n <= 0:
        return []

    info_columns = ['common_identifier', 'book_title', 'book_author', 'book_rating', 'image_url_m']

    # Books the user already has a row for, and every distinct book in order of first appearance
    interacted = df.loc[df['user_id'] == user_id, 'common_identifier']
    all_books = df['common_identifier'].drop_duplicates()
    books_not_interacted = all_books[~all_books.isin(interacted)]

    # Candidates are already unique, so picking the n largest estimates is enough;
    # nlargest resolves ties the same way as a stable descending sort.
    candidate_predictions = (model.predict(user_id, book) for book in books_not_interacted)
    top_predictions = heapq.nlargest(n, candidate_predictions, key=lambda p: p.est)

    top_n_recommendations = []
    for prediction in top_predictions:
        book_rows = df.loc[df['common_identifier'] == prediction.iid, info_columns]
        book_info = book_rows.head(1).to_dict(orient='records')[0]
        book_info['predicted_rating'] = prediction.est
        top_n_recommendations.append(book_info)

    return top_n_recommendations


In [None]:
# Assuming df is your DataFrame and model is your collaborative filtering model

# Example usage
user_id = 31315
recommendations = get_book_recommendations(user_id, df, model, n=5)

# Print or use the recommendations as needed
# NOTE: 'book_rating' is the rating stored in the first df row found for the book
# (see get_book_recommendations), i.e. one user's rating rather than the model's estimate.
for i, book_info in enumerate(recommendations, start=1):
    print(f"{i}. Common Identifier: {book_info['common_identifier']}")
    print(f"   Book Title: {book_info['book_title']}")
    print(f"   Book Author: {book_info['book_author']}")
    print(f"   Book Rating: {book_info['book_rating']}")
    print(f"   Image URL (Medium): {book_info['image_url_m']}")
    print("---")


In [None]:
# List the distinct user ids found in the first 2000 rows (handy as example inputs)
print(df['user_id'].head(2000).unique())

-  <span style="color: pink;"> Accessing values from "data" object used in above code 
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader) </span>

In [None]:
# Each rating is a tuple in the order (user, item, rating), matching the columns
# ['user_id', 'common_identifier', 'book_rating'] passed to load_from_df.

# Get the full training set from the data object
full_trainset = data.build_full_trainset()

# Convert the generator to a list and keep the first three ratings
# NOTE(review): Surprise's all_ratings() yields *inner* ids, not the raw ids from df,
# so these tuples are not directly comparable to df rows -- confirm against the Surprise docs.
raw_ratings = list(full_trainset.all_ratings())[:3]

# Get the number of ratings (number of rows in the training set)
num_ratings = full_trainset.n_ratings


# Display the first few ratings
print(raw_ratings)

# Display the number of ratings
print("Number of Ratings:", num_ratings)


In [None]:
# Determine the number of unique users and items (common_identifiers)

# Get the full training set from the data object
# (builds the trainset again from `data`, so this cell does not depend on other trainset variables)
full_trainset = data.build_full_trainset()

# Get the number of users and items (common_identifiers)
num_users = full_trainset.n_users
num_items = full_trainset.n_items

# Display the number of users and items
print("Number of unique users:", num_users)
print("Number of items (common_identifiers):", num_items)


In [None]:
!pip install lightfm

In [None]:
!pip install implicit

In [None]:
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k

# Create a LightFM dataset and register every user id / item id it will encounter
dataset = Dataset()
dataset.fit(users=df['user_id'], items=df['common_identifier'])
# Feed plain (user, item) pairs; zipping the two columns avoids building a Series per row as iterrows() does
(interactions, _) = dataset.build_interactions(zip(df['user_id'], df['common_identifier']))

# Split the data into train and test sets.
# A RandomState instance is passed because that is the type the split shuffles with.
train, test = random_train_test_split(
    interactions, test_percentage=0.2, random_state=np.random.RandomState(RSEED)
)

# Build the model under its own name so the Surprise SVD stored in `model` is not overwritten.
# You can try different loss functions (e.g., 'warp', 'logistic', 'bpr').
lightfm_model = LightFM(loss='warp', random_state=RSEED)

# Train the model
# NOTE(review): with num_threads > 1 the fit may still vary slightly between runs despite the seed -- confirm.
lightfm_model.fit(train, epochs=30, num_threads=2)

# Evaluate the model: mean over users of the share of test items found in each user's top 20
precision = precision_at_k(lightfm_model, test, k=20).mean()
print(f"Precision at k=20: {precision}")


Precision at k (k=20 in the cell above): the higher the precision, the better the model is at suggesting relevant items within the top-k recommendations.

#### <span style="color: green;">  Recommend books based on Author</span>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Assuming 'df' is your input DataFrame
# Columns: ['book_title', 'book_author', 'year_of_publication', 'publisher', 'image_url_m', 'common_identifier', 'user_id', 'isbn', 'book_rating', 'age', 'city', 'country', 'user', 'item']

# Create a TF-IDF vectorizer for book authors.
# The vectorizer is fitted on the *unique* authors, so row i of tfidf_matrix is the vector
# of the author whose 'index' in author_mapping is i.
unique_authors = df['book_author'].dropna().unique()
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(unique_authors)

# Create a DataFrame to store the mapping between book_author and row index in tfidf_matrix
author_mapping = pd.DataFrame({'book_author': unique_authors, 'index': range(len(unique_authors))})

# Function to recommend books by the same (or a similarly named) author
def recommend_books_by_author(target_author, df=df, tfidf_matrix=tfidf_matrix, author_mapping=author_mapping):
    """Return up to five distinct book titles ranked by author-name similarity to ``target_author``.

    Parameters
    ----------
    target_author : str
        Author name exactly as stored in ``author_mapping['book_author']``.
    df : pandas.DataFrame
        Table with ``book_title`` and ``book_author`` columns.
    tfidf_matrix : sparse matrix
        TF-IDF vectors with one row per author, aligned with ``author_mapping['index']``.
    author_mapping : pandas.DataFrame
        Columns ``book_author`` (unique) and ``index`` (row of that author in ``tfidf_matrix``).

    Returns
    -------
    pandas.DataFrame
        Columns ``book_title`` and ``similarity_score``, best match first, at most five
        rows, each title at most once. Empty when ``target_author`` is not in
        ``author_mapping``.
    """
    # Get the row index of the target author in the TF-IDF matrix
    matches = author_mapping.loc[author_mapping['book_author'] == target_author, 'index']
    if matches.empty:
        return pd.DataFrame({
            'book_title': pd.Series(dtype='object'),
            'similarity_score': pd.Series(dtype='float64'),
        })

    # Similarity between the target author and every author in the matrix
    target_author_tfidf = tfidf_matrix.getrow(int(matches.iloc[0]))
    author_scores = linear_kernel(target_author_tfidf, tfidf_matrix).ravel()
    score_by_author = pd.Series(
        author_scores[author_mapping['index'].to_numpy()],
        index=author_mapping['book_author'].to_numpy(),
    )

    # Score each distinct (title, author) pair through its author. df has one row per
    # rating, so it is de-duplicated first to keep one book from filling the whole list.
    books = df[['book_title', 'book_author']].drop_duplicates()
    recommended_books = pd.DataFrame({
        'book_title': books['book_title'],
        'similarity_score': books['book_author'].map(score_by_author),
    })

    # Sort by similarity (stable, so ties keep df order) and keep the best-scoring row per title
    recommended_books = (
        recommended_books
        .dropna(subset=['similarity_score'])
        .sort_values(by='similarity_score', ascending=False, kind='stable')
        .drop_duplicates(subset='book_title')
    )

    return recommended_books.head(5)  # Return top 5 recommendations




In [None]:
# Calling
# Pick an example author at random; the draw is seeded so that re-running the notebook shows the same example
target_author_to_recommend = df['book_author'].sample(random_state=RSEED).iloc[0]

print(f"Recommendations for books by Author {target_author_to_recommend}:\n")
book_recommendations = recommend_books_by_author(target_author_to_recommend)
print(book_recommendations)

## Error Analysis

- Model Evaluation: using metrics other than RMSE

- Fraction of Concordant Pairs(fcp)
FCP is a ranking-oriented metric that assesses the proportion of concordant pairs (i.e., pairs of user-item interactions where the predicted ranking order matches the actual ranking order).

NOTE: Lower values for MAE and MSE indicate better accuracy, while higher values for FCP indicate better performance.


In [None]:
# Evaluate the model using additional metrics, computed on the same `predictions` list:
# mean absolute error, mean squared error and fraction of concordant pairs
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.fcp(predictions)

- Error Analysis: Checking predictions versus actual ratings for few users

In [None]:
# Print some example predictions vs. actual ratings (first five entries of `predictions`)
# est is the model's estimated rating, r_ui the true rating of that user for that book
for prediction in predictions[:5]:
    print(f"User: {prediction.uid}, Book: {prediction.iid}, Predicted: {prediction.est}, Actual: {prediction.r_ui}")


- Model Optimization: hyperparameter tuning to improve model performance. 
- Used grid search to find optimal hyperparameters

In [None]:
from surprise.model_selection import GridSearchCV

# Local imports: the name `Dataset` is rebound by another library's import earlier in the
# notebook, so the Surprise class is imported here under an unambiguous alias.
from surprise import Dataset as SurpriseDataset
from surprise.model_selection import KFold

# Define the parameter grid (random_state is fixed so every candidate is trained reproducibly)
param_grid = {'n_factors': [50, 100, 150], 'reg_all': [0.02, 0.05, 0.1], 'random_state': [RSEED]}

# Tune on the training ratings only. Cross-validating on the full `data` would let the
# held-out test ratings influence the choice of hyperparameters and bias the final RMSE.
train_ratings = pd.DataFrame(
    trainset.build_testset(), columns=['user_id', 'common_identifier', 'book_rating']
)
tuning_data = SurpriseDataset.load_from_df(train_ratings, reader)

# Create a grid search object with seeded 3-fold cross-validation
grid_search = GridSearchCV(
    SVD, param_grid, measures=['rmse'], cv=KFold(n_splits=3, random_state=RSEED, shuffle=True)
)

# Fit the grid search object on the training ratings
grid_search.fit(tuning_data)

# Get the best hyperparameters
best_params = grid_search.best_params['rmse']

# Create a new SVD model with the best hyperparameters (including the fixed random_state)
best_model = SVD(**best_params)

# Fit the best model on the training set
best_model.fit(trainset)

# Make predictions on the test set
best_predictions = best_model.test(testset)

# Evaluate the best model
print("Best hyperparameters:", best_params)
accuracy.rmse(best_predictions)


- Error Analysis Report: for a sample of test predictions, shows the book title, the predicted and the actual rating, and whether the prediction counts as accurate, i.e. lies within a threshold of ±2 rating points of the actual rating.

In [None]:
from surprise.model_selection import GridSearchCV
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

from surprise.model_selection import KFold

# Load data into the Surprise library's format
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'common_identifier', 'book_rating']], reader)

# Split the data into training and testing sets (seeded so the split is reproducible)
trainset, testset = train_test_split(data, test_size=0.2, random_state=RSEED)

# Define the parameter grid (random_state is fixed so every candidate is trained reproducibly)
param_grid = {'n_factors': [50, 100, 150], 'reg_all': [0.02, 0.05, 0.1], 'random_state': [RSEED]}

# Tune on the training ratings only. Cross-validating on the full `data` would let the
# held-out test ratings influence the choice of hyperparameters and bias the final RMSE.
train_ratings = pd.DataFrame(
    trainset.build_testset(), columns=['user_id', 'common_identifier', 'book_rating']
)
tuning_data = Dataset.load_from_df(train_ratings, reader)

# Create a grid search object with seeded 3-fold cross-validation
grid_search = GridSearchCV(
    SVD, param_grid, measures=['rmse'], cv=KFold(n_splits=3, random_state=RSEED, shuffle=True)
)

# Fit the grid search object on the training ratings
grid_search.fit(tuning_data)

# Get the best hyperparameters
best_params = grid_search.best_params['rmse']

# Create a new SVD model with the best hyperparameters (including the fixed random_state)
best_model = SVD(**best_params)

# Fit the best model on the training set
best_model.fit(trainset)

# Make predictions on the test set
best_predictions = best_model.test(testset)

# Evaluate the best model
print("Best hyperparameters:", best_params)
accuracy.rmse(best_predictions)

# Error Analysis Report
# A prediction counts as accurate when it is within this many rating points of the actual rating.
ACCURACY_THRESHOLD = 2

# Title of the first df row per book, looked up once instead of filtering df for every prediction
title_by_book = df.drop_duplicates('common_identifier').set_index('common_identifier')['book_title']

print("\nError Analysis Report:")
for i, prediction in enumerate(best_predictions[:10], 1):
    # The model predicts a rating for a given (user, book) pair, so there is a single
    # book title per prediction; only the rating has an actual and a predicted value.
    book_title = title_by_book[prediction.iid]

    # Check if the prediction is accurate (within ±ACCURACY_THRESHOLD)
    is_accurate = abs(prediction.est - prediction.r_ui) <= ACCURACY_THRESHOLD

    print(f"\nPrediction {i}:")
    print(f"User ID: {prediction.uid}")
    print(f"Book Title: {book_title}")
    print(f"Predicted Rating: {prediction.est:.2f}")
    print(f"Actual Rating: {prediction.r_ui}")
    print(f"Is Accurate: {is_accurate}")


#### Using pickle library to save & load models for streamlit app use

In [None]:
import pickle

- Saving models in pickle

In [None]:
# Open svd.pkl for binary writing and save the tuned SVD (`best_model`), i.e. the SVD variant
# evaluated in the error analysis above. Naming the SVD explicitly makes sure the file
# really contains an SVD model, whatever the generic name `model` refers to at this point.
with open('svd.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

- Loading saved model in pickle

In [None]:
# load saved model
# NOTE: pickle.load can execute code embedded in the file -- only load pickle files you created yourself.
with open('svd.pkl' , 'rb') as f:
    load_svd_model = pickle.load(f)

In [None]:
# Show the unpickled object to check that loading worked
print(load_svd_model)