<a href="https://colab.research.google.com/github/Sarvesh1814/US-Book-Recommendation-System-/blob/main/BE_RR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mathematics Behind the Baseline Estimate Model


---
* **Baseline Equation**
---
\begin{equation}
b_{ui} = \mu + b_u + b_i
\end{equation}
---
---
* **Optimization Problem**
---
\begin{equation}
\min_{b_u, b_i} \sum_{(u,i) \in R_{train}} (r_{ui} - b_{ui})^2 + \lambda (||b_u||^2 + ||b_i||^2)
\end{equation}
---
---
* **Gradient Descent**
---
\begin{equation}
\frac{\partial}{\partial b_u} = -2 \sum_{i \in I_u} (r_{ui} - \mu - b_u - b_i) + 2 \lambda b_u
\end{equation}

\begin{equation}
\frac{\partial}{\partial b_i} = -2 \sum_{u \in U_i} (r_{ui} - \mu - b_u - b_i) + 2 \lambda b_i
\end{equation}

\begin{equation}
b_{u}^{(k+1)} = b_{u}^{(k)} - \gamma \cdot \frac{\partial}{\partial b_u} J(b_u^{(k)}, b_i^{(k)})
\end{equation}

\begin{equation}
b_{i}^{(k+1)} = b_{i}^{(k)} - \gamma \cdot \frac{\partial}{\partial b_i} J(b_u^{(k)}, b_i^{(k)})
\end{equation}

\begin{aligned}
b_{u}^{(k+1)} &= b_{u}^{(k)} + \gamma \cdot \left( \sum_{i \in I_u} (r_{ui} - \mu - b_u^{(k)} - b_i^{(k)}) - \lambda b_u^{(k)} \right) \\
b_{i}^{(k+1)} &= b_{i}^{(k)} + \gamma \cdot \left( \sum_{u \in U_i} (r_{ui} - \mu - b_u^{(k)} - b_i^{(k)}) - \lambda b_i^{(k)} \right)
\end{aligned}

---
* **Update Rule**
---
\begin{equation}
b_{u}^{(k+1)} = b_{u}^{(k)} + \gamma \cdot \left( e_{ui} - \lambda \cdot b_{u}^{(k)} \right)
\end{equation}


# Importing Libraries

In [None]:
!pip install surprise
from surprise import Dataset, Reader
import numpy as np
import pandas as pd
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from surprise import dump
import pickle


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Data 

In [None]:
Base = "/content/drive/MyDrive/RS Data/Assignment 3/Processed_data/"
train=pd.read_csv(Base+"Train_df_final.csv")
test = pd.read_csv(Base+"Test_df_final.csv")

In [None]:
# Extracting import columns
train = train[["User-ID"	,"ISBN",	"Book-Rating"	,"book_name"]]
test = test[["User-ID"	,"ISBN",	"Book-Rating"	,"book_name"]]

In [None]:
# Creating Surpise Dataset
reader = Reader(rating_scale=(1, 10)) # Rating Scale from 1-10
data = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']], reader) 
trainset = data.build_full_trainset()


In [None]:
users= pd.read_csv("/content/drive/MyDrive/RS Data/Assignment 3/Processed_data/users_features.csv")

In [None]:
books=pd.read_csv("/content/drive/MyDrive/RS Data/Assignment 3/Processed_data/book_features.csv")
books.head()

Unnamed: 0,Book-Title,ISBN,page_count,categories,Year-Of-Publication,feature_1,feature_2,feature_3,feature_4,feature_5,...,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100
0,Footfall,345323440,706.0,330,1996,-0.32409,0.17809,0.085679,0.051699,-0.233513,...,0.382992,-0.09016,-0.040599,-0.15977,0.468369,0.346564,0.053775,-0.575963,0.202055,0.013086
1,Footfall,345323475,706.0,330,1985,-0.32409,0.17809,0.085679,0.051699,-0.233513,...,0.382992,-0.09016,-0.040599,-0.15977,0.468369,0.346564,0.053775,-0.575963,0.202055,0.013086
2,The Reader,679442790,224.0,330,1997,-0.265439,0.180581,-0.044998,0.521798,0.064341,...,0.527207,-0.152366,-0.067668,0.264588,0.671669,0.287058,0.089154,-0.69685,-0.135784,0.029446
3,The Reader,375707972,224.0,330,1999,-0.265439,0.180581,-0.044998,0.521798,0.064341,...,0.527207,-0.152366,-0.067668,0.264588,0.671669,0.287058,0.089154,-0.69685,-0.135784,0.029446
4,The Reader,679781307,224.0,330,1998,-0.265439,0.180581,-0.044998,0.521798,0.064341,...,0.527207,-0.152366,-0.067668,0.264588,0.671669,0.287058,0.089154,-0.69685,-0.135784,0.029446


# Loading Models

## Recommendation Algorithm

In [None]:
algo = dump.load('/content/drive/MyDrive/RS Data/Assignment 3/Processed_data/baseline_model.pkl')[1]

## Reranking Algorithm

In [None]:
with open('/content/drive/MyDrive/RS Data/Assignment 3/Processed_data/XGB_Ranker.pkl', 'rb') as file:
    model = pickle.load(file)



# Recommendation system with Reranking (BE + XGB Ranker)

In [None]:
def BaseLine():

  k = int(input("Value of K for top-K Recommendations: "))
  user_id = int(input("Enter User-ID: "))  
  
  # Mapping user id to surprise's uid
  uid = trainset.to_inner_uid(user_id)
  all_items = trainset.all_items()
  x = trainset.ur[int(uid)]
  a = [x[i][0] for i in range(len(x))]

  # Finding unrated Items for user
  not_rated_items = [item for item in all_items if item not in a]
  
  # saving predictions for unrated items
  predictions = []
  for iid in not_rated_items:
      pred = algo.predict(uid, trainset.to_raw_iid(iid))
      predictions.append((iid, pred.est))
  
  # sorting top 1000 items based on predicted ratings of unkown items for the user
  predictions_sorted = sorted(predictions, key=lambda x: x[1], reverse=True)[:100]

  # converting it into dataframe for further processing of reranking of the candidates
  lst =()
  for item, rating in predictions_sorted:
      lst+= ((trainset.to_raw_iid(item),rating),)
  dx = pd.DataFrame(lst,columns=["ISBN","Predicted Rating"])
  
  # Fetching features of the user 
  uf =users.loc[users['User-ID'] ==user_id ].drop(columns=['User-ID'])
  
  # Fetching features of the top items to make a combined user-item feature vector 
  temp = dx.merge(books,on="ISBN",how="inner").drop(columns=['page_count',"Predicted Rating","Book-Title"])
  temp["Age"] = users["Age"].iloc[0]
  temp["State"] =users["State"].iloc[0]
  
  # Providing features to reranker
  X_test = temp.iloc[:,1:]
  
  #predicting the rank of the items 
  y_pred = model.predict(X_test)
  temp['predicted_score'] = y_pred
  
  # Reranking the items
  temp = temp.sort_values(by='predicted_score', ascending=False)
  temp = temp[["ISBN"]].merge(books[["Book-Title","ISBN"]],on = 'ISBN',how = 'inner')
  
  print("\n")
  print(f"Top {k} Book recommendations for User {user_id} are: ")
  display(temp.head(k))
  data_dict = temp.iloc[0:k].to_dict(orient='records')
  return 

In [None]:
BaseLine()

Value of K for top-K Recommendations: 5
Enter User-ID: 151824


Top 5 Book recommendations for User 151824 are: 


Unnamed: 0,ISBN,Book-Title
0,439139600,Harry Potter and the Goblet of Fire (Book 4)
1,439139597,Harry Potter and the Goblet of Fire (Book 4)
2,394800184,Are You My Mother?
3,441172717,Dune (Remembering Tomorrow)
4,60256656,The Giving Tree


# Coverage

In [None]:
def BaseTest(user_id, trainset, algo, model, books, users,k):
  
  # Mapping user id to surprise's uid
  uid = trainset.to_inner_uid(user_id)
  all_items = trainset.all_items()
  x = trainset.ur[int(uid)]
  a = [x[i][0] for i in range(len(x))]

  # Finding unrated Items for user
  not_rated_items = [item for item in all_items if item not in a]
  
  # saving predictions for unrated items
  predictions = []
  for iid in not_rated_items:
      pred = algo.predict(uid, trainset.to_raw_iid(iid))
      predictions.append((iid, pred.est))
  
  # sorting top 1000 items based on predicted ratings of unkown items for the user
  predictions_sorted = sorted(predictions, key=lambda x: x[1], reverse=True)[:1000]

  # converting it into dataframe for further processing of reranking of the candidates
  lst =()
  for item, rating in predictions_sorted:
      lst+= ((trainset.to_raw_iid(item),rating),)
  dx = pd.DataFrame(lst,columns=["ISBN","Predicted Rating"])
  
  # Fetching features of the user 
  uf =users.loc[users['User-ID'] ==user_id ].drop(columns=['User-ID'])
  
  # Fetching features of the top items to make a combined user-item feature vector 
  temp = dx.merge(books,on="ISBN",how="inner").drop(columns=['page_count',"Predicted Rating","Book-Title"])
  temp["Age"] = users["Age"].iloc[0]
  temp["State"] =users["State"].iloc[0]
  
  # Providing features to reranker
  X_test = temp.iloc[:,1:]
  
  #predicting the rank of the items 
  y_pred = model.predict(X_test)
  temp['predicted_score'] = y_pred
  
  # Reranking the items
  temp = temp.sort_values(by='predicted_score', ascending=False)
  temp = temp[["ISBN"]].merge(books[["Book-Title","ISBN"]],on = 'ISBN',how = 'inner')
  
  data_dict = temp.iloc[0:k].to_dict(orient='records')
  return [x['ISBN'] for x in data_dict]


In [None]:
true = test[test['Book-Rating'] == 10]
true = true[true['User-ID'].isin(list(train["User-ID"]))]
true   

Unnamed: 0,User-ID,ISBN,Book-Rating,book_name
9,31315,051513452X,10,The Lunatic Cafe (Anita Blake Vampire Hunter (...
15,16634,0452261341,10,"Gunslinger Tower 1 (Dark Tower, No 1)"
18,46003,1583224890,10,9-11
29,83287,0553563521,10,Devil's Waltz (Alex Delaware Novels (Paperback))
30,51883,0425077047,10,And Ladies of the Club
...,...,...,...,...
23403,116122,0345434684,10,The Skies Of Pern
23409,198781,0060961325,10,The Celluloid Closet: Homosexuality in the Movies
23410,219259,0767915054,10,A Girl Named Zippy: Growing Up Small in Moorel...
23416,145449,0316107255,10,Penguin Dreams and Stranger Things (A Bloom Co...


### Coverage @ 5

In [None]:
all_recommendations = []
for user_id in true["User-ID"]:
    recommendations = BaseTest(user_id, trainset, algo, model, books, users,5)
    all_recommendations.extend(recommendations)
intersection = list(set(all_recommendations).intersection(set(books["ISBN"])))
coverage = len(intersection)/len(set(books["ISBN"]))
coverage

0.0006262721152340692

### Coverage @ 10 

In [None]:
all_recommendations = []
for user_id in true["User-ID"]:
    recommendations = BaseTest(user_id, trainset, algo, model, books, users,10)
    all_recommendations.extend(recommendations)
intersection = list(set(all_recommendations).intersection(set(books["ISBN"])))
coverage = len(intersection)/len(set(books["ISBN"]))
coverage

0.0011742602160638798

### Coverage @100

In [16]:
all_recommendations = []
for user_id in true["User-ID"]:
    recommendations = BaseTest(user_id, trainset, algo, model, books, users,100)
    all_recommendations.extend(recommendations)
intersection = list(set(all_recommendations).intersection(set(books["ISBN"])))
coverage = len(intersection)/len(set(books["ISBN"]))
coverage

0.008767809613276969

### Coverage @ 1000

In [17]:
all_recommendations = []
for user_id in true["User-ID"]:
    recommendations = BaseTest(user_id, trainset, algo, model, books, users,1000)
    all_recommendations.extend(recommendations)
intersection = list(set(all_recommendations).intersection(set(books["ISBN"])))
coverage = len(intersection)/len(set(books["ISBN"]))
coverage

0.08251135118208862