<a href="https://colab.research.google.com/github/Nimisha-30/Recommendation-System/blob/main/Book_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
from random import randint
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split

In [None]:
def generateData(n_books=3000, n_genres=15, n_authors=500, n_publishers=50, n_readers=30000, size=100000, seed=None):
    """Generate a synthetic book-rating data set.

    Every column is drawn independently and uniformly at random (inclusive
    bounds), then fully duplicated rows are dropped, so the result may hold
    fewer than ``size`` rows.

    Input parameters:
    n_books: number of distinct book ids (ids are integers in 1..n_books),
    n_genres: number of genres (1..n_genres),
    n_authors: number of authors (1..n_authors),
    n_publishers: number of publishers (1..n_publishers),
    n_readers: number of readers (1..n_readers),
    size: number of rows drawn before de-duplication,
    seed: optional seed; when given, a private generator is used so the same
        seed always yields the same frame. When None the shared generator of
        the ``random`` module is used, exactly as before.

    Returns: pandas DataFrame with integer columns
    bid, aid, genre, rid, n_pages, rating, pid, year, price, lang.
    """
    import random  # local import: the file itself only imports `randint`

    rng = random if seed is None else random.Random(seed)

    def draw(low, high):
        # `size` integers, uniform in [low, high] (both ends included)
        return [rng.randint(low, high) for _ in range(size)]

    # NOTE: the columns are drawn in this exact order so that seeding the
    # global generator keeps producing the same data as earlier versions.
    d = pd.DataFrame({
        'bid': draw(1, n_books),        # book id: integer in 1..n_books
        'aid': draw(1, n_authors),      # author id: integer in 1..n_authors
        'genre': draw(1, n_genres),     # genre code: integer in 1..n_genres
        'rid': draw(1, n_readers),      # reader id: integer in 1..n_readers
        'n_pages': draw(100, 800),      # number of pages: 100..800
        'rating': draw(1, 10),          # rating given by the reader: 1..10
        'pid': draw(1, n_publishers),   # publisher id: integer in 1..n_publishers
        'year': draw(2000, 2021),       # publication year: 2000..2021
        'price': draw(1, 200),          # sale price: 1..200
        'lang': draw(1, 7),             # language code: 1..7
    }).drop_duplicates()
    return d

In [None]:
# Build the synthetic ratings table with 100000 drawn rows and persist it as
# CSV without the index column.
# NOTE(review): the file is written relative to the working directory while
# the sections below read '/content/data.csv' - these match only when the
# working directory is /content; confirm for non-Colab runs.
d = generateData(size=100000)
d.to_csv(path_or_buf='data.csv', index=False)

# COLLABORATIVE FILTERING SYSTEM

In [None]:
def normalize(pred_rating):
    """Min-max scale prediction ratings into the range [0, 1].

    Input parameter: pred_rating - array-like exposing ``.min()`` / ``.max()``
    (e.g. a numpy array of predicted ratings).

    Returns: an object of the same shape where the smallest input maps to 0
    and the largest to 1. When every value is identical there is no spread to
    scale by, so an all-zero float array is returned instead of dividing by
    zero (which would produce NaN).
    """
    lowest = pred_rating.min()
    span = pred_rating.max() - lowest
    # Only a scalar span can be tested directly; per-column spans (e.g. from a
    # DataFrame) fall through to the element-wise division below.
    if np.ndim(span) == 0 and span == 0:
        return np.zeros_like(pred_rating, dtype=float)
    return (pred_rating - lowest) / span

In [None]:
def generate_prediction_df(mat, pt_df, n_factors):
    """Predict ratings with a truncated SVD of the rating matrix.

    The matrix is factorised with ``n_factors`` singular values, multiplied
    back into a dense low-rank approximation, and min-max scaled to [0, 1].

    Input parameters:
    mat: scipy csr matrix corresponding to the pivot table,
    pt_df: pandas DataFrame which is the pivot table (supplies the labels),
    n_factors: number of singular values and vectors to compute;
        must satisfy 1 <= n_factors < min(mat.shape).

    Returns: DataFrame that is the transpose of the pivot layout - its index
    holds ``pt_df.columns`` and its columns hold ``pt_df.index`` - so
    ``pred_df[label]`` gives the scaled predictions for one pivot-table row.

    Raises: ValueError when n_factors is out of range.
    """
    if not 1 <= n_factors < min(mat.shape):
        raise ValueError("Must be 1<=n_factors<min(mat.shape).")
    # svds only accepts floating point (or complex) input; an integer rating
    # matrix would otherwise be rejected.
    if mat.dtype.kind not in 'fc':
        mat = mat.astype(np.float64)
    # matrix factorization
    u, s, vt = svds(mat, k=n_factors)
    # calculate prediction ratings: scaling the columns of u by s is the same
    # product as u @ diag(s)
    pred_rating = (u * s) @ vt
    # Min-max scale here rather than through the module-level `normalize`:
    # that name is defined twice in this file with incompatible semantics, so
    # relying on it would depend on which definition ran last.
    lowest = pred_rating.min()
    span = pred_rating.max() - lowest
    if span == 0:
        pred_rating = np.zeros_like(pred_rating, dtype=float)
    else:
        pred_rating = (pred_rating - lowest) / span
    # convert to df
    pred_df = pd.DataFrame(pred_rating, columns=pt_df.columns, index=list(pt_df.index)).transpose()
    return pred_df

In [None]:
def recommend_items(pred_df, usr_id, n_recs):
    """Return the highest-scoring items for one user.

    Input parameters:
    pred_df: DataFrame generated by 'generate_prediction_df' (one column per
        user, one row per item),
    usr_id: user you wish to get item recommendations for,
    n_recs: number of recommendations you want for this user.

    Returns: DataFrame with a fresh 0..n-1 index and two columns - the item
    label (named after ``pred_df``'s index, or 'index' when it is unnamed) and
    'sim', the predicted score - ordered from highest to lowest score.

    Raises: KeyError when ``usr_id`` is not a column of ``pred_df``.
    """
    try:
        scores = pred_df[usr_id]
    except KeyError:
        raise KeyError(f"user id {usr_id!r} is not a column of pred_df") from None
    # One stable sort is enough (the data was previously sorted twice); a
    # stable algorithm keeps tied items in their original order.
    top = scores.sort_values(ascending=False, kind='mergesort').head(n_recs)
    rec_df = top.rename('sim').reset_index()
    return rec_df

In [None]:
if __name__=='__main__':
    # constants
    PATH='/content/data.csv'
    USER_ID=5       # reader to recommend for, when present in the data
    N_FACTORS=10    # number of singular values kept by the factorization
    N_RECS=5        # number of recommendations to print
    # import data
    df=pd.read_csv(PATH)
    print(df.shape)
    # generate a pivot table with readers on the index and books on the column and values being the ratings
    # (pivot_table averages repeated reader/book pairs; missing pairs become 0)
    pt_df=df.pivot_table(columns='bid', index='rid', values='rating').fillna(0)
    # convert to csr matrix
    mat=csr_matrix(pt_df.values)
    pred_df=generate_prediction_df(mat, pt_df, N_FACTORS)
    # Reader ids in the data are sampled at random, so a fixed id is not
    # guaranteed to exist; fall back to the first available reader instead of
    # failing with a KeyError.
    usr_id=USER_ID if USER_ID in pred_df.columns else pred_df.columns[0]
    # generate recommendations
    print(recommend_items(pred_df, usr_id, N_RECS))

(100000, 10)
    bid       sim
0  1770  0.172923
1  1467  0.171165
2  1269  0.170773
3   414  0.170558
4  1479  0.170384


# CONTENT-BASED SYSTEM

In [None]:
def normalize(data):
    """Scale a sequence of numbers so that its largest value becomes 1.

    Input parameters: data : List - List of values to normalize.

    When the smallest value is negative, every value is first shifted up by
    its magnitude so the minimum becomes 0. The (possibly shifted) values are
    then divided by their maximum. Non-negative input is therefore only
    divided by its maximum; its minimum is not moved to 0.

    Returns: list of scaled values. An empty input yields an empty list, and
    input whose maximum is 0 after shifting (all zeros, or all values equal
    and negative) yields all zeros instead of dividing by zero.
    """
    if len(data) == 0:
        return []
    min_val = min(data)
    if min_val < 0:
        shift = abs(min_val)
        data = [x + shift for x in data]
    max_val = max(data)
    if max_val == 0:
        # nothing to scale by: every value is 0 at this point
        return [0.0 for _ in data]
    return [x / max_val for x in data]

In [None]:
def ohe(df, enc_col, prefix=None):
    """One hot encode a column and append the indicator columns to the frame.

    Input parameters:
    df: data frame to which the indicator columns are appended,
    enc_col: name of the column to one hot encode,
    prefix: optional string prepended to each new column name (as
        '<prefix>_<value>'). Without it the new columns are named by the raw
        values, which collide when several encoded columns share values.

    Returns: a new DataFrame holding the original columns followed by one
    0/1 column per distinct value of ``df[enc_col]``; ``df`` is not modified.
    """
    # The indicator frame inherits df's index, so the row-wise concat lines up
    # for any index. (Resetting only the indicator index, as was done before,
    # misaligned the rows whenever df did not have a default 0..n-1 index.)
    # dtype is pinned so the indicators are 0/1 integers on every pandas
    # version.
    indicators = pd.get_dummies(df[enc_col], prefix=prefix, dtype="uint8")
    return pd.concat([df, indicators], axis=1)

In [None]:
class CBRecommend():
    """Content-based recommender over a table of numeric feature vectors.

    ``df`` holds one feature vector per row and is indexed by item (book) id;
    an id may label several rows.
    """

    def __init__(self, df):
        # feature table; `recommend` adds/overwrites a 'sim' column on it
        self.df=df

    def cosine_sim(self, v1, v2):
        """Return the cosine similarity between two 1-D vectors.

        The result lies in [-1, 1]. If either vector has zero length the
        similarity is undefined and 0.0 is returned.
        """
        a = np.asarray(v1, dtype=float)
        b = np.asarray(v2, dtype=float)
        denom = norm(a) * norm(b)
        if denom == 0:
            return 0.0
        # dot() of two 1-D vectors is already a scalar - nothing to sum over
        return float(dot(a, b) / denom)

    def recommend(self, bid, n_rec):
        """Return the ``n_rec`` rows most similar to book ``bid``.

        Input parameters:
        bid: index label of the query book (KeyError if absent),
        n_rec: number of rows to return.

        Returns: the top rows of ``self.df`` by cosine similarity, including
        the new 'sim' column. Rows belonging to the query book itself are not
        filtered out. As a side effect the scores are stored in
        ``self.df['sim']``.
        """
        # Leave out a 'sim' column from an earlier call so that old scores
        # never become part of the feature vectors.
        features = self.df.drop(columns=['sim'], errors='ignore')
        matrix = features.to_numpy(dtype=float)
        # .loc gives one row for a unique id and several rows for a repeated
        # id; averaging collapses both cases into a single query vector.
        query = np.atleast_2d(features.loc[bid].to_numpy(dtype=float)).mean(axis=0)
        # calculate similarity of the query vector wrt all rows at once;
        # rows with a zero denominator get similarity 0
        denom = norm(matrix, axis=1) * norm(query)
        sims = np.divide(matrix @ query, denom, out=np.zeros(len(matrix)), where=denom != 0)
        self.df['sim'] = sims
        # return top n user specified books
        return self.df.nlargest(columns='sim', n=n_rec)

In [None]:
if __name__=="__main__":
    # constants
    PATH='/content/data.csv'
    # import data
    df=pd.read_csv(PATH)
    # normalize n_pages, rating and price columns
    df['n_pages_norm']=normalize(df['n_pages'].values)
    df['rating_norm']=normalize(df['rating'].values)
    df['price_norm']=normalize(df['price'].values)
    # OHE on year, genre and lang
    # NOTE(review): genre and lang share the values 1..7, so the resulting
    # indicator columns have colliding names; harmless for the similarity
    # computation but ambiguous when selecting columns by name.
    df=ohe(df=df, enc_col='year')
    df=ohe(df=df, enc_col='genre')
    df=ohe(df=df, enc_col='lang')
    # drop redundant columns (already normalized or one hot encoded above)
    cols=['year', 'genre', 'n_pages', 'rating', 'price', 'lang']
    # Raw identifiers are labels, not magnitudes: left in the feature vectors
    # their large values (rid goes up to the number of readers) dominate the
    # cosine similarity and every row ends up with practically the same score.
    id_cols=['aid', 'rid', 'pid']
    df.drop(columns=cols+id_cols, inplace=True)
    df.set_index('bid', inplace=True)
    # run on a sample as example; work on a copy because recommend() adds a
    # 'sim' column to the frame it is given
    t=df.copy()
    cbr=CBRecommend(df=t)
    print(cbr.recommend(bid=t.index[0], n_rec=5))

      aid    rid  pid  n_pages_norm  rating_norm  price_norm  2000  2001  \
bid                                                                        
1813  412  27566   43       0.95250          0.4       0.080     0     0   
1566  416  27875   43       0.99375          0.1       0.755     0     0   
2779  405  27148   41       0.97000          0.3       0.280     0     0   
1972  354  23660   36       0.49875          0.9       0.775     0     0   
1995  429  28692   43       0.71000          0.5       0.780     0     0   

      2002  2003  ...  14  15  1  2  3  4  5  6  7       sim  
bid               ...                                         
1813     0     0  ...   0   0  0  0  1  0  0  0  0  4.710501  
1566     0     0  ...   0   0  0  0  1  0  0  0  0  4.710501  
2779     0     0  ...   0   0  0  0  1  0  0  0  0  4.710501  
1972     0     0  ...   0   0  0  1  0  0  0  0  0  4.710501  
1995     0     0  ...   0   0  0  0  0  1  0  0  0  4.710501  

[5 rows x 51 columns]


# HYBRID RECOMMENDATION SYSTEM

### Function flow:
1. Use a content-based model (cosine similarity) to find the 50 books most similar to the given book.
2. Use a collaborative filtering model to predict the rating the user would give each of these 50 books.
3. Return the top n books with the highest predicted rating.

In [None]:
def hybrid(rid, bid, n_recs, cosine_sim, svd_model, *, books_df=None, n_candidates=50):
    """Hybrid recommendation: similar books re-ranked by predicted rating.

    Input parameters:
    rid: integer - reader id the rating estimates are made for,
    bid: integer - book id whose most similar books are the candidates,
    n_recs: integer - number of recommendations wanted,
    cosine_sim: DataFrame - similarity table whose index and columns are both
        labelled by book id,
    svd_model: model exposing ``predict(reader_id, book_id, rating)`` whose
        result has an ``est`` attribute (the estimated rating),
    books_df: DataFrame (keyword only) with the columns 'bid', 'rating',
        'year', 'price' and 'rid'; defaults to the module-level ``df``,
    n_candidates: integer (keyword only) - how many similar books to score.

    Returns: DataFrame with one row per recommended book (its first row in
    ``books_df``), the columns listed above plus 'est', ordered by 'est' from
    highest to lowest and cut to ``n_recs`` rows.

    Raises: KeyError when ``bid`` is not a column of ``cosine_sim``.
    """
    if books_df is None:
        books_df = df  # module-level ratings table, as used previously
    bid = int(bid)
    if bid not in cosine_sim.columns:
        raise KeyError(f"book id {bid!r} is not present in the similarity table")
    # Take the most similar books by LABEL. The query book is removed
    # explicitly rather than by skipping the first sorted entry, and exactly
    # n_candidates books are kept.
    scores = cosine_sim[bid].drop(labels=bid, errors='ignore')
    candidate_ids = scores.nlargest(n_candidates).index
    # get book metadata: look the candidates up by book id (positions in the
    # similarity table say nothing about row positions in books_df) and keep
    # one row per book; copy so the new column is not written into a slice
    wanted = books_df['bid'].isin(candidate_ids)
    books = books_df.loc[wanted, ['bid', 'rating', 'year', 'price', 'rid']]
    books = books.drop_duplicates(subset='bid').copy()
    # predict using svd model
    books['est'] = [
        svd_model.predict(rid, book, rating).est
        for book, rating in zip(books['bid'], books['rating'])
    ]
    # sort predictions in decreasing order and return top n_recs
    books = books.sort_values('est', ascending=False)
    return books.head(n_recs)

In [None]:
if __name__=='__main__':
    # constants
    PATH='/content/data.csv'
    # import data
    df=pd.read_csv(PATH)
    # content based: reader x book rating matrix (0 where a reader has not
    # rated a book)
    rmat=df.pivot_table(columns='bid', index='rid', values='rating').fillna(0)
    # compute the cosine similarity matrix between BOOKS: transpose so that
    # every row is one book's rating vector. (Comparing the rows of rmat
    # directly would compare readers with each other and build a far larger
    # reader x reader matrix.)
    book_vectors=rmat.T
    cosine_sim=cosine_similarity(book_vectors, book_vectors)
    cosine_sim=pd.DataFrame(cosine_sim, index=rmat.columns, columns=rmat.columns)
    # collaborative filtering
    # ratings in the data run from 1 to 10, so the scale is stated explicitly
    # instead of relying on the reader's default scale
    reader=Reader(rating_scale=(1, 10))
    data=Dataset.load_from_df(df[['rid', 'bid', 'rating']], reader)
    # split data into training and testing set
    train, test=train_test_split(data, test_size=0.3, random_state=10)
    # train
    svd=SVD()
    svd.fit(train)
    # run trained model against test set
    test_pred=svd.test(test)
    # check accuracy
    accuracy.rmse(test_pred, verbose=True)
    # generate recommendations
    r_id=df['rid'].values[0]
    b_id=df['bid'].values[0]
    n_recs=5
    # hybrid() takes (rid, bid, n_recs, cosine_sim, svd_model); the ratings
    # table is read from the module-level df, so it is not passed here
    print(hybrid(r_id, b_id, n_recs, cosine_sim, svd))