In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import warnings; warnings.simplefilter('ignore')

In [159]:
books = pd.read_csv('./dataset/books.csv')
books = books.drop(['image_url', 'small_image_url', 'title', 'best_book_id', 'isbn', 'isbn13'], axis = 1)
books.head(5)

Unnamed: 0,id,book_id,work_id,books_count,authors,original_publication_year,original_title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,1,2767052,2792775,272,Suzanne Collins,2008.0,The Hunger Games,eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317
1,2,3,4640799,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543
2,3,41865,3212258,226,Stephenie Meyer,2005.0,Twilight,en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439
3,4,2657,3275794,487,Harper Lee,1960.0,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267
4,5,4671,245494,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718


In [161]:
content = books[['original_title', 'authors', 'average_rating']]
content = content.astype('str')
content.head(5)

Unnamed: 0,original_title,authors,average_rating
0,The Hunger Games,Suzanne Collins,4.34
1,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré",4.44
2,Twilight,Stephenie Meyer,3.57
3,To Kill a Mockingbird,Harper Lee,4.25
4,The Great Gatsby,F. Scott Fitzgerald,3.89


In [176]:
content['content'] = content['original_title'] + ' ' + content['authors'] + ' ' + content['average_rating']
indices = pd.Series(content.index, index=content['original_title'])

In [177]:
#removing stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(content['authors'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(10000, 6175)

In [178]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(content['content'])
cosine_sim_content = cosine_similarity(count_matrix, count_matrix)

In [179]:
def get_recommendations(title, cosine_sim=cosine_sim_content):
    idx = indices[title]

    # Get the pairwsie similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim_content[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar books
    sim_scores = sim_scores[1:11]

    # Get the book indices
    book_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar books
    return list(content['original_title'].iloc[book_indices])

In [220]:
def book_shows(book):
    for book in book:
        if book != 'nan':
            print(book)

In [190]:
book_shows(get_recommendations("Harry Potter and the Philosopher's Stone", cosine_sim_content))

Harry Potter and the Goblet of Fire
Harry Potter and the Order of the Phoenix
Harry Potter and the Chamber of Secrets
Harry Potter and the Deathly Hallows
Harry Potter and the Half-Blood Prince
Harry Potter Boxed Set Books 1-4
Harry Potter and the Prisoner of Azkaban
Harry Potter Collection (Harry Potter, #1-6)
nan
Complete Harry Potter Boxed Set


In [189]:
book_shows(get_recommendations('The Starter Wife', cosine_sim_content))

Wife 22
The Silent Wife
Fueled
nan
The Return of the King
nan
The Pilot's Wife
The Lost Wife
Starter for Ten
Prince's Gambit


In [193]:
book_shows(get_recommendations('To Kill a Mockingbird', cosine_sim_content))

Go Set a Watchman
Generation Kill
Batman (Volume 2): Hush
The Hobbit or There and Back Again
One Shot
Never Go Back 
Personal
Nothing to Lose
Unhinged
Vicious


#Collaborative Filtering

In [194]:
reader = Reader()

In [195]:
ratings = pd.read_csv('./dataset/ratings.csv')
ratings.head(5)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [51]:
data = Dataset.load_from_df(ratings[['book_id', 'user_id', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8440  0.8436  0.8448  0.8423  0.8421  0.8434  0.0010  
MAE (testset)     0.6602  0.6595  0.6611  0.6589  0.6580  0.6595  0.0011  
Fit time          137.84  136.44  129.87  153.33  117.58  135.01  11.63   
Test time         5.08    4.50    4.45    5.40    4.58    4.80    0.37    


{'test_rmse': array([0.84403084, 0.84360665, 0.84478745, 0.84229498, 0.84206043]),
 'test_mae': array([0.6602343 , 0.65952864, 0.66107256, 0.6588754 , 0.65801519]),
 'fit_time': (137.84285855293274,
  136.4438772201538,
  129.86659622192383,
  153.3289396762848,
  117.58158040046692),
 'test_time': (5.07884407043457,
  4.4980247020721436,
  4.447017431259155,
  5.396587610244751,
  4.583525657653809)}

In [52]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2361fefaeb0>

In [54]:
ratings[ratings['user_id'] == 1]

Unnamed: 0,book_id,user_id,rating
117889,1180,1,4
488112,4893,1,3
625717,6285,1,4


In [234]:
ratings[ratings['book_id'] == 1]

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
...,...,...,...
95,1,51460,3
96,1,51480,1
97,1,51838,5
98,1,52036,2


In [56]:
svd.predict(1, 240, 4)

Prediction(uid=1, iid=240, r_ui=4, est=4.358400241274715, details={'was_impossible': False})

#Hybrid Filtering

In [196]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [202]:
id_map = pd.read_csv('./dataset/books.csv')[['book_id', 'work_id']]
id_map['work_id'] = id_map['work_id'].apply(convert_int)
id_map = id_map.merge(books[['original_title', 'work_id']], on='work_id').set_index('original_title')
id_map

Unnamed: 0_level_0,book_id,work_id
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1
The Hunger Games,2767052,2792775
Harry Potter and the Philosopher's Stone,3,4640799
Twilight,41865,3212258
To Kill a Mockingbird,2657,3275794
The Great Gatsby,4671,245494
...,...,...
Bayou Moon,7130616,7392860
Means of Ascent,208324,1084709
The Mauritius Command,77431,2393986
Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture,8565083,13433613


In [204]:
indices_map = id_map.set_index('work_id')
indices_map

Unnamed: 0_level_0,book_id
work_id,Unnamed: 1_level_1
2792775,2767052
4640799,3
3212258,41865
3275794,2657
245494,4671
...,...
7392860,7130616
1084709,208324
2393986,77431
13433613,8565083


In [241]:
def hybrid(userId, title):
    idx = indices[title]
    work_id = id_map.loc[title]['work_id']
    book_id = id_map.loc[title]['book_id']
    
    sim_scores = list(enumerate(cosine_sim_content[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    book_indices = [i[0] for i in sim_scores]
    
    buks = books.iloc[book_indices][['original_title', 'authors', 'work_id']]
    buks['est'] = buks['work_id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['book_id']).est)
    buks = buks.sort_values('est', ascending=False)
    return buks.head(10)

In [231]:
hybrid(10944, "Harry Potter and the Half-Blood Prince")

Unnamed: 0,original_title,authors,work_id,est
20,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré",2809203,4.02479
2077,The Complete Tales of Beatrix Potter,Beatrix Potter,1139913,4.023151
22,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré",6231171,4.016759
17,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré, Rufus Beck",2402163,3.986064
23,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré",3046572,3.976007
9282,Harry Potter and Philosophy: If Aristotle Ran ...,"David Baggett, Shawn E. Klein",32035,3.880341
9204,Half Blood Blues,Esi Edugyan,15997604,3.856534
2168,What Do You Do?,Gillian Flynn,45948939,3.856534
3735,Harry Potter Page to Screen: The Complete Film...,Bob McCabe,15697851,3.856534
3053,Harry Potter and the Chamber of Secrets: Sheet...,John Williams,498223,3.856534


In [235]:
hybrid(51480, "Harry Potter and the Half-Blood Prince")

Unnamed: 0,original_title,authors,work_id,est
20,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré",2809203,4.02479
2077,The Complete Tales of Beatrix Potter,Beatrix Potter,1139913,4.023151
22,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré",6231171,4.016759
17,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré, Rufus Beck",2402163,3.986064
23,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré",3046572,3.976007
9282,Harry Potter and Philosophy: If Aristotle Ran ...,"David Baggett, Shawn E. Klein",32035,3.880341
9204,Half Blood Blues,Esi Edugyan,15997604,3.856534
2168,What Do You Do?,Gillian Flynn,45948939,3.856534
3735,Harry Potter Page to Screen: The Complete Film...,Bob McCabe,15697851,3.856534
3053,Harry Potter and the Chamber of Secrets: Sheet...,John Williams,498223,3.856534


In [225]:
book_shows(get_recommendations("Divergent", cosine_sim_content))

Four: A Divergent Collection
Four: The Initiate
Free Four: Tobias Tells the Divergent Story
Insurgent
Allegiant
Four: The Transfer
