In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

# Data preprocessing

In [2]:
def mergeDuplicateData():
    info_original = pd.read_csv('BX-Books.csv',sep=';', on_bad_lines='skip', encoding='latin-1', low_memory=False).sort_values('ISBN')
    info_original.drop(['Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L'],axis=1 ,inplace=True)

    ratings = pd.read_csv('BX-Book-Ratings.csv',sep=';', encoding='latin-1')
    users = pd.read_csv('BX-Users.csv', sep=';', encoding='latin-1')

    ratings = ratings[ratings['ISBN'].isin(info_original['ISBN'])]
    ratings = ratings[ratings['User-ID'].isin(users['User-ID'])].reset_index(drop=True)

    info_name_modified = info_original.copy()
    info_name_modified['Book-Title'] = info_name_modified['Book-Title'].str.replace(pat=r'[^\w]',repl=r'',regex=True)
    info_name_modified['Book-Title'] = info_name_modified['Book-Title'].str.upper()

    info_no_duplicate = info_name_modified.drop_duplicates(['Book-Title'],keep='first')
    
    for index, ISBN in enumerate(ratings['ISBN']):
        if(ISBN not in list(info_no_duplicate['ISBN'])):
            index_to_change = info_name_modified[info_name_modified['ISBN'] == ISBN].reset_index(drop=True)
            book_title = index_to_change.at[0,'Book-Title']
            new_ISBN = info_name_modified[info_name_modified['Book-Title'] == book_title].reset_index(drop=True)
            new_ISBN = new_ISBN.at[0,'ISBN']
            ratings.loc[ratings['ISBN'] == ISBN,'ISBN'] = new_ISBN
            print(index, book_title,ISBN, 'to',new_ISBN)

    for index, ISBN in enumerate(info_no_duplicate['ISBN']):
        index_to_change = info_original[info_original['ISBN'] == ISBN].reset_index(drop=True)
        book_title = index_to_change.at[0,'Book-Title']
        info_no_duplicate.loc[info_no_duplicate['ISBN'] == ISBN,'Book-Title'] = book_title
        print(index, ISBN, book_title)
        
    info_no_duplicate.to_csv("modified_book.csv",index=False)
    ratings.to_csv("modified_rating.csv",index=False)

Erases inconsistent data and integrates ISBNs from the same books. It takes quite a long time, so I'll save it and bring it back

In [3]:
ratings = pd.read_csv('modified_rating.csv', encoding='latin-1')
info = pd.read_csv('modified_book.csv', encoding='latin-1')

ratings['ISBN'] = ratings['ISBN'].apply(lambda x : x.zfill(10))

ratings_group = ratings.groupby('User-ID').count()
ratings_group = ratings_group[ratings_group['ISBN'] >= 10]
ratings = ratings.loc[ratings['User-ID'].isin(ratings_group.index)]

books_group = ratings.groupby('ISBN').count()
books_group = books_group[books_group['User-ID'] >= 10]
ratings = ratings.loc[ratings['ISBN'].isin(books_group.index)]

ratings = ratings.loc[ratings['Book-Rating']>0]

Consolidate ISBN to 10 digits and erase books and people with less than 10 evaluations from the data and filter out zero-point data

# Modeling

In [4]:
num_of_sample = 100

num_of_components = 15

num_of_recommend = 20

ISBN_to_recommend = '0451118642'

user_history = [[0,'0451118642',9],
                [0,'0345243447',10],
                ]

Declare Required Variables

## contents-based filtering

In [5]:
def contentBasedRecommend(df, books_group, ISBN_to_recommend , num_of_recommend):
    meta = pd.read_csv('Preprocessed_data.csv',on_bad_lines='skip', low_memory=False)
    meta.drop(['Unnamed: 0','location','user_id','rating','book_title','year_of_publication','publisher','img_s','img_m','img_l','Language','city','state','country','age'],axis=1 ,inplace=True)
    meta.rename(columns = {'isbn':'ISBN'},inplace=True)
    meta = meta.sort_values('ISBN').drop_duplicates(['ISBN'],keep='first')

    df = pd.merge(df, meta, on = 'ISBN')
    df = df.loc[(df['Summary'] != '9') & (df['Category'] != '9'),:]
    df = df.loc[df['ISBN'].isin(books_group.index)].reset_index(drop=True)

    df['index'] = df.index.values
    book = df[df['ISBN'] == ISBN_to_recommend]

    to_join = ['Book-Title','book_author','Summary','Category']
    df['features'] = [' '.join(df[to_join].iloc[i,].values) for i in range(df[to_join].shape[0])]

    tf = TfidfVectorizer(stop_words='english',min_df=1,ngram_range=(1,5))
    matrix = tf.fit_transform(df['features'])
    cos= cosine_similarity(matrix)

    recommendation_list = list(enumerate(cos[book.index.values[0]]))
    recommendation_list = sorted(recommendation_list,key=lambda x:x[1],reverse=True)[1:num_of_recommend+1]

    books =[]
    for i in range(len(recommendation_list)):
        books.append([df[df['index'] == recommendation_list[i][0]]['Book-Title'].item(),recommendation_list[i][1]])

    print(pd.DataFrame(books).set_index(0))

Get metadata and combine it with data, 
combine 'Book-Title', 'book_author', 'Summary', and 'Category' into one feature to calculate the TF-IDF matrix
and recommend a book based on cosine similarity

In [6]:
contentBasedRecommend(info, books_group, ISBN_to_recommend, num_of_recommend)

                                                           1
0                                                           
2010: Odyssey Two                                   0.048847
Rama Revealed (Bantam Spectra Book)                 0.028845
Bloodletter (Star Trek Deep Space Nine, No 3)       0.023130
The Odyssey (Penguin Classics)                      0.021169
Dark Tide II: Ruin (Star Wars: The New Jedi Ord...  0.021156
The Ship Who Sang                                   0.020857
Star Wars: The Han Solo Adventures/3 Books in One   0.019922
Pegasus in Space                                    0.018672
The Andromeda Strain                                0.018574
The Light of Other Days                             0.018360
Dawn (Cutler)                                       0.017486
Rama II: The Sequel to Rendezvous with Rama         0.017293
Love in the Ruins                                   0.016784
Refugee (Bio of a Space Tyrant, Vol 1)              0.016722
LIFE UNIVERS EVRTH (Hitc

## item-based collaborative filtering

In [7]:
df = pd.merge(ratings, info, on = 'ISBN').sort_values('User-ID')

encoder = LabelEncoder()
df['User-ID'] = encoder.fit_transform(df['User-ID'])

pivot_table = df.pivot_table(values='Book-Rating',index='User-ID', columns='ISBN').dropna(how = 'all').fillna(0)

data merge and make pivot table to SVD calculation

In [8]:
def itemBasedRecommend(pivot_table, df,ISBN_to_recommend , num_of_components, num_of_recommend):
    scaler = MinMaxScaler()
    item_sim = cosine_similarity(pivot_table.T, pivot_table.T)

    SVD = TruncatedSVD(n_components = num_of_components, algorithm = 'arpack')
    matrix = SVD.fit_transform(pivot_table.T)

    item_sim_df = pd.DataFrame(data=item_sim,index=pivot_table.columns,columns=pivot_table.columns).drop([ISBN_to_recommend],axis = 0)
    corr= pd.DataFrame(np.corrcoef(matrix), columns = pivot_table.columns,index = pivot_table.columns).drop([ISBN_to_recommend],axis = 0)

    pivot_table = pivot_table.drop([ISBN_to_recommend],axis = 1)

    cos = pd.DataFrame(scaler.fit_transform(item_sim_df[ISBN_to_recommend][:, np.newaxis]),index=pivot_table.columns,columns=[ISBN_to_recommend])
    corr = pd.DataFrame(scaler.fit_transform(corr[ISBN_to_recommend][:, np.newaxis]),index=pivot_table.columns,columns=[ISBN_to_recommend])
    
    recommendation_list = cos*0.75 + corr*0.25
    recommendation_list = recommendation_list.sort_values(by=ISBN_to_recommend, ascending=False)[:num_of_recommend]
    
    book_title_list = []
    for i in recommendation_list.index :
        index_to_change = df[df['ISBN'] == i].reset_index(drop=True)
        book_title_list.append(index_to_change.at[0,'Book-Title'])

    recommendation_list.index = book_title_list
    
    print(recommendation_list)

After reducing the dimension of the pivot table through SVD, calculate the cosine similarity and correlation, respectively, multiply the weight, and recommend the book.

In [9]:
itemBasedRecommend(pivot_table, df, ISBN_to_recommend, num_of_components, num_of_recommend)

  del sys.path[0]
  


                                                    0451118642
The Death and Life of Superman: A Novel               0.838511
2010: Odyssey Two                                     0.818614
The Rabbi                                             0.731573
The Ultimate Hitchhiker's Guide to the Galaxy         0.726015
Heart Of The Mat (Harlequin Romance, No 2876)         0.721143
36 Hours Christmas (Silhouette Promo)                 0.721143
Gossip Girl                                           0.721143
The Beach Club : A Novel                              0.721143
Informed Consent                                      0.721143
Combat, Vol. 1 (Combat)                               0.721143
Books and Reading: A Book of Quotations (Dover ...    0.721143
Das WÃ?ÃÂ¼ten der ganzen Welt.                      0.721143
Panama: A Novel                                       0.721143
The Sands of Sakkara                                  0.721143
Rainsong                                              0

## user-based collaborative filtering

In [10]:
def makeNewUserData(user_history, rating_data):
    user_data = pd.DataFrame(user_history,columns=['User-ID','ISBN','Book-Rating'])

    return pd.concat([user_data,rating_data])

In [11]:
user_history = [[0,'0451118642',9],
                [0,'0345243447',10],
                ]

ratings_include_history = makeNewUserData(user_history, ratings)

df = pd.merge(ratings_include_history, info, on = 'ISBN').sort_values('User-ID')

encoder = LabelEncoder()
df['User-ID'] = encoder.fit_transform(df['User-ID'])

pivot_table = df.pivot_table(values='Book-Rating',index='User-ID', columns='ISBN').dropna(how = 'all').fillna(0)

After adding the user's rating data, data merge and make pivot table to SVD calculation

In [12]:
def userBasedRecommend(pivot_table, df, num_of_components, num_of_recommend, want_print = True):
    ratings = pivot_table.values
    ratings_mean = np.mean(ratings,axis = 1)
    ratings = ratings - ratings_mean.reshape(-1,1)

    SVD = TruncatedSVD(n_components = num_of_components, algorithm = 'arpack')
    U = SVD.fit_transform(ratings)
    sigma=SVD.explained_variance_ratio_
    Vt= SVD.components_

    matrix_to_predict = np.dot(np.dot(U, np.diag(sigma)), Vt) + ratings_mean.reshape(-1,1)
    df_to_preditct = pd.DataFrame(matrix_to_predict, columns = pivot_table.columns,index = pivot_table.index)

    user_data = df[df['User-ID'] == 0]

    user_prediction_list = df_to_preditct.iloc[0].sort_values(ascending=False)
    recommendation_list = user_prediction_list[~user_prediction_list.index.isin(user_data['ISBN'])][:num_of_recommend]

    recomendation_ISBN = recommendation_list.copy()

    if(want_print):
        book_title_list = []
        for i in recommendation_list.index :
            index_to_change = df[df['ISBN'] == i].reset_index(drop=True)
            book_title_list.append(index_to_change.at[0,'Book-Title'])

        recommendation_list.index = book_title_list
        print('\n',recommendation_list)
    
    return recomendation_ISBN

Decompose the matrix and restore it to recommend a book

In [13]:
userBasedRecommend(pivot_table, df, num_of_components, num_of_recommend)


 The Two Towers (The Lord of the Rings, Part 2)                0.001577
The Fellowship of the Ring (The Lord of the Rings, Part 1)    0.001570
Interview With the Vampire                                    0.001561
The Return of the King (The Lord of the Rings, Part 3)        0.001561
The Catcher in the Rye                                        0.001522
The Golden Compass (His Dark Materials, Book 1)               0.001515
The Vampire Lestat (Vampire Chronicles, Book II)              0.001510
Ender's Game (Ender Wiggins Saga (Paperback))                 0.001506
Life of Pi                                                    0.001500
The Handmaid's Tale                                           0.001487
Red Dragon                                                    0.001486
Brave New World                                               0.001483
A Prayer for Owen Meany                                       0.001483
The Hitchhiker's Guide to the Galaxy                          0.001465
Sile

ISBN
0345339711    0.001577
0345339703    0.001570
0345256085    0.001561
0345339738    0.001561
0316769177    0.001522
0345413350    0.001515
0345313860    0.001510
0312853238    0.001506
0151008116    0.001500
0395404258    0.001487
0385319673    0.001486
0001047973    0.001483
0345361792    0.001483
0345391802    0.001465
0312022824    0.001463
0380789035    0.001463
0151660387    0.001448
006017322X    0.001446
0441172717    0.001438
0151001006    0.001437
Name: 0, dtype: float64

# Evaluation

In [14]:
def verifyRecommendation(ratings, info, num_of_sample, num_of_components, num_of_recommend):
    
    ratings_group = ratings.groupby('User-ID').count()
    ratings_group = ratings_group[ratings_group['ISBN'] >= 100]
    total_correct=0

    for i in range(num_of_sample):
        user_test_id = ratings_group.sample(n=1).index.values[0]

        user_test_df=ratings[ratings['User-ID']==user_test_id]
        ratings_exclude_test=ratings[ratings['User-ID']!=user_test_id]

        user_test_df=user_test_df[user_test_df['Book-Rating']>=7]
        user_test_df=user_test_df.sort_values(by='Book-Rating', ascending=False)

        user_test=user_test_df.sample(frac=0.4)
        user_test['User-ID'] = np.zeros((len(user_test),1))

        user_history=user_test.values.tolist()
    
        ratings_include_history = makeNewUserData(user_history, ratings_exclude_test)

        df = pd.merge(ratings_include_history, info, on = 'ISBN').sort_values('User-ID')

        encoder = LabelEncoder()
        df['User-ID'] = encoder.fit_transform(df['User-ID'])

        pivot_table = df.pivot_table(values='Book-Rating',index='User-ID', columns='ISBN').dropna(how = 'all').fillna(0)
    
        recommendation_list = userBasedRecommend(pivot_table, df, num_of_components, num_of_recommend, False)

        print('\n','sample', i+1) 
    
        correct = []   
        for i in recommendation_list.index :
            if(i in list(user_test_df['ISBN'])):
                correct.append([i,user_test_df[user_test_df['ISBN']==i]['Book-Rating'].values[0]])
       
        for i in correct :
            index_to_change = df[df['ISBN'] == i[0]].reset_index(drop=True)
            i[0] = index_to_change.at[0,'Book-Title']
            print(i)


        total_correct=total_correct+len(correct)
        total_num_recommend=num_of_sample*num_of_recommend
        
        print(len(correct),"/" ,num_of_recommend, "matched")

    percent=total_correct/total_num_recommend*100
    
    print()

    print("Total Matched : ", percent ,"%","(",total_correct,"/",total_num_recommend,")")

We take samples from the data at random, and we put only 40 percent of the data in the sample and we train them And compare the recommended book with the 60 percent left

In [15]:
num_of_sample = 5

num_of_recommend = 20

verifyRecommendation(ratings, info, num_of_sample, num_of_components, num_of_recommend)


 sample 1
['Harry Potter and the Chamber of Secrets (Book 2)', 10]
["Harry Potter and the Sorcerer's Stone (Book 1)", 10]
2 / 20 matched

 sample 2
['It', 9]
['The Tommyknockers', 7]
2 / 20 matched

 sample 3
['The Red Tent (Bestselling Backlist)', 9]
['Snow Falling on Cedars', 9]
['The Poisonwood Bible: A Novel', 8]
3 / 20 matched

 sample 4
['The Secret Life of Bees', 10]
["Tuesdays with Morrie: An Old Man, a Young Man, and Life's Greatest Lesson", 9]
2 / 20 matched

 sample 5
['The Nanny Diaries: A Novel', 8]
['The Hours : A Novel', 10]
['House of Sand and Fog', 7]
['Wicked : The Life and Times of the Wicked Witch of the West', 7]
['Bridget Jones: The Edge of Reason', 8]
5 / 20 matched

Total Matched :  14.000000000000002 % ( 14 / 100 )
