# Importing Libraries

In [1]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn import neighbors
from sklearn.metrics import log_loss
import pickle

from fuzzywuzzy import fuzz,process




# Importing Data

**Here we pass the *on_bad_lines* parameter to `read_csv`. Some rows contain extra commas, which gives them more fields than the header has; with `on_bad_lines='skip'` those rows are skipped instead of raising a parsing error.**

In [4]:
# Load the book catalogue. Lines with too many fields are skipped instead of
# aborting the whole read.
df = pd.read_csv(
    filepath_or_buffer="books.csv",
    on_bad_lines="skip",
)
df.head(n=5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [25]:
#df.to_pickle("df.pkl")

# Data Pre-processing

**The *authors* column has high cardinality, so we want to reduce it where we can. Some books list several authors separated by "/", so we keep only the first-listed author and drop the secondary ones.**

In [5]:
df["authors"].nunique()

6639

In [6]:
# Keep only the text before the first "/" (the first-listed author).
# The vectorised .str accessor replaces apply+lambda and propagates missing
# values instead of raising AttributeError on a non-string entry.
df['authors'] = df['authors'].str.split("/", n=1).str[0]

In [7]:
df['authors'].nunique()

4215

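**We can drop the *isbn* column as isbn13 covers it. The cell below also drops *isbn13*, *num_pages* and *publisher*, none of which is used in the features built later.**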

In [8]:
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [9]:
# Drop the identifier/metadata columns that the feature matrix does not use.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError for columns that are
# already gone.
# NOTE: the two leading spaces in '  num_pages' are part of the real column name.
df = df.drop(columns=['isbn', 'isbn13', '  num_pages', 'publisher'], errors='ignore')
df.head()

Unnamed: 0,bookID,title,authors,average_rating,language_code,ratings_count,text_reviews_count,publication_date
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.57,eng,2095690,27591,9/16/2006
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,eng,2153167,29221,9/1/2004
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,eng,6333,244,11/1/2003
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,4.56,eng,2339585,36325,5/1/2004
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling,4.78,eng,41428,164,9/13/2004


In [10]:
df.shape

(11123, 8)

In [11]:
df['publication_date'][0].split('/')[2]

'2006'

In [12]:
df['language_code'].value_counts()

eng      8908
en-US    1408
spa       218
en-GB     214
fre       144
ger        99
jpn        46
mul        19
zho        14
grc        11
por        10
en-CA       7
ita         5
enm         3
lat         3
swe         2
rus         2
srp         1
nl          1
msa         1
glg         1
wel         1
ara         1
nor         1
tur         1
gla         1
ale         1
Name: language_code, dtype: int64

# Feature Transformation

In [13]:
# Feature matrix: one indicator column per language code, followed by the two
# numeric rating columns, aligned side by side on the row index.
features = pd.concat(
    [
        df["language_code"].str.get_dummies(sep=","),
        df[["average_rating", "ratings_count"]],
    ],
    axis="columns",
)

In [14]:
features.head()

Unnamed: 0,ale,ara,en-CA,en-GB,en-US,eng,enm,fre,ger,gla,...,por,rus,spa,srp,swe,tur,wel,zho,average_rating,ratings_count
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.57,2095690
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.49,2153167
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.42,6333
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.56,2339585
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.78,41428


In [15]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ale             11123 non-null  int64  
 1   ara             11123 non-null  int64  
 2   en-CA           11123 non-null  int64  
 3   en-GB           11123 non-null  int64  
 4   en-US           11123 non-null  int64  
 5   eng             11123 non-null  int64  
 6   enm             11123 non-null  int64  
 7   fre             11123 non-null  int64  
 8   ger             11123 non-null  int64  
 9   gla             11123 non-null  int64  
 10  glg             11123 non-null  int64  
 11  grc             11123 non-null  int64  
 12  ita             11123 non-null  int64  
 13  jpn             11123 non-null  int64  
 14  lat             11123 non-null  int64  
 15  msa             11123 non-null  int64  
 16  mul             11123 non-null  int64  
 17  nl              11123 non-null 

# Model Training

In [16]:
# Rescale every feature column with MinMaxScaler so that ratings_count does
# not dominate the distance computation purely because of its magnitude.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(features)
# The scaler returns a bare array, so the rebuilt frame has positional integer
# column labels (0, 1, 2, ...) rather than the original column names.
features = pd.DataFrame(data=min_max_scaler.transform(features))

In [17]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.914,0.455816
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.898,0.468317
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.884,0.001377
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.912,0.508864
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.956,0.009011


In [18]:
# Unsupervised nearest-neighbour index over the scaled feature rows.
# Six neighbours are requested because the model is later queried with the
# very rows it was fitted on, so one slot is normally taken by the row itself.
model = neighbors.NearestNeighbors(
    algorithm="ball_tree",
    n_neighbors=6,
)
model.fit(X=features)

In [19]:
# Query the fitted model with the same rows it was trained on.
# Row i of `indices` holds the row positions of the neighbours found for
# book i; row i of `distance` holds the matching distances.
distance, indices = model.kneighbors(X=features, return_distance=True)

# Book Recommendation System

In [20]:
# Plain Python list of every title, in row order, used by the lookup helpers.
all_books_names = df["title"].tolist()

In [21]:
def get_index_from_name(name):
    """Return the index label of the first row of ``df`` whose title equals ``name``.

    Parameters
    ----------
    name : str
        Title to look up. It is compared with ``==`` against ``df["title"]``,
        so the match is exact (case- and whitespace-sensitive).

    Returns
    -------
    The index label of the first matching row. When several rows share the
    title, the first one in row order wins.

    Raises
    ------
    IndexError
        If no row has that title. The exception type is the same one the bare
        ``[0]`` lookup raised before; the message now names the missing title.
    """
    matching_labels = df[df["title"] == name].index.tolist()
    if not matching_labels:
        raise IndexError(f"no book with title {name!r} found in df['title']")
    return matching_labels[0]

In [22]:
def print_similar_books(query=None):
    """Print and return the titles of the books nearest to ``query``.

    Parameters
    ----------
    query : str
        Exact title of a book in ``df``; it is resolved to a row with
        ``get_index_from_name``. The default ``None`` matches no title, so the
        lookup raises.

    Returns
    -------
    list
        Titles of the neighbouring books, in the order stored in ``indices``.
        Each title is also printed on its own line.

    Raises
    ------
    IndexError
        Propagated from ``get_index_from_name`` when the title is unknown.
    """
    found_id = get_index_from_name(query)
    neighbour_rows = indices[found_id]

    # The neighbour search was run on the training rows themselves, so the
    # queried book is normally among its own neighbours, but not necessarily
    # in first position when other rows have identical features (distance 0).
    # Remove it by value instead of blindly discarding position 0, and keep
    # the same number of suggestions as before (one fewer than the row length).
    wanted = len(neighbour_rows) - 1
    similar_rows = [row for row in neighbour_rows if row != found_id][:wanted]

    suggestion = []
    for row in similar_rows:
        title = df.iloc[row]["title"]
        print(title)
        suggestion.append(title)
    return suggestion

In [23]:
def get_id_from_partial_name(partial):
    """Print every title containing ``partial`` together with its list position.

    Parameters
    ----------
    partial : str
        Substring to search for; the match is a case-sensitive ``in`` test
        against each entry of ``all_books_names``.

    Returns
    -------
    None
        Matches are printed as ``<title> <position>``, one per line.
    """
    # enumerate yields each title's own position. Calling list.index(title)
    # rescanned the list for every hit and always reported the first
    # occurrence, so duplicate titles were all printed with the same position.
    for position, title in enumerate(all_books_names):
        if partial in title:
            print(title, position)

### Experiments with FuzzyWuzzy Logic

In [24]:
#Find if we have the book in our library
def find_the_book(find):
    '''
    Look up the titles in our book list that resemble the user's search text.

    The search text is converted to a string and fuzzy-matched against
    all_books_names with fuzzywuzzy's process.extract, scored by
    fuzz.partial_token_sort_ratio.

    At most 10 matches are requested. Only the matched titles are returned,
    as a list in the order process.extract gives them; the scores are dropped.
    '''
    search_text = str(find)
    scored_matches = process.extract(
        search_text,
        choices=all_books_names,
        scorer=fuzz.partial_token_sort_ratio,
        limit=10,
    )
    # Each match is a sequence whose first element is the title.
    return [match[0] for match in scored_matches]

In [24]:
# List Comprehension
# [x[0] for x in a]

# Pickle Files

In [26]:
#pickle.dump(model, open('model.pkl','wb'))

In [27]:
#pickle.dump(indices, open('knn.pkl','wb'))