# Predicting Book Ratings Using K-Nearest Neighbor

## Required Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder


## Importing Data

In [2]:
# import data into colab

ratings_df = pd.read_csv("Ratings.csv", sep=';', encoding='latin-1',low_memory=False)
books_df = pd.read_csv("Books.csv", sep=';', encoding='latin-1',low_memory=False, on_bad_lines='skip')
users_df = pd.read_csv("Users.csv", sep=';', encoding='latin-1',low_memory=False,on_bad_lines='skip')

In [3]:
# books df sample
ratings_df.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [4]:
# users df sample
users_df.head

<bound method NDFrame.head of         User-ID                            Location   Age
0             1                  nyc, new york, usa   NaN
1             2           stockton, california, usa  18.0
2             3     moscow, yukon territory, russia   NaN
3             4           porto, v.n.gaia, portugal  17.0
4             5  farnborough, hants, united kingdom   NaN
...         ...                                 ...   ...
278853   278854               portland, oregon, usa   NaN
278854   278855  tacoma, washington, united kingdom  50.0
278855   278856           brampton, ontario, canada   NaN
278856   278857           knoxville, tennessee, usa   NaN
278857   278858                dublin, n/a, ireland   NaN

[278858 rows x 3 columns]>

## Data Preprocessing

### Check null values in datasets

In [5]:
# check null values
print(users_df.isnull().sum())
print(users_df.count())

User-ID          0
Location         0
Age         110762
dtype: int64
User-ID     278858
Location    278858
Age         168096
dtype: int64


In [6]:
print(ratings_df.isnull().sum())
print(ratings_df.count())

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64
User-ID        1149780
ISBN           1149780
Book-Rating    1149780
dtype: int64


In [7]:
print(books_df.isnull().sum())
print(books_df.count())

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
ISBN                   271360
Book-Title             271360
Book-Author            271359
Year-Of-Publication    271360
Publisher              271358
Image-URL-S            271360
Image-URL-M            271360
Image-URL-L            271357
dtype: int64


The dataset with the most null values is the users dataset specifically the age column. Age is going to be an important factor when predicting user ratings therefore missing null values in this column will be dropped and the remaining users used to make predictions for better accuracy.

In [8]:
print(books_df.isnull().sum())
print(books_df.count())

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
ISBN                   271360
Book-Title             271360
Book-Author            271359
Year-Of-Publication    271360
Publisher              271358
Image-URL-S            271360
Image-URL-M            271360
Image-URL-L            271357
dtype: int64


In [9]:
users_df = users_df.dropna()
books_df = books_df.dropna()

In [10]:
books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
books_df['Year-Of-Publication'] = books_df['Year-Of-Publication'].astype(int)

### Rename Columns

In [11]:
users_df.rename(columns = {'User-ID':'user_id'}, inplace = True)
ratings_df.rename(columns = {'User-ID':'user_id'}, inplace = True)

print(users_df.columns)

Index(['user_id', 'Location', 'Age'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_df.rename(columns = {'User-ID':'user_id'}, inplace = True)


### Merge Datasets

In [12]:
# merge ratings and users data frames first
book_ratings = pd.merge(users_df, ratings_df, on="user_id")
book_ratings.head

<bound method NDFrame.head of         user_id                         Location   Age        ISBN  \
0             2        stockton, california, usa  18.0  0195153448   
1            10       albacete, wisconsin, spain  26.0  1841721522   
2            10       albacete, wisconsin, spain  26.0  8477024456   
3            19                        weston, ,  14.0  0375759778   
4            20     langhorne, pennsylvania, usa  19.0  0425163091   
...         ...                              ...   ...         ...   
840283   278851               dallas, texas, usa  33.0  1558531025   
840284   278851               dallas, texas, usa  33.0  1566910102   
840285   278851               dallas, texas, usa  33.0  1569661057   
840286   278851               dallas, texas, usa  33.0  1885071213   
840287   278852  brisbane, queensland, australia  32.0  0449907597   

        Book-Rating  
0                 0  
1                 0  
2                 6  
3                 7  
4                 0

In [13]:
book_ratings = pd.merge(book_ratings, books_df, on="ISBN")
book_ratings.head

<bound method NDFrame.head of         user_id                     Location   Age        ISBN  Book-Rating  \
0             2    stockton, california, usa  18.0  0195153448            0   
1            10   albacete, wisconsin, spain  26.0  1841721522            0   
2          3675  barcelona, catalunya, spain  24.0  1841721522            0   
3          6366        madrid, madrid, spain  20.0  1841721522            7   
4         13872  barcelona, catalunya, spain  17.0  1841721522            0   
...         ...                          ...   ...         ...          ...   
753291   278851           dallas, texas, usa  33.0  0743203763            0   
753292   278851           dallas, texas, usa  33.0  0767907566            5   
753293   278851           dallas, texas, usa  33.0  0884159221            7   
753294   278851           dallas, texas, usa  33.0  0912333022            7   
753295   278851           dallas, texas, usa  33.0  1569661057           10   

                     

In [14]:
book_ratings['ISBN'] = pd.to_numeric(book_ratings['ISBN'], errors='coerce')
book_ratings['ISBN'] = book_ratings['ISBN'].astype(str)

In [15]:
# Remove strings that ended up in Age variable
book_ratings['Age'] = book_ratings['Age'].astype(int)
book_ratings['Age'] = book_ratings['Age'].astype(str)
mask = (book_ratings['Age'].str.len() <= 2)

In [16]:
book_ratings = book_ratings.loc[mask]
book_ratings['Age'].shape
book_ratings['Age'].astype(str).astype(int)
book_ratings.dtypes

user_id                 int64
Location               object
Age                    object
ISBN                   object
Book-Rating             int64
Book-Title             object
Book-Author            object
Year-Of-Publication     int32
Publisher              object
Image-URL-S            object
Image-URL-M            object
Image-URL-L            object
dtype: object

In [17]:
import numpy as np
book_ratings.ISBN =book_ratings.ISBN.str.replace(r'[^\w\d]+', '')
avg_ratings = book_ratings.groupby('ISBN')['Book-Rating'].mean().round().astype(np.int8)


  book_ratings.ISBN =book_ratings.ISBN.str.replace(r'[^\w\d]+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings.ISBN =book_ratings.ISBN.str.replace(r'[^\w\d]+', '')


In [18]:
book_ratings.loc[book_ratings['Book-Rating'] == 0, 'Book-Rating'] = book_ratings.loc[book_ratings['Book-Rating'] == 0, 'ISBN'].map(avg_ratings)

In [19]:
book_ratings.describe()

Unnamed: 0,user_id,Book-Rating,Year-Of-Publication
count,750270.0,750270.0,750270.0
mean,141187.35644,3.998786,1967.714668
std,78777.014076,3.250217,233.21645
min,2.0,0.0,0.0
25%,74453.0,1.0,1991.0
50%,141472.0,3.0,1997.0
75%,208622.0,7.0,2001.0
max,278852.0,10.0,2050.0


In [20]:
# Encoding categorical variables
l1 = LabelEncoder()
l1.fit(book_ratings['Publisher'])
book_ratings.Publisher = l1.transform(book_ratings.Publisher)

l1.fit(book_ratings['Location'])
book_ratings.Location = l1.transform(book_ratings.Location)


book_ratings.rename(columns={"Book-Title": "Title"}, inplace = True)
l1.fit(book_ratings['Title'])
book_ratings.Title = l1.transform(book_ratings.Title)

l1.fit(book_ratings['Book-Author'])

book_ratings.rename(columns={"Book-Author": "Author"}, inplace = True)
book_ratings.Author = l1.transform(book_ratings.Author)
book_ratings.columns

book_ratings.head

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings.Publisher = l1.transform(book_ratings.Publisher)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings.Location = l1.transform(book_ratings.Location)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings.rename(columns={"Book-Title": "Title"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

<bound method NDFrame.head of         user_id  Location Age         ISBN  Book-Rating   Title  Author  \
0             2     13746  18   1951534480            0   30755   55700   
1            10       215  26  18417215220            2  109614   12108   
2          3675      1000  24  18417215220            2  109614   12108   
3          6366      8430  20  18417215220            7  109614   12108   
4         13872      1000  17  18417215220            2  109614   12108   
...         ...       ...  ..          ...          ...     ...     ...   
753291   278851      3610  33   7432037630            0   14188   68612   
753292   278851      3610  33   7679075660            5    9473   74607   
753293   278851      3610  33   8841592210            7  199391   14323   
753294   278851      3610  33   9123330220            7  149322   39281   
753295   278851      3610  33  15696610570           10   36769   53900   

        Year-Of-Publication  Publisher  \
0                      2002

### Split into Training and Testing datasets

In [21]:
book_ratings.columns

Index(['user_id', 'Location', 'Age', 'ISBN', 'Book-Rating', 'Title', 'Author',
       'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M',
       'Image-URL-L'],
      dtype='object')

In [22]:
y= book_ratings["Book-Rating"]
book_ratings.drop(['Book-Rating', 'ISBN', 'Image-URL-S',
       'Image-URL-M', 'Image-URL-L'], axis = 1, inplace=True)
X = book_ratings[["Year-Of-Publication","Age"]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings.drop(['Book-Rating', 'ISBN', 'Image-URL-S',


In [23]:
book_ratings.columns

Index(['user_id', 'Location', 'Age', 'Title', 'Author', 'Year-Of-Publication',
       'Publisher'],
      dtype='object')

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## KNN Model


In [25]:
k = 3
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
print("Accuracy of model at K=3 is",metrics.accuracy_score(y_test, Pred_y))

Accuracy of model at K=3 is 0.164687379210151


## Optimize Model

In [26]:
k = 4  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
print("Accuracy of model at K=4 is",metrics.accuracy_score(y_test, Pred_y))

Accuracy of model at K=4 is 0.16755967851573433


In [27]:
k = 5  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
print("Accuracy of model at K=5 is",metrics.accuracy_score(y_test, Pred_y))

Accuracy of model at K=5 is 0.17079184826795688


In [28]:
k = 10
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
print("Accuracy of model at K=10 is",metrics.accuracy_score(y_test, Pred_y))

Accuracy of model at K=10 is 0.18053500739733697


In [30]:
X_train

Unnamed: 0,Year-Of-Publication,Age
402478,2001,34
507764,1962,43
269534,1992,35
377666,1982,3
592560,1977,35
...,...,...
172399,1998,46
123169,1998,39
29800,1986,17
315877,1995,32


In [29]:
import pickle 

knnPickle = open('knn_model.pkl', 'wb') 
      
# source, destination 
pickle.dump(neigh, knnPickle)  

# close the file
knnPickle.close()