# KNN algorithm experiment

Importing the datasets


In [1]:
import pandas as pd
import numpy as np
from pprint import pprint as pp
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

df_ratings = pd.read_csv("../data/Ratings.csv", na_values=["null", "nan", ""])
df_books = pd.read_csv(
    "../data/Books.csv",
    na_values=["null", "nan", ""],
    usecols=["ISBN", "Book-Title", "Book-Author"],
)
df_users = pd.read_csv("../data/Users.csv", na_values=["null", "nan", ""])

df_books = df_books.fillna("NaN")
df_ratings = df_ratings.dropna()

## Getting book rating count

- Grouping the ratings by ISBN
- Getting the Book-Rating count


In [2]:
combine_book_ratings = pd.merge(df_ratings, df_books, on="ISBN")
combine_book_ratings = combine_book_ratings.drop(["Book-Author"], axis="columns")

book_rating_count = (
    combine_book_ratings.groupby(by=["ISBN"])["Book-Rating"]
    .count()
    .reset_index()
    .rename(columns={"Book-Rating": "RatingCount"})
)[["ISBN", "RatingCount"]]

## Getting most rated books

- We get the RatingCount quantile values from 90% to 100%
- For statistical significance we choose top 10% most rated books


In [3]:
book_rating_with_total_count = combine_book_ratings.merge(
    book_rating_count, on=["ISBN"], how="left"
)
pp(book_rating_with_total_count["RatingCount"].quantile(np.arange(0.9, 1, 0.01)))
# Top 10% of rating counts
popularity_threshold = 136

rating_popular_books = book_rating_with_total_count.query(
    "RatingCount >= @popularity_threshold"
)

0.90    136.0
0.91    150.0
0.92    167.0
0.93    184.0
0.94    209.0
0.95    236.0
0.96    277.0
0.97    350.0
0.98    420.0
0.99    568.0
Name: RatingCount, dtype: float64


## Defining KNN model

- Using pivot table we define the model
  - Index: ISBN
  - Columns: User-ID
  - values: Book-Rating
- KNN model
  - metric: cosine
  - alogrithm: auto


In [4]:
pivot = (
    rating_popular_books.drop_duplicates(["Book-Title", "User-ID"])
    .pivot(index="ISBN", columns="User-ID", values="Book-Rating")
    .fillna(0)
)

model_knn = NearestNeighbors(metric="cosine", algorithm="auto")
model_knn.fit(csr_matrix(pivot.values))

## Recomendation system func

- Using the ISBN as index for the pivot matrix.
- We utilize the knn model from `sklearn.neighbors`.
- Then we zip the neighbors' ISBNs and distance to searched one.
- Due to errors we return that ISBN is not in top 10% of rated books.


In [5]:
def get_recommends(isbn="", k_neighbors=5):
    try:
        x = pivot.loc[isbn].array.reshape(1, -1)
        distances, indices = model_knn.kneighbors(x, n_neighbors=k_neighbors)
        R_books = []
        for distance, indice in zip(distances[0], indices[0]):
            if distance != 0:
                R_book = combine_book_ratings[
                    combine_book_ratings["ISBN"] == pivot.index[indice]
                ]["Book-Title"].values[0]
                R_books.append([R_book, distance])
        recommended_books = [isbn, R_books[::-1]]
        return recommended_books
    except:
        return f"{isbn} is not in the top books"

## Recomendation system tests

    Using popular ISBNs we can find the closest neighbors to them


In [6]:
pp(get_recommends("1558745157"))

['1558745157',
 [["Left Behind: A Novel of the Earth's Last Days (Left Behind No. 1)",
   np.float64(0.9381235623430844)],
  ['Night Sins', np.float64(0.9352435834429496)],
  ['On the Street Where You Live', np.float64(0.9258003569774721)],
  ['A Child Called \\It\\": One Child\'s Courage to Survive"',
   np.float64(0.7356135842239919)],
  ["The Lost Boy: A Foster Child's Search for the Love of a Family",
   np.float64(3.3306690738754696e-16)]]]
