## Importing Libraries

In [617]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import CountEncoder

## Importing Data

In [618]:
books = pd.read_csv("data/books.csv", sep=";", encoding="latin-1", low_memory=False, on_bad_lines='skip').drop(["Image-URL-S", "Image-URL-M", "Image-URL-L"], axis=1)
ratings = pd.read_csv("data/ratings.csv", sep=";", encoding="latin-1", low_memory=False, on_bad_lines='skip')
users = pd.read_csv("data/users.csv", sep=";", encoding="latin-1", low_memory=False, on_bad_lines='skip')

## Cleaning Data

In [619]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
dtypes: object(5)
memory usage: 10.4+ MB


In [620]:
books[books["Book-Author"].isna()]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
118033,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing


In [621]:
books[books["Publisher"].isna()]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,


In [622]:
books = books.dropna()

---

In [623]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


---

In [624]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [625]:
users.describe()

Unnamed: 0,User-ID,Age
count,278858.0,168096.0
mean,139429.5,34.751434
std,80499.51502,14.428097
min,1.0,0.0
25%,69715.25,24.0
50%,139429.5,32.0
75%,209143.75,44.0
max,278858.0,244.0


In [626]:
users["Age"] = users["Age"].fillna(users["Age"].median())

---

In [627]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [628]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [629]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",32.0
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",32.0
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",32.0


## Merging Dataframes

In [630]:
users_ratings = pd.merge(users, ratings, on="User-ID", how="inner")
books_users_ratings = pd.merge(books, users_ratings, on="ISBN", how="inner")
df = books_users_ratings.copy()

In [631]:
df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Location,Age,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,"stockton, california, usa",18.0,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,"timmins, ontario, canada",32.0,5
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,"ottawa, ontario, canada",49.0,0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,"n/a, n/a, n/a",32.0,8
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,"sudbury, ontario, canada",32.0,0
...,...,...,...,...,...,...,...,...,...
1031127,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463,"providence, rhode island, usa",26.0,7
1031128,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,276579,"orem, utah, usa",12.0,4
1031129,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,276680,"hopewell junction, new york, usa",55.0,0
1031130,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,276680,"hopewell junction, new york, usa",55.0,0


---

### Idea

We will make a book recommendation system based in the user's last read books. Our final objective is to recommend 10 books for each person. For this, we will first clusterize the books after making a PCA (Principal Component Analysis) considering some characteristics:

- Year of Publication
- Mean Age of Readers by Book
- Mean Age of Readers by Author
- Mean Age of Readers by Publisher
- Mean Rating by Book
- Mean Rating by Author
- Mean Rating by Publisher
- Mean Rating by Location
- Top Location by Book (Count Encoded) - where the book has been more read
- Book Contribution to Top Location - what is the fraction of readings the book contributed to in its top place
- Book Author (Count Encoded)
- Publisher (Count Encoded)
- Book Appearance (Count encoding of ISBN)

Then, we will recommend numbers of books of the clusters which have books the person already read using K-Nearest Neighbors. Each number will depend on the proportion of books of each cluster the person has read. If it's not possible to complete the 10 books with this technique, books from other clusters will be recommended - in this case, clusters close to the ones from which the books the person read come from will be chosen. 

## Data Preprocessing

### Mean Ages and Ratings

In [632]:
mean_age_by_book = df.groupby("Book-Title")["Age"].mean().rename("Mean-Age-By-Book")
df = pd.merge(df, mean_age_by_book, on="Book-Title", how="inner")

In [633]:
mean_age_by_author = df.groupby("Book-Author")["Age"].mean().rename("Mean-Age-By-Author")
df = pd.merge(df, mean_age_by_author, on="Book-Author", how="inner")

In [634]:
mean_age_by_publisher = df.groupby("Publisher")["Age"].mean().rename("Mean-Age-By-Publisher")
df = pd.merge(df, mean_age_by_publisher, on="Publisher", how="inner")

In [635]:
mean_rating_by_book = df.groupby("Book-Title")["Book-Rating"].mean().rename("Mean-Rating-By-Book")
df = pd.merge(df, mean_rating_by_book, on="Book-Title", how="inner")

In [636]:
mean_rating_by_author = df.groupby("Book-Author")["Book-Rating"].mean().rename("Mean-Rating-By-Author")
df = pd.merge(df, mean_rating_by_author, on="Book-Author", how="inner")

In [637]:
mean_rating_by_publisher = df.groupby("Publisher")["Book-Rating"].mean().rename("Mean-Rating-By-Publisher")
df = pd.merge(df, mean_rating_by_publisher, on="Publisher", how="inner")

In [638]:
mean_rating_by_location = df.groupby("Location")["Book-Rating"].mean().rename("Mean-Rating-By-Location")
df = pd.merge(df, mean_rating_by_location, on="Location", how="inner")

In [639]:
df = df.drop(["Age", "Book-Rating", "User-ID", "Book-Title"], axis=1)

### Top Location By Book

In [640]:
location_count_by_book = df.groupby(["ISBN", "Location"]).size().reset_index(name="Readers-Count-In-Top-Location")
top_location_by_book = location_count_by_book.loc[location_count_by_book.groupby("ISBN")["Readers-Count-In-Top-Location"].idxmax()].rename({"Location": "Top-Location-By-Book"}, axis=1)
df = pd.merge(df, top_location_by_book, on="ISBN", how="inner")

### Count Encodings

In [641]:
location_encoder = CountEncoder()
df["Location-Encoded"] = location_encoder.fit_transform(df["Location"])

In [642]:
location_encodings = df.groupby(["Location"])["Location-Encoded"].max().rename("Top-Location-By-Book-Encoded")
df = df.drop(["Location-Encoded"], axis=1)
df = pd.merge(df, location_encodings, left_on="Top-Location-By-Book", right_on="Location", how="inner")

In [643]:
df = df.drop(["Location", "Top-Location-By-Book"], axis=1)

In [644]:
author_encoder = CountEncoder()
df["Book-Author-Encoded"] = author_encoder.fit_transform(df["Book-Author"])
df = df.drop(["Book-Author"], axis=1)

In [645]:
publisher_encoder = CountEncoder()
df["Publisher-Encoded"] = publisher_encoder.fit_transform(df["Publisher"])
df = df.drop(["Publisher"], axis=1)

### Book Contribution to Top Location

In [646]:
df["Book-Contribution-To-Top-Location"] = df["Readers-Count-In-Top-Location"]/df["Top-Location-By-Book-Encoded"]

df = df.drop(["Readers-Count-In-Top-Location"], axis=1)

### Book Appearance

In [647]:
isbn_encoder = CountEncoder()
df["Book-Appearance"] = isbn_encoder.fit_transform(df["ISBN"])

### Dropping duplicates

In [648]:
df = df.drop_duplicates()

In [649]:
df

Unnamed: 0,ISBN,Year-Of-Publication,Mean-Age-By-Book,Mean-Age-By-Author,Mean-Age-By-Publisher,Mean-Rating-By-Book,Mean-Rating-By-Author,Mean-Rating-By-Publisher,Mean-Rating-By-Location,Top-Location-By-Book-Encoded,Book-Author-Encoded,Publisher-Encoded,Book-Contribution-To-Top-Location,Book-Appearance
0,0195153448,2002,24.000,24.000000,34.682643,3.500,3.500000,3.496732,3.730612,245,2,2754,0.004082,1
1,0764500546,1996,28.500,28.500000,36.338095,5.000,5.000000,3.347619,3.730612,245,2,630,0.004082,1
2,0060907924,1980,52.000,31.305085,37.791306,10.000,3.644068,2.626382,3.730612,245,59,6694,0.004082,1
3,0425173305,2000,32.875,33.357143,36.940554,4.625,5.071429,2.424827,3.730612,245,14,28614,0.004082,3
4,0425173305,2000,32.875,33.357143,36.940554,4.625,5.071429,2.424827,2.947639,245,14,28614,0.004082,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031127,1902012089,0,32.000,32.000000,32.000000,0.000,0.000000,0.000000,0.000000,1,2,2,1.000000,1
1031128,0865341877,1993,80.000,80.000000,80.000000,0.000,0.000000,0.000000,0.000000,1,1,1,1.000000,1
1031129,9684271980,1995,32.000,32.000000,32.000000,8.000,8.000000,8.000000,8.000000,1,1,1,1.000000,1
1031130,0971991200,2002,51.000,51.000000,51.000000,0.000,0.000000,0.000000,0.000000,1,1,1,1.000000,1
