In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline
import category_encoders as ce

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules (e.g. the local `src` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Default matplotlib figure size, as a (width, height) tuple.
plt.rcParams['figure.figsize'] = (10, 6)
# Ask the inline backend to render figures in the 'retina' format.
%config InlineBackend.figure_format='retina'
# Default font size for text in plots.
plt.rcParams.update({'font.size': 15})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
from sklearn.base import TransformerMixin
import json
from sklearn.model_selection import train_test_split


# Load datasets

In [3]:
# Read the raw CSV exports from ../data, one DataFrame per file.
# NOTE: ../data/interactions.csv is not read here; its load was left
# commented out in this cell.
books, reviews, authors = map(
    pd.read_csv,
    ('../data/books.csv', '../data/reviews.csv', '../data/authors.csv'),
)

# Data preprocessing

## books

In [13]:
# Inspect the raw column names of `books`; the drop lists in the following
# cell refer to these names.
books.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [28]:
# Raw `books` columns removed by the first pipeline step.
drop_cols1 = [
    'isbn', 'series', 'country_code', 'language_code',
    'asin', 'kindle_asin',
    'similar_books', 'description', 'format', 'link',
    'publisher', 'publication_day', 'isbn13',
    'publication_month', 'edition_information', 'publication_year',
    'url', 'image_url',
    'ratings_count', 'work_id',
    'title', 'title_without_series',
]

# Source columns removed only after the steps that read them have run.
drop_cols2 = [
    'popular_shelves',
    'authors',
]

# Shelf names passed to the ExportBookShelves step.
tags = [
    'favorites',
    'currently-reading',
    'to-read',
]

# Encoder instance handed to the final EncodeCategories step.
encoder = ce.OneHotEncoder()

# Preprocessing pipeline for `books`; steps run in the order listed.
books_ppl = Pipeline(steps=[
    (
        'DropUnusedCols1',
        pp.DropColumns(drop_cols1),
    ),
    (
        'SelectTopNPercentileOfBooks',
        pp.SelectBooksWithNPercentile('text_reviews_count', 0.9),
    ),
    (
        'ExportAuthorsAverageRating',
        pp.ExportAuthorsAverageRating('authors', 'authors_average_rating', authors),
    ),
    (
        'ExtraxtPopularShelves',
        pp.ExportBookShelves('popular_shelves', tags),
    ),
    (
        'DropUnusedCols2',
        pp.DropColumns(drop_cols2),
    ),
    (
        'EncodeCategories',
        pp.EncodeCategories(encoder),
    ),
])

# Fit every step on the raw books frame; the return value of fit() is kept
# in `model`.
model = books_ppl.fit(books)

(fit) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'similar_books', 'description', 'format', 'link', 'publisher', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'ratings_count', 'work_id', 'title', 'title_without_series']
(transform) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'similar_books', 'description', 'format', 'link', 'publisher', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'ratings_count', 'work_id', 'title', 'title_without_series']
(fit) Select books with: text_reviews_count >= 103.0
(transform) Select books with: text_reviews_count >= 103.0
(fit) Export authors average rating
transform) Export authors average rating
(fit) ExportBookShelves, tag_col: popular_shelves, tags:['favorites', 'currently-reading', 'to-read']
(transform) ExportBookShelves, tag_col:

In [31]:
# Run the fitted books pipeline over the raw `books` frame and keep the
# result under a new name, leaving `books` itself untouched.
books_transformed = books_ppl.transform(books)

(transform) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'similar_books', 'description', 'format', 'link', 'publisher', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'ratings_count', 'work_id', 'title', 'title_without_series']
(transform) Select books with: text_reviews_count >= 103.0
transform) Export authors average rating
(transform) ExportBookShelves, tag_col: popular_shelves, tags:['favorites', 'currently-reading', 'to-read']
(transform) Drop columns: ['popular_shelves', 'authors']
(transform) Category encoder OneHotEncoder(cols=[], drop_invariant=False, handle_missing='value',
              handle_unknown='value', return_df=True, use_cat_names=False,
              verbose=0)


In [32]:
# Preview the first three preprocessed book rows.
books_transformed.head(3)

Unnamed: 0,text_reviews_count,is_ebook,average_rating,num_pages,book_id,authors_average_rating,favorites,currently-reading,to-read
4,428,False,3.71,351.0,22642971,3.89,49,67,9481
19,112,False,4.1,402.0,10806009,4.02,393,31,1639
26,276,False,3.86,237.0,9463563,3.86,53,65,13204


## users/reviews

In [33]:
# Inspect the raw column names of `reviews` before choosing which to drop.
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [34]:
# Date columns of `reviews` that the pipeline removes.
# NOTE: this rebinds `drop_cols1`, which the books cell also defines.
drop_cols1 = [
    'date_added',
    'date_updated',
    'read_at',
    'started_at',
]

# Preprocessing pipeline for `reviews`: a single column-dropping step.
reviews_ppl = Pipeline(steps=[
    (
        'DropUnusedCols1',
        pp.DropColumns(drop_cols1),
    ),
])

# Fit on the raw reviews frame. NOTE: `model` is rebound here, replacing the
# value assigned when the books pipeline was fitted.
model = reviews_ppl.fit(reviews)

(fit) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']


In [35]:
# Run the fitted reviews pipeline over the raw `reviews` frame and keep the
# result under a new name.
reviews_transformed = reviews_ppl.transform(reviews)

(transform) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']


In [36]:
# Preview the first three preprocessed review rows.
reviews_transformed.head(3)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,2767052,248c011811e945eca861b5c31a549291,5,I cracked and finally picked this up. Very enj...,24,25
1,7504b2aee1ecb5b2872d3da381c6c91e,23302416,84c0936a0f9868f38e75d2f9a5cb761e,5,I read this book because my fifth grade son wa...,0,0
2,f8a89075dc6de14857561522e729f82c,18053080,785c8db878f4009da9741dea51f641da,4,Though the book started out slow and only star...,0,0


# Merge data

In [37]:
# Join each review to its preprocessed book row via the shared `book_id` key.
# pandas' default join type applies, so only book_ids present in both frames
# are kept.
data = books_transformed.merge(reviews_transformed, on='book_id')

In [38]:
# Preview the first three rows of the merged book/review frame.
data.head(3)

Unnamed: 0,text_reviews_count,is_ebook,average_rating,num_pages,book_id,authors_average_rating,favorites,currently-reading,to-read,user_id,review_id,rating,review_text,n_votes,n_comments
0,428,False,3.71,351.0,22642971,3.89,49,67,9481,929a819434fe1182015b4b95c1149346,5e008fef7e2fb6a1c5aa460153467e8f,4,*4.5 stars* \n I'm a big huge fan of the Acros...,0,0
1,428,False,3.71,351.0,22642971,3.89,49,67,9481,3aee9278dddcc99d8e0af367dff3d037,23182cfc18eb1be5c5eb55a51871e850,4,Thanks to Netgalley and the publishers for a c...,0,0
2,428,False,3.71,351.0,22642971,3.89,49,67,9481,8b089f0255ff3c591817b6d7bf3c61e6,9ae1da997e3c8474aa44c3fb5b563997,5,"First of all, I just have to say that I absolu...",0,0


# Training

## Train/Test split

In [45]:
# Columns excluded from the feature matrix: `rating` is the prediction target.
drop_cols = ['rating']

# Feature matrix: every merged column except the ones listed above.
X = data.drop(columns=drop_cols)

In [41]:
# Target vector: the `rating` column of the merged frame.
y = data['rating']

In [43]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# shuffle reproducible across runs.
# NOTE(review): X still holds every merged column except `rating`, including
# user_id, review_id and review_text; confirm these are handled before
# fitting a model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)