In [306]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline
import category_encoders as ce

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Enable module reloading so that edits to imported modules (e.g. the local
# `src` package) are picked up without restarting the kernel.
# NOTE(review): the recorded output ("already loaded") indicates this cell was
# re-run in a live kernel; on a fresh kernel `%load_ext` loads it silently.
%load_ext autoreload
%autoreload 2

# Do not truncate the column list when displaying wide DataFrames.
pd.set_option('display.max_columns', None)

# Plot defaults: figure size, high-resolution ("retina") inline rendering and
# base font size.
plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [261]:
# NOTE(review): `pp.Re` looks like an unfinished attribute lookup left over
# from interactive exploration. The recorded output of this cell names
# `ExportSimilarBooksRating`, so source and output appear to be out of sync.
# Consider removing the cell or restoring the full attribute name — TODO confirm.
pp.Re

src.preprocessing.transform_column.ExportSimilarBooksRating

In [3]:
from sklearn.base import TransformerMixin
import json
from sklearn.model_selection import train_test_split


# Data preprocessing

In this notebook, we preprocess our data so that it can be used for training.

## Load datasets

In [93]:
# Read the raw tables used in this notebook, in this order: books, reviews,
# authors. The interactions table is not loaded; its read is kept below,
# disabled, exactly as it was:
# intr = pd.read_csv('../data/interactions.csv')
books, reviews, authors = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/books.csv',
        '../data/reviews.csv',
        '../data/authors.csv',
    )
)

## Merge data

In [12]:
# Attach book metadata to every review by joining the two tables on `book_id`
# (pandas' default join type is used, as no `how` is given).
data = books.merge(reviews, on='book_id')

In [13]:
# Preview the first three rows of the merged frame.
data.head(3)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,user_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,,1,['147734'],US,,"[{'count': '1057', 'name': 'to-read'}, {'count...",B0056A00P4,True,4.04,B0056A00P4,"['519546', '1295074', '21407416']",This is the final tale in the bestselling auth...,,https://www.goodreads.com/book/show/12182387-t...,"[{'author_id': '50873', 'role': ''}, {'author_...",,,,,,,,https://www.goodreads.com/book/show/12182387-t...,https://s.gr-assets.com/assets/nophoto/book/11...,12182387,4,285263,"The Passion (Dark Visions, #3)","The Passion (Dark Visions, #3)",8a6085f339853bb493a8341f0d7e3bdf,fc61f4a89afd084140b9ea2090e552e2,5,Nachdem Gabriel und Lydia verschwunden sind un...,Fri Sep 09 12:32:42 -0700 2011,Wed Oct 26 11:11:47 -0700 2011,Tue Oct 25 00:00:00 -0700 2011,Sun Oct 23 00:00:00 -0700 2011,0,0
1,,2,['425995'],US,,"[{'count': '1010', 'name': 'to-read'}, {'count...",B006KLYIAG,True,3.8,B006KLYIAG,"['13400912', '13327517', '18107102', '15797097...",Life should be simple for Cassie.\nFor the sma...,,https://www.goodreads.com/book/show/20135365-h...,"[{'author_id': '5395324', 'role': ''}]",,,,,,,,https://www.goodreads.com/book/show/20135365-h...,https://s.gr-assets.com/assets/nophoto/book/11...,20135365,5,18450480,Hope's Daughter,Hope's Daughter,c7cafc5c262441aaa9fc8c816dcd20d5,9f5ee0e6211043932bcb46793222c2f6,4,I received this book from the author in exchan...,Wed Apr 04 11:09:28 -0700 2012,Thu Mar 20 11:44:09 -0700 2014,Thu Mar 20 11:44:09 -0700 2014,Wed Mar 19 00:00:00 -0700 2014,2,0
2,698143760.0,17,['493993'],US,,"[{'count': '1799', 'name': 'fantasy'}, {'count...",,True,3.8,,"['15728807', '17182499', '15673520', '16081758...",Wanted by no one.\nHunted by everyone.\nSixtee...,ebook,https://www.goodreads.com/book/show/21401181-h...,"[{'author_id': '7314532', 'role': ''}]",Viking Children's,416.0,4.0,9780698143760.0,3.0,,2014.0,https://www.goodreads.com/book/show/21401181-h...,https://images.gr-assets.com/books/1394747643m...,21401181,33,24802827,"Half Bad (Half Life, #1)","Half Bad (Half Life, #1)",88d99966e16ad22b3d824758e41bdc31,d1c5a076f2433511b239dd4745d88f99,4,Dark story about a young soon to be witch boy....,Fri Mar 24 04:36:25 -0700 2017,Mon Mar 27 13:28:45 -0700 2017,Mon Mar 27 13:28:45 -0700 2017,Fri Mar 24 04:36:26 -0700 2017,0,0


## Train/Test split

In [14]:
# Columns that must not appear among the features: `rating` is removed here
# so that the feature matrix does not contain it.
drop_cols = ['rating']
X = data.drop(columns=drop_cols)

In [15]:
# Target vector: the `rating` column of the merged frame.
y = data['rating']

In [16]:
# Hold out 33% of the rows as the test set; the fixed `random_state` makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Create preprocessing pipeline

### Books
In this section, we will try to preprocess the data from the books dataset.

The problems that preprocessing needs to solve are:
- drop columns that do not contain any useful information for our task (these are also the columns that contain most of the missing values)
- replace the authors column with the average rating of the authors
- export shelves (by default 'favorites', 'currently-reading', 'to-read', or any other) together with their number of votes
- normalize and scale the numeric attributes

### Reviews

In this section, we will try to preprocess the data from the reviews dataset.

The problems that preprocessing needs to solve are:
- drop columns that do not contain any useful information
- since we found out during analysis that the longest reviews contain a lot of useless data, we will set a threshold for the maximum length of a review (in number of words)
- get rid of reviews that are not in English
- remove URLs from reviews
- remove other special characters
- normalize and scale the numeric attributes

In [21]:
# List the feature columns present in the training split.
X_train.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series', 'user_id', 'review_id', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [253]:
# Size of the training split as (rows, columns).
X_train.shape

(1601233, 38)

In [282]:
# 40th percentile of the per-book `average_rating`.
# NOTE(review): the value is not referenced by any other visible cell —
# presumably exploratory; confirm whether this cell is still needed.
books.average_rating.quantile(0.4)

3.79

In [303]:
# Columns removed by the first pipeline step, before any other processing.
drop_cols1 = ['isbn', 'series', 'country_code', 'language_code',
              'asin', 'kindle_asin',
              'description', 
              'format',
              'link',
              'publisher',
              'num_pages',
              'publication_day', 'isbn13', 'publication_month',
              'edition_information',
              'publication_year', 'url', 'image_url',
              'title',
              'title_without_series',
              'date_added', 'date_updated', 'read_at', 'started_at',
              'n_votes',
              'n_comments'
             ]

# Columns removed by the last active step. The disabled `Export*` steps below
# take these column names as arguments, which is presumably why they are
# dropped late rather than with `drop_cols1` — TODO confirm.
drop_cols2 = ['popular_shelves', 'authors', 'similar_books']

# Columns handed to the outlier-replacement step.
replace_outliers_cols = ['text_reviews_count']

# Shelf names for the `ExportBookShelves` step, which is currently commented out.
tags = ['favorites', 'currently-reading', 'to-read']

# Encoder for the `EncodeCategories` step, which is currently commented out.
encoder = ce.OneHotEncoder()

# Preprocessing pipeline for the merged books/reviews frame. Commented-out
# entries are steps that are disabled in this version of the cell.
# NOTE(review): the recorded run of this cell fails with
# "AttributeError: module 'src.preprocessing' has no attribute
# 'ReplaceOutliersWithPercentile'" — the class name must be checked against
# `src/preprocessing` before this cell can run.
# NOTE(review): the outlier step is called with (0.9, 0.1) while the recorded
# log of an earlier run prints "percentiles: ( 0.1 0.9 )"; confirm the expected
# argument order of that transformer.
ppl = Pipeline([
    ('DropUnusedCols1', pp.DropColumns(drop_cols1)),
    ('SelectTopNPercentileOfBooks', pp.SelectBooksWithNPercentile('text_reviews_count', 0.9)),
    ('ReplaceOutliersWithPercentile', pp.ReplaceOutliersWithPercentile(replace_outliers_cols, 0.9, 0.1)),
#     ('ExportAuthorsAverageRating', pp.ExportAuthorsAverageRating('authors', 'authors_average_rating', authors)),
#     ('ExportSimilarBooksAverageRating', pp.ExportSimilarBooksRating(
#         books[['book_id','average_rating']], 'similar_books', 'sim_books_average_rating')),
#     ('ExtraxtPopularShelves',pp.ExportBookShelves('popular_shelves', tags)),
#     ('EmptyValuesFilter', pp.EmptyValuesFilter(['review_text'])),
#     ('TextPreprocessor', pp.TextPreprocessor('review_text')),
#     ('ReviewLengthFilter', pp.ReviewLengthFilter('review_text', 0, 2000)),
#     ('ReviewsLanguageFilter', pp.ReviewsLanguageFilter('review_text', 'en')),
    ('DropUnusedCols2', pp.DropColumns(drop_cols2)),
#     ('EncodeCategories', pp.EncodeCategories(encoder))
])

# Fit on the first 5000 training rows only (a small sample rather than the
# full split); the result of `fit` is kept in `model`.
model = ppl.fit(X_train.iloc[:5000])

AttributeError: module 'src.preprocessing' has no attribute 'ReplaceOutliersWithPercentile'

In [300]:
# Run the pipeline's transform on the first 5000 rows of the training split.
# NOTE(review): the slice size 5000 is a literal repeated from the cell that
# fits `ppl`; keep the two values in sync.
pX_train = ppl.transform(X_train.iloc[:5000])

(transform) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'description', 'format', 'link', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'title', 'title_without_series', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']
(transform) Select books with: text_reviews_count >= 22363.0
(fit) Replace outliers for:  ['text_reviews_count'] with percentiles: ( 0.1 0.9 )
(transform) Drop columns: ['popular_shelves', 'authors', 'similar_books']


In [295]:
# Preview the first rows of the transformed training sample.
pX_train.head()

Unnamed: 0,text_reviews_count,is_ebook,average_rating,book_id,ratings_count,work_id,user_id,review_id,review_text
1703631,40116.0,False,4.21,22628,906322,2236198,ef7adbc97c4a7280882df0f888f1ab92,3c4257add627a71bf658df27348a4866,"If there was a list of banned books, I think t..."
1700061,32909.0,False,3.69,428263,1146155,2675454,a7958ca78024f0c9a30e5df0bed69560,d5a3b09f72e11ee7c2692738261c9214,My favourite of the series! Now that all the m...
2313973,68482.0,False,4.23,13335037,1962813,13155899,9e49b762b9c7b6aa8430e9a12ef7fd59,73001ec28d5d5fde69709a6924b7a1f5,I thought this book was decent but some things...
612566,31536.0,False,4.43,11387515,255461,16319487,9babe654bdebdcd89f4044e4202c8260,5b4cb7933029d4313b3648f521a07a76,I don't think words can describe the experienc...
616703,31536.0,False,4.43,11387515,255461,16319487,17011978371273cb19718587a0c81933,968ec59fc0778d42108c3c0c6e1a4289,I finsished this book a long time ago. I like ...


In [10]:
# Date columns of the reviews table, removed by the first step.
# NOTE(review): this rebinds `drop_cols1`, a name the books pipeline cell also
# assigns (with a much longer list). Whichever cell ran last wins, so results
# depend on execution order — consider a distinct name.
drop_cols1 = ['date_added', 'date_updated', 'read_at', 'started_at']

# Pipeline applied to the reviews table alone. Step behaviour is inferred from
# the class names (the `pp` implementations are not visible here): drop the
# date columns, filter rows on empty `review_text`, preprocess the text,
# filter on review length with bounds (0, 2000), and filter on language 'en'.
reviews_ppl = Pipeline([
    ('DropUnusedCols1', pp.DropColumns(drop_cols1)),
    ('EmptyValuesFilter', pp.EmptyValuesFilter(['review_text'])),
    ('TextPreprocessor', pp.TextPreprocessor('review_text')),
    ('ReviewLengthFilter', pp.ReviewLengthFilter('review_text', 0, 2000)),
    ('ReviewsLanguageFilter', pp.ReviewsLanguageFilter('review_text', 'en'))
])

# Fit on the complete `reviews` table (not on the training split).
# NOTE(review): this rebinds `model`, which the books pipeline cell also
# assigns; confirm which object later cells expect.
model = reviews_ppl.fit(reviews)

(fit) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(transform) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(fit) Empty values filter
(transform) Empty values filter
(fit) Text preprocessing
(transform) Text preprocessing
(fit) Review length filter
(transform) Review length filter
(fit) Reviews language filter
