In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline
import category_encoders as ce

import sys
# Add the parent directory to the module search path so that the project-local
# `src` package imported below can be resolved from this notebook
# (presumably `src` lives one level above the notebook's working directory).
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis
# from src.preprocessing import ReviewsLanguageFilter

# Enable module reloading
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', None)

plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from sklearn.base import TransformerMixin
import json
from sklearn.model_selection import train_test_split


# Data preprocessing

In this notebook, we preprocess our data so that it can be used during training.

## Load datasets

In [4]:
# Read the raw CSV exports, in order: books, reviews, authors.
# (The interactions dataset, '../data/interactions.csv', is intentionally not loaded.)
books, reviews, authors = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/books.csv',
        '../data/reviews.csv',
        '../data/authors.csv',
    )
)

## Merge data

In [12]:
# Attach every review to its book; inner join on the shared book identifier.
data = books.merge(reviews, how='inner', on='book_id')

In [13]:
# Preview the first three rows of the merged frame.
data.head(n=3)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,user_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,,1,['147734'],US,,"[{'count': '1057', 'name': 'to-read'}, {'count...",B0056A00P4,True,4.04,B0056A00P4,"['519546', '1295074', '21407416']",This is the final tale in the bestselling auth...,,https://www.goodreads.com/book/show/12182387-t...,"[{'author_id': '50873', 'role': ''}, {'author_...",,,,,,,,https://www.goodreads.com/book/show/12182387-t...,https://s.gr-assets.com/assets/nophoto/book/11...,12182387,4,285263,"The Passion (Dark Visions, #3)","The Passion (Dark Visions, #3)",8a6085f339853bb493a8341f0d7e3bdf,fc61f4a89afd084140b9ea2090e552e2,5,Nachdem Gabriel und Lydia verschwunden sind un...,Fri Sep 09 12:32:42 -0700 2011,Wed Oct 26 11:11:47 -0700 2011,Tue Oct 25 00:00:00 -0700 2011,Sun Oct 23 00:00:00 -0700 2011,0,0
1,,2,['425995'],US,,"[{'count': '1010', 'name': 'to-read'}, {'count...",B006KLYIAG,True,3.8,B006KLYIAG,"['13400912', '13327517', '18107102', '15797097...",Life should be simple for Cassie.\nFor the sma...,,https://www.goodreads.com/book/show/20135365-h...,"[{'author_id': '5395324', 'role': ''}]",,,,,,,,https://www.goodreads.com/book/show/20135365-h...,https://s.gr-assets.com/assets/nophoto/book/11...,20135365,5,18450480,Hope's Daughter,Hope's Daughter,c7cafc5c262441aaa9fc8c816dcd20d5,9f5ee0e6211043932bcb46793222c2f6,4,I received this book from the author in exchan...,Wed Apr 04 11:09:28 -0700 2012,Thu Mar 20 11:44:09 -0700 2014,Thu Mar 20 11:44:09 -0700 2014,Wed Mar 19 00:00:00 -0700 2014,2,0
2,698143760.0,17,['493993'],US,,"[{'count': '1799', 'name': 'fantasy'}, {'count...",,True,3.8,,"['15728807', '17182499', '15673520', '16081758...",Wanted by no one.\nHunted by everyone.\nSixtee...,ebook,https://www.goodreads.com/book/show/21401181-h...,"[{'author_id': '7314532', 'role': ''}]",Viking Children's,416.0,4.0,9780698143760.0,3.0,,2014.0,https://www.goodreads.com/book/show/21401181-h...,https://images.gr-assets.com/books/1394747643m...,21401181,33,24802827,"Half Bad (Half Life, #1)","Half Bad (Half Life, #1)",88d99966e16ad22b3d824758e41bdc31,d1c5a076f2433511b239dd4745d88f99,4,Dark story about a young soon to be witch boy....,Fri Mar 24 04:36:25 -0700 2017,Mon Mar 27 13:28:45 -0700 2017,Mon Mar 27 13:28:45 -0700 2017,Fri Mar 24 04:36:26 -0700 2017,0,0


## Train/Test split

In [14]:
# Features: every column of the merged frame except the target.
drop_cols = ['rating']
X = data.drop(columns=drop_cols)

In [15]:
# Target: the rating attached to each review.
y = data.loc[:, 'rating']

In [16]:
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Create preprocessing pipeline

### Books
In this section, we will try to preprocess the data from the books dataset.

The problems that preprocessing needs to solve are:
- drop columns that do not contain any useful information for our task (these are also the columns that contain most of the missing values)
- replace the authors column with the average rating of the book's authors
- export shelves (by default 'favorites', 'currently-reading', 'to-read', or any other) together with their number of votes
- replace missing values in the columns we are planning to use later
- normalize and scale the numeric attributes

### Reviews

In this section, we will try to preprocess the data from the reviews dataset.

The problems that preprocessing needs to solve are:
- drop columns that do not contain any useful information
- since we found out during the analysis that the longest reviews contain a lot of useless data, we will set a threshold for the maximum length of a review (in number of words)
- get rid of reviews that are not in English
- remove URLs from reviews
- remove other special characters

In [21]:
# List the feature columns available after the split (the target column has been removed).
X_train.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series', 'user_id', 'review_id', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [22]:
# (rows, columns) of the training features.
X_train.shape

(1601233, 38)

In [40]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,user_id,review_id,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
785343,,197,['437256'],US,eng,"[{'count': '3174', 'name': 'to-read'}, {'count...",B00O6606OE,True,3.78,B00O6606OE,"['16045315', '21936669', '18518158', '18051224...","As vice president of Student Council, Kaye kno...",Kindle Edition,https://www.goodreads.com/book/show/16140843-m...,"[{'author_id': '203238', 'role': ''}]",Simon Pulse,352.0,4.0,,8.0,,2015.0,https://www.goodreads.com/book/show/16140843-m...,https://images.gr-assets.com/books/1401999803m...,16140843,1419,21971988,"Most Likely to Succeed (Superlatives, #3)","Most Likely to Succeed (Superlatives, #3)",9256019bfc43706b5ca19b079ec1b30c,0e3ba87beeb405fb0ae80886af7a5b4f,"This book was a cute, light, quick read that o...",Fri Feb 06 17:48:19 -0800 2015,Mon Sep 14 12:14:58 -0700 2015,Sat Sep 12 00:00:00 -0700 2015,Thu Sep 10 00:00:00 -0700 2015,5,0
2388979,1250000246.0,566,['270447'],US,en-US,"[{'count': '2440', 'name': 'to-read'}, {'count...",,False,4.16,B005J4EWME,"['9723976', '9526658', '11598940', '10890463',...",The second in the enthralling new mini-series ...,,https://www.goodreads.com/book/show/11551045-l...,"[{'author_id': '17015', 'role': ''}, {'author_...",St. Martin's Griffin,160.0,31.0,9781250000248.0,1.0,,2012.0,https://www.goodreads.com/book/show/11551045-l...,https://images.gr-assets.com/books/1317849415m...,11551045,12811,16491019,"Lenobia's Vow (House of Night Novellas, #2)","Lenobia's Vow (House of Night Novellas, #2)",adecc507f6316163ab3dfab5ee914408,da1e87978fd8c71557a0659d15bb769d,"I was crying in the end, but I loved the story...",Thu Nov 10 14:34:06 -0800 2011,Sun Feb 12 09:22:56 -0800 2012,Sun Feb 12 09:22:55 -0800 2012,Sat Feb 11 00:00:00 -0800 2012,0,0
589678,373211740.0,21,['517846'],US,eng,"[{'count': '3611', 'name': 'to-read'}, {'count...",,False,3.97,B00S503U5E,"['14061551', '25104766', '30201752', '10217144...","After almost a year in Japan, Katie Greene has...",Paperback,https://www.goodreads.com/book/show/23492281-s...,"[{'author_id': '4649677', 'role': ''}]",Harlequin Teen,287.0,30.0,9780373211746.0,6.0,,2015.0,https://www.goodreads.com/book/show/23492281-s...,https://images.gr-assets.com/books/1423171371m...,23492281,74,25473109,"Storm (Paper Gods, #3)","Storm (Paper Gods, #3)",bba5ba2a79a4846b7808a2ceae9f7d81,34392baeed66a7d83cf849fd9b230432,I thought this was a really satisfying conclus...,Wed Mar 11 17:44:20 -0700 2015,Thu Jul 16 09:01:19 -0700 2015,Thu Jul 02 00:00:00 -0700 2015,Thu Jul 02 00:00:00 -0700 2015,1,1
1703631,671027344.0,40116,[],US,eng,"[{'count': '427726', 'name': 'to-read'}, {'cou...",,False,4.21,B003TSEEDY,"['248704', '138202', '733111', '23232', '11620...",The critically acclaimed debut novel from Step...,Paperback,https://www.goodreads.com/book/show/22628.The_...,"[{'author_id': '12898', 'role': ''}]",MTV Books and Pocket Books,213.0,1.0,9780671027346.0,2.0,,1999.0,https://www.goodreads.com/book/show/22628.The_...,https://images.gr-assets.com/books/1167352178m...,22628,906322,2236198,The Perks of Being a Wallflower,The Perks of Being a Wallflower,ef7adbc97c4a7280882df0f888f1ab92,3c4257add627a71bf658df27348a4866,"If there was a list of banned books, I think t...",Wed Dec 18 08:53:03 -0800 2013,Wed Dec 24 09:27:18 -0800 2014,Fri Dec 20 15:33:52 -0800 2013,Wed Dec 18 00:00:00 -0800 2013,0,0
1911650,,984,['267158'],US,en-US,"[{'count': '494863', 'name': 'to-read'}, {'cou...",B005KJJ4F8,True,4.15,B005KJJ4F8,"['9917938', '9415956', '12882328', '8306761', ...",Humans and androids crowd the raucous streets ...,,https://www.goodreads.com/book/show/12973964-c...,"[{'author_id': '4684322', 'role': ''}]",Feiwel & Friends,400.0,3.0,,1.0,,2012.0,https://www.goodreads.com/book/show/12973964-c...,https://images.gr-assets.com/books/1465596509m...,12973964,9097,15545385,"Cinder (The Lunar Chronicles, #1)","Cinder (The Lunar Chronicles, #1)",b7428eec6330351dd471207e925d6526,2ca01c7038f8eac7a5ac5cd4cd88518d,`uuemmmmmmmmmm aemwaacchatidkhadkhlukkhlakchay...,Sun Oct 12 05:10:54 -0700 2014,Sun Feb 01 02:37:51 -0800 2015,Sun Feb 01 04:40:43 -0800 2015,Thu Jan 29 00:00:00 -0800 2015,3,3


In [85]:
# Columns removed up front: identifiers, links, publication metadata and review
# timestamps/counters that are not used as features.
drop_cols1 = ['isbn', 'series', 'country_code', 'language_code',
              'asin', 'kindle_asin',
              'description', 
              'format',
              'link',
              'publisher',
              'num_pages',
              'publication_day', 'isbn13', 'publication_month',
              'edition_information',
              'publication_year', 'url', 'image_url',
              'title',
              'title_without_series',
              'date_added', 'date_updated', 'read_at', 'started_at',
              'n_votes',
              'n_comments'
             ]

# Raw columns that are only needed as inputs to the export steps below and are
# dropped once the derived features exist. This list must be defined here:
# the final 'DropUnusedCols2' step references it, and leaving it commented out
# makes the cell fail with a NameError on a fresh kernel.
drop_cols2 = ['popular_shelves', 'authors']

# Shelf names whose counts are exported as separate feature columns.
tags = ['favorites', 'currently-reading', 'to-read']

# Only needed by the (currently disabled) 'EncodeCategories' step.
encoder = ce.OneHotEncoder()

ppl = Pipeline([
    ('DropUnusedCols1', pp.DropColumns(drop_cols1)),
    # Keeps only the most-reviewed books (0.9 is presumably a quantile of
    # text_reviews_count; confirm against pp.SelectBooksWithNPercentile).
    ('SelectTopNPercentileOfBooks', pp.SelectBooksWithNPercentile('text_reviews_count', 0.9)),
    ('ExportAuthorsAverageRating', pp.ExportAuthorsAverageRating('authors', 'authors_average_rating', authors)),
    ('ExtraxtPopularShelves',pp.ExportBookShelves('popular_shelves', tags)),
    ('EmptyValuesFilter', pp.EmptyValuesFilter(['review_text'])),
    ('TextPreprocessor', pp.TextPreprocessor('review_text')),
    # Review length bounds (presumably min/max length in words; confirm
    # against pp.ReviewLengthFilter).
    ('ReviewLengthFilter', pp.ReviewLengthFilter('review_text', 0, 2000)),
    ('ReviewsLanguageFilter', pp.ReviewsLanguageFilter('review_text', 'en')),
    ('DropUnusedCols2', pp.DropColumns(drop_cols2)),
#     ('EncodeCategories', pp.EncodeCategories(encoder))
])

# Fit on a 5000-row sample of the training data to keep the run time manageable.
model = ppl.fit(X_train.iloc[:5000])

(fit) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'description', 'format', 'link', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'title', 'title_without_series', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']
(transform) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'description', 'format', 'link', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'title', 'title_without_series', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']
(fit) Select books with: text_reviews_count >= 22363.0
(transform) Select books with: text_reviews_count >= 22363.0
(fit) Export authors average rating
(transform) Export authors average rating
(fit) ExportBookShelves, tag_col: popula

In [90]:
# Apply the fitted pipeline to the same 5000-row sample it was fitted on.
pX_train = ppl.transform(X_train.iloc[:5000])

# The pipeline's filter steps remove rows, so the transformed frame is shorter
# than its input while keeping the original index labels. Select the matching
# targets by label so that features and labels stay aligned for training.
py_train = y_train.loc[pX_train.index]

(transform) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'description', 'format', 'link', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'title', 'title_without_series', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']
(transform) Select books with: text_reviews_count >= 22363.0
(transform) Export authors average rating
(transform) ExportBookShelves, tag_col: popular_shelves, tags:['favorites', 'currently-reading', 'to-read']
(transform) Empty values filter
(transform) Text preprocessing
(transform) Review length filter
(transform) Reviews language filter
(transform) Drop columns: ['popular_shelves', 'authors']


In [91]:
# Preview the first three preprocessed training rows.
pX_train.head(n=3)

Unnamed: 0,text_reviews_count,is_ebook,average_rating,similar_books,book_id,ratings_count,work_id,user_id,review_id,review_text,authors_average_rating,favorites,currently-reading,to-read
1703631,40116,False,4.21,"['248704', '138202', '733111', '23232', '11620...",22628,906322,2236198,ef7adbc97c4a7280882df0f888f1ab92,3c4257add627a71bf658df27348a4866,"if there was a list of banned books, i think t...",4.21,23667,15217,427726
1700061,32909,False,3.69,"['140077', '2051840', '225669', '7239607', '42...",428263,1146155,2675454,a7958ca78024f0c9a30e5df0bed69560,d5a3b09f72e11ee7c2692738261c9214,my favourite of the series! now that all the m...,3.64,6604,6104,1296
2313973,68482,False,4.23,"['13253276', '10816908', '9917998', '9867814',...",13335037,1962813,13155899,9e49b762b9c7b6aa8430e9a12ef7fd59,73001ec28d5d5fde69709a6924b7a1f5,i thought this book was decent but some things...,4.09,28751,29031,281


In [9]:
# List the columns of the raw reviews dataset.
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [10]:
# Timestamp columns of the reviews dataset that are dropped before text cleaning.
drop_cols1 = [
    'date_added',
    'date_updated',
    'read_at',
    'started_at',
]

# Reviews-only pipeline: drop the timestamps, discard rows without review text,
# clean the text, then filter by review length and by language.
reviews_ppl = Pipeline(steps=[
    ('DropUnusedCols1', pp.DropColumns(drop_cols1)),
    ('EmptyValuesFilter', pp.EmptyValuesFilter(['review_text'])),
    ('TextPreprocessor', pp.TextPreprocessor('review_text')),
    ('ReviewLengthFilter', pp.ReviewLengthFilter('review_text', 0, 2000)),
    ('ReviewsLanguageFilter', pp.ReviewsLanguageFilter('review_text', 'en')),
])

# Fit on the complete reviews dataset.
model = reviews_ppl.fit(reviews)

(fit) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(transform) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(fit) Empty values filter
(transform) Empty values filter
(fit) Text preprocessing
(transform) Text preprocessing
(fit) Review length filter
(transform) Review length filter
(fit) Reviews language filter


In [11]:
# Run the fitted reviews pipeline over the complete reviews dataset.
# NOTE(review): the recorded run of this cell stopped in the language-filter
# step with "LangDetectException: No features in text." Presumably some
# reviews have no detectable text left after preprocessing; the language
# filter in src.preprocessing should treat such rows as non-matching instead
# of raising - TODO confirm and fix there.
reviews_transformed = reviews_ppl.transform(reviews)

(transform) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(transform) Empty values filter
(transform) Text preprocessing
(transform) Review length filter
(transform) Reviews language filter


LangDetectException: No features in text.

In [None]:
# Preview the first three preprocessed reviews.
reviews_transformed.head(n=3)