In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline
import category_encoders as ce

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis
# from src.preprocessing import ReviewsLanguageFilter

# Enable module reloading
# (autoreload mode 2 reloads imported modules before each cell executes, so
# edits to the local `src` package are picked up without a kernel restart)
%load_ext autoreload
%autoreload 2

# Do not truncate wide DataFrames when they are displayed
pd.set_option('display.max_columns', None)

# Plot defaults: figure size in inches, high-resolution inline rendering and
# a larger base font size
plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

In [2]:
from sklearn.base import TransformerMixin
import json
from sklearn.model_selection import train_test_split


# Data preprocessing

In this notebook, we preprocess our data so that it can be used for training.

## Load datasets

In [3]:
# Raw inputs for the preprocessing pipelines below.
books = pd.read_csv('../data/books.csv')
reviews = pd.read_csv('../data/reviews.csv')
# `authors` is passed to pp.ExportAuthorsAverageRating in the books pipeline,
# so it must be loaded here; with this line commented out, that cell raises
# a NameError on a fresh kernel (Restart & Run All).
authors = pd.read_csv('../data/authors.csv')
# intr = pd.read_csv('../data/interactions.csv')

## Books

In this section, we preprocess the data from the books dataset.

The problems to be solved by preprocessing are:
- drop columns that do not contain any useful information for our task (these are also the columns that contain most of the missing values)
- replace the authors column by the authors' average rating

In [None]:
# Inspect the column names of the raw books dataset.
books.columns

In [None]:
# Columns removed by the first pipeline step, before anything else runs.
drop_cols1 = [
    'isbn',
    'series',
    'country_code',
    'language_code',
    'asin',
    'kindle_asin',
    'similar_books',
    'description',
    'format',
    'link',
    'publisher',
    'publication_day',
    'isbn13',
    'publication_month',
    'edition_information',
    'publication_year',
    'url',
    'image_url',
    'ratings_count',
    'work_id',
    'title',
    'title_without_series',
]

# Source columns removed only after the two Export* steps have consumed them.
drop_cols2 = [
    'popular_shelves',
    'authors',
]

# Shelf names handed to pp.ExportBookShelves.
tags = [
    'favorites',
    'currently-reading',
    'to-read',
]

# Encoder instance wrapped by the final pp.EncodeCategories step.
encoder = ce.OneHotEncoder()

# Books preprocessing pipeline; steps run in the order listed.
books_ppl = Pipeline(
    steps=[
        (
            'DropUnusedCols1',
            pp.DropColumns(drop_cols1),
        ),
        (
            'SelectTopNPercentileOfBooks',
            pp.SelectBooksWithNPercentile('text_reviews_count', 0.9),
        ),
        (
            'ExportAuthorsAverageRating',
            pp.ExportAuthorsAverageRating('authors', 'authors_average_rating', authors),
        ),
        (
            'ExtraxtPopularShelves',
            pp.ExportBookShelves('popular_shelves', tags),
        ),
        (
            'DropUnusedCols2',
            pp.DropColumns(drop_cols2),
        ),
        (
            'EncodeCategories',
            pp.EncodeCategories(encoder),
        ),
    ]
)

# Pipeline.fit returns the pipeline itself, so `model` is an alias of `books_ppl`.
model = books_ppl.fit(books)

In [None]:
# Apply the books pipeline (fitted in the previous cell) to the raw books data.
books_transformed = books_ppl.transform(books)

In [None]:
# Preview the first three rows of the transformed books data.
books_transformed.head(3)

## Reviews

In this section, we preprocess the data from the reviews dataset.

The problems to be solved by preprocessing are:
- drop columns that do not contain any useful information
- since we found out during analysis that the longest reviews contain a lot of useless data, we set a threshold for the maximum length of a review (in number of words)
- get rid of reviews that are not in English
- remove URLs from reviews
- remove other special characters

In [4]:
# Inspect the column names of the raw reviews dataset.
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [5]:
# Date columns removed by the first pipeline step.
# NOTE: this rebinds the `drop_cols1` name also used by the books section.
drop_cols1 = [
    'date_added',
    'date_updated',
    'read_at',
    'started_at',
]

# Reviews preprocessing pipeline; steps run in the order listed.
reviews_ppl = Pipeline(
    steps=[
        (
            'DropUnusedCols1',
            pp.DropColumns(drop_cols1),
        ),
        (
            'EmptyValuesFilter',
            pp.EmptyValuesFilter(['review_text']),
        ),
        (
            'TextPreprocessor',
            pp.TextPreprocessor('review_text'),
        ),
        (
            'ReviewLengthFilter',
            pp.ReviewLengthFilter('review_text', 0, 2000),
        ),
        (
            'ReviewsLanguageFilter',
            pp.ReviewsLanguageFilter('review_text', 'en'),
        ),
    ]
)

# Pipeline.fit returns the pipeline itself, so `model` is an alias of `reviews_ppl`.
model = reviews_ppl.fit(reviews)

(fit) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(transform) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(fit) Empty values filter
(transform) Empty values filter
(fit) Text preprocessing
(transform) Text preprocessing
(fit) Review length filter
(transform) Review length filter
(fit) Reviews language filter


In [None]:
# Apply the reviews pipeline (fitted in the previous cell) to the raw reviews data.
reviews_transformed = reviews_ppl.transform(reviews)

(transform) Drop columns: ['date_added', 'date_updated', 'read_at', 'started_at']
(transform) Empty values filter
(transform) Text preprocessing


In [None]:
# Preview the first three rows of the transformed reviews data.
reviews_transformed.head(3)

# Merge data

In [None]:
# Join each review to its book's features on the shared `book_id` key
# (default inner join: only book_ids present in both frames are kept).
data = books_transformed.merge(reviews_transformed, on='book_id')

In [None]:
# Preview the first three rows of the merged data.
data.head(3)

# Training

## Train/Test split

In [None]:
# Feature matrix: every column of the merged data except the target.
drop_cols = ['rating']
X = data.drop(columns=drop_cols)

In [None]:
# Target vector: the `rating` column of the merged data.
y = data['rating']

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)