In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Enable module reloading
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', None)

plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from sklearn.base import TransformerMixin
import json
import category_encoders as ce

# Load datasets

In [4]:
# Read the raw CSV files; the order of the paths matches the order of the
# names on the left-hand side.
# The interactions table is currently not loaded:
# intr = pd.read_csv('../data/interactions.csv')
books, reviews, authors = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/books.csv',
        '../data/reviews.csv',
        '../data/authors.csv',
    )
)

# Data preprocessing

## books

In [6]:
# List the column names of the raw books table (used to pick the columns dropped below).
books.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [33]:
# Raw metadata columns removed by the first pipeline step.
drop_cols1 = ['isbn', 'series', 'country_code', 'language_code',
              'asin', 'kindle_asin',
              'similar_books', 'description', 'format', 'link',
              'publisher', 'publication_day', 'isbn13',
              'publication_month', 'edition_information', 'publication_year', 'url',
              'image_url', 'ratings_count', 'work_id', 'title',
              'title_without_series']

# Source columns read by the ExportAuthorsAverageRating / ExportBookShelves
# steps; they are dropped only after those steps have run.
drop_cols2 = ['popular_shelves', 'authors']

# Shelf names handed to ExportBookShelves.
tags = ['favorites', 'currently-reading', 'to-read']

# One-hot encoder with default settings, wrapped by pp.EncodeCategories below.
encoder = ce.OneHotEncoder()

# Preprocessing pipeline for the books table; steps run in the order listed.
ppl = Pipeline([
    ('DropUnusedCols1', pp.DropColumns(drop_cols1)),
    # NOTE(review): 0.9 is presumably a quantile of text_reviews_count —
    # confirm against src/preprocessing.
    ('SelectTopNPercentileOfBooks', pp.SelectBooksWithNPercentile('text_reviews_count', 0.9)),
    ('ExportAuthorsAverageRating', pp.ExportAuthorsAverageRating('authors', 'authors_average_rating', authors)),
    ('ExtractPopularShelves', pp.ExportBookShelves('popular_shelves', tags)),
    ('DropUnusedCols2', pp.DropColumns(drop_cols2)),
    ('EncodeCategories', pp.EncodeCategories(encoder)),
])

# Pipeline.fit returns the pipeline itself, so `model` and `ppl` refer to the
# same fitted object.
model = ppl.fit(books)

(fit) Drop columns: ['isbn', 'series', 'country_code', 'language_code', 'asin', 'kindle_asin', 'similar_books', 'description', 'format', 'link', 'publisher', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'ratings_count', 'work_id', 'title', 'title_without_series']
(fit) Select books with: text_reviews_count >= 103.0
(fit) Export authors average rating
(fit) ExportBookShelves, tag_col: popular_shelves, tags:['favorites', 'currently-reading', 'to-read']
(fit) Drop columns: ['popular_shelves', 'authors']
(fit) Category encoder OneHotEncoder(cols=None, drop_invariant=False, handle_missing='value',
              handle_unknown='value', return_df=True, use_cat_names=False,
              verbose=0)


In [34]:
# Apply the fitted preprocessing pipeline to the books table and keep the result as `train`.
train = ppl.transform(books)

In [35]:
# Preview the first three rows of the transformed frame.
train.head(3)

Unnamed: 0,text_reviews_count,is_ebook,average_rating,num_pages,book_id,authors_average_rating,favorites,currently-reading,to-read
4,428,False,3.71,351.0,22642971,3.89,49,67,9481
19,112,False,4.1,402.0,10806009,4.02,393,31,1639
26,276,False,3.86,237.0,9463563,3.86,53,65,13204


## users

In [29]:
# TODO: preprocessing for users is still to be added (TBA).