TODO:
- select only English books

- train -> extract authors average rating
- train -> create book/book matrix (review_count, rating_count, authors_avg_rating) + shelves (sum of shelves?)

- add avg_rating of similar books (max X distance)

rating | book_data, similar_books (avg rating, ..), reviews(most frequent sentiment, top words, ...)

In [168]:
# Enable module reloading
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline
import category_encoders as ce
from sklearn.base import TransformerMixin
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Do not truncate wide frames: show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (10, 6)
# Ask the inline backend for 'retina' figure output.
%config InlineBackend.figure_format='retina'
# Default font size for plot text.
plt.rcParams.update({'font.size': 15})

# Load data

In [3]:
# Read the three source tables; the unpacking order matches the path order.
books, reviews, authors = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/books.csv',
        '../data/reviews.csv',
        '../data/authors.csv',
    )
)

In [4]:
# Only the first 100 rows of the interactions file are read (`nrows=100`),
# so `intr` is a small sample rather than the full table.
intr = pd.read_csv('../data/interactions.csv', nrows=100)

# Create subset of data

We will work only with the most-reviewed books and with the users who wrote the most reviews of those books.

In [5]:
# Number of most-reviewed books to keep.
top_n = 1000
# Minimum number of reviews (of the kept books) a user must have written to be kept.
min_review_count = 10

In [6]:
# Rank books by how many reviews they received and keep the ids of the
# `top_n` most-reviewed ones.
book_review_counts = reviews['book_id'].value_counts()
top_books = book_review_counts.sort_values(ascending=False).index[:top_n]

In [7]:
# Keep only the reviews written for one of the selected top books.
in_top_books = reviews['book_id'].isin(top_books)
top_book_reviews = reviews[in_top_books]

In [8]:
# Count reviews per user within the top-book subset, most active users first.
users = top_book_reviews['user_id'].value_counts().sort_values(ascending=False)

# Ids of the users who wrote at least `min_review_count` of those reviews.
is_active = users >= min_review_count
top_users = users[is_active].index

In [9]:
# Number of users that passed the `min_review_count` filter.
len(top_users)

30365

In [10]:
# A review is kept only when both its author and its book were selected above.
by_top_user = reviews['user_id'].isin(top_users)
about_top_book = reviews['book_id'].isin(top_books)
top_reviews = reviews[by_top_user & about_top_book]

## Train, validation, test split

Random split. We may need to replace it with a more careful split (e.g. one that keeps every user and book from validation/test present in train) to avoid the cold-start problem.

In [11]:
# `data` is just another name for `top_reviews` (no copy is made); it is the
# frame that gets split into train / validation / test below.
data = top_reviews

In [12]:
# Hold out 1% of the reviews, then carve that hold-out into validation (90%)
# and test (10%). The hold-out keeps its own name so that re-running this cell
# always splits the same frame instead of re-splitting an already-split `test`.
train, holdout = train_test_split(data, test_size=0.01, random_state=42)
val, test = train_test_split(holdout, test_size=0.1, random_state=42)

In [14]:
# Row counts of the three partitions, in the order (train, validation, test).
len(train), len(val), len(test)

(843221, 7666, 852)

# Preprocessing pipeline

In [15]:
# Preview the training rows before the preprocessing pipeline is applied.
train.head(3)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
2224648,64aba3e21f822eaeafaef5887c955c50,7747064,dc993309713c5b04fa93af6e3236c82b,4,4.0 Sterne \n Hat mir etwas besser gefallen al...,Tue Nov 06 06:21:31 -0800 2012,Mon Nov 19 04:31:49 -0800 2012,Sat Nov 17 00:00:00 -0800 2012,Mon Nov 12 00:00:00 -0800 2012,0,0
1574935,3a50d6c6c3c0be86ae166eaa5b4b2b82,23203106,873ff598584c871b21d40ed8f9db8c7e,0,"Hello, beautiful cover.",Thu Aug 20 00:16:06 -0700 2015,Thu Aug 20 00:21:18 -0700 2015,,,10,2
1449120,8a1929c26541a4ea997d7bf17e5c2c67,12425532,62ac2827c8944d53bfc373f79b728783,3,Interesting concept and I found some the secon...,Thu Apr 05 23:42:27 -0700 2012,Tue Jun 21 17:01:19 -0700 2016,Thu Sep 20 00:00:00 -0700 2012,Thu Sep 20 00:00:00 -0700 2012,0,0


In [169]:
# Columns handed to ExportBookData as `mean_cols`.
mean_cols = ['rating']

# Single-step preprocessing pipeline; further steps can be appended to the list.
export_step = ("ExportBookData", pp.ExportBookData(mean_cols, mode='all'))
ppl = Pipeline(steps=[export_step])

# `Pipeline.fit` returns the pipeline itself, so `model` and `ppl` refer to
# the same fitted object.
model = ppl.fit(train)

(fit) ExportBookData, mean_cols: ['rating'] book_id_col: book_id dist_func: <function euclidean_distances at 0x7efcec15e3b0> n_most_similar: 5 mode: all


In [170]:
# Run the fitted pipeline on the first 10 training rows as a quick check.
x = ppl.transform(train.iloc[:10])

(transform) ExportBookData, mean_cols: ['rating'] book_id_col: book_id dist_func: <function euclidean_distances at 0x7efcec15e3b0> n_most_similar: 5 mode: all


In [171]:
# Inspect the columns the pipeline added to the 10-row sample.
x.head(3)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments,row_count,rating_book_avg,sim_0_row_count,sim_0_rating,sim_1_row_count,sim_1_rating,sim_2_row_count,sim_2_rating,sim_3_row_count,sim_3_rating,sim_4_row_count,sim_4_rating
2224648,64aba3e21f822eaeafaef5887c955c50,7747064,dc993309713c5b04fa93af6e3236c82b,4,4.0 Sterne \n Hat mir etwas besser gefallen al...,Tue Nov 06 06:21:31 -0800 2012,Mon Nov 19 04:31:49 -0800 2012,Sat Nov 17 00:00:00 -0800 2012,Mon Nov 12 00:00:00 -0800 2012,0,0,1158.0,4.022453,1122.0,4.037433,1182.0,4.000846,1063.0,4.027281,1054.0,4.039848,1177.0,3.983008
1574935,3a50d6c6c3c0be86ae166eaa5b4b2b82,23203106,873ff598584c871b21d40ed8f9db8c7e,0,"Hello, beautiful cover.",Thu Aug 20 00:16:06 -0700 2015,Thu Aug 20 00:21:18 -0700 2015,,,10,2,609.0,2.981938,603.0,3.008292,595.0,3.013445,713.0,2.969144,717.0,2.970711,643.0,3.031104
1449120,8a1929c26541a4ea997d7bf17e5c2c67,12425532,62ac2827c8944d53bfc373f79b728783,3,Interesting concept and I found some the secon...,Thu Apr 05 23:42:27 -0700 2012,Tue Jun 21 17:01:19 -0700 2016,Thu Sep 20 00:00:00 -0700 2012,Thu Sep 20 00:00:00 -0700 2012,0,0,915.0,3.657923,886.0,3.673815,833.0,3.639856,915.0,3.692896,795.0,3.661635,996.0,3.62751
