TODO:
- select only English books

- train -> extract authors average rating
- train -> create book/book matrix (review_count, rating_count, authors_avg_rating) + shelves (sum of shelves?)

- add avg_rating of similar books (max X distance)

rating | book_data, similar_books (avg rating, ..), reviews(most frequent sentiment, top words, ...)

In [1]:
# Enable module reloading so that edits to imported modules
# are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import json

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import stats
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Show every column when displaying wide DataFrames
pd.set_option('display.max_columns', None)

# Plot defaults: figure size, high-resolution inline rendering, larger base font
plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

# Load data

In [3]:
# Read the books, reviews and authors tables in one pass
books, reviews, authors = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/books.csv', '../data/reviews.csv', '../data/authors.csv')
)

In [4]:
# Load only the first 100 rows of the interactions file (nrows=100)
intr = pd.read_csv('../data/interactions.csv', nrows=100)

# Create subset of data

We will work only with the `top_n` most-reviewed books and with the users who wrote at least `min_review_count` reviews of those books.

In [5]:
# Number of most-reviewed books kept in the subset
top_n = 1000
# Minimum number of reviews (of those books) a user must have written to be kept
min_review_count = 10

In [6]:
# Count reviews per book, order the counts from highest to lowest
# and keep the ids of the first `top_n` books
book_review_counts = reviews['book_id'].value_counts()
top_books = book_review_counts.sort_values(ascending=False).index[:top_n]

In [7]:
# Keep only the reviews written for one of the top books
is_top_book = reviews['book_id'].isin(top_books)
top_book_reviews = reviews.loc[is_top_book]

In [8]:
# Reviews per user within the top-book subset, most active users first
user_review_counts = top_book_reviews['user_id'].value_counts()
users = user_review_counts.sort_values(ascending=False)

# Ids of the users with at least `min_review_count` reviews
top_users = users.loc[users >= min_review_count].index

In [9]:
len(top_users)

30365

In [10]:
# Reviews that were both written by a top user and written for a top book
by_top_user = reviews['user_id'].isin(top_users)
of_top_book = reviews['book_id'].isin(top_books)
top_reviews = reviews.loc[by_top_user & of_top_book]

## Train, validation, test split

Random split - we may need to replace it with a more careful split to prevent the cold start problem.

In [11]:
# Dataset used for the split below: the reviews filtered to top users and top books
data = top_reviews

In [12]:
# Hold out 1% of the reviews, then divide that hold-out 90/10 into validation and test.
# The intermediate hold-out has its own name so that re-running this cell never
# re-splits an already reduced `test` frame.
train, holdout = train_test_split(data, test_size=0.01, random_state=42)
val, test = train_test_split(holdout, test_size=0.1, random_state=42)

In [14]:
len(train), len(val), len(test)

(843221, 7666, 852)

# Preprocessing pipeline

In [15]:
train.head(3)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
2224648,64aba3e21f822eaeafaef5887c955c50,7747064,dc993309713c5b04fa93af6e3236c82b,4,4.0 Sterne \n Hat mir etwas besser gefallen al...,Tue Nov 06 06:21:31 -0800 2012,Mon Nov 19 04:31:49 -0800 2012,Sat Nov 17 00:00:00 -0800 2012,Mon Nov 12 00:00:00 -0800 2012,0,0
1574935,3a50d6c6c3c0be86ae166eaa5b4b2b82,23203106,873ff598584c871b21d40ed8f9db8c7e,0,"Hello, beautiful cover.",Thu Aug 20 00:16:06 -0700 2015,Thu Aug 20 00:21:18 -0700 2015,,,10,2
1449120,8a1929c26541a4ea997d7bf17e5c2c67,12425532,62ac2827c8944d53bfc373f79b728783,3,Interesting concept and I found some the secon...,Thu Apr 05 23:42:27 -0700 2012,Tue Jun 21 17:01:19 -0700 2016,Thu Sep 20 00:00:00 -0700 2012,Thu Sep 20 00:00:00 -0700 2012,0,0


In [29]:
# Column groups and helpers handed to the individual pipeline steps
mean_cols = ['rating']
scale_cols = ['row_count']
scaler = MinMaxScaler()

drop_cols2 = [
    'user_id', 'book_id', 'review_id', 'review_text', 'date_added',
    'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments',
]

# The steps are applied in the order in which they are listed
preprocessing_steps = [
    ("ExportBookData", pp.ExportBookData(mean_cols, mode='all')),
    ('EmptyValuesFilter', pp.EmptyValuesFilter(['review_text'])),
    ('TextPreprocessor', pp.TextPreprocessor('review_text')),
    ('ReviewLengthFilter', pp.ReviewLengthFilter('review_text', 0, 2000)),
    ('ReviewsLanguageFilter', pp.ReviewsLanguageFilter('review_text', 'en')),
    ('Scale', pp.Scale(scale_cols, scaler)),
    ('DropUnusedCols2', pp.DropColumns(drop_cols2)),
]
ppl = Pipeline(preprocessing_steps)

# Pipeline.fit returns the pipeline itself, so `model` and `ppl` name the same fitted object
model = ppl.fit(train)

(fit) ExportBookData, mean_cols: ['rating'] book_id_col: book_id dist_func: <function euclidean_distances at 0x7f51164633b0> n_most_similar: 5 mode: all
(transform) ExportBookData, mean_cols: ['rating'] book_id_col: book_id dist_func: <function euclidean_distances at 0x7f51164633b0> n_most_similar: 5 mode: all
(fit) Empty values filter
(transform) Empty values filter
(fit) Text preprocessing
(transform) Text preprocessing
(fit) Review length filter
(transform) Review length filter
(fit) Reviews language filter
(transform) Reviews language filter
(fit) Scale cols: ['row_count']
(transform) Scale cols: ['row_count']
(fit) Drop columns: ['user_id', 'book_id', 'review_id', 'review_text', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']


In [31]:
# Separate the features from the `rating` target for the training and validation splits
X_train, y_train = train.drop('rating', axis=1), train['rating']

# The modelling cells refer to the validation features as `X_val`;
# `x_val` is kept as an alias so code using the lowercase name keeps working.
X_val, y_val = val.drop('rating', axis=1), val['rating']
x_val = X_val

In [None]:
# Run the fitted preprocessing pipeline on the training features.
# NOTE(review): the pipeline was fitted on `train`, which still contains `rating`,
# while `X_train` has that column dropped - confirm every step tolerates its absence.
x_t = ppl.transform(X_train)

(transform) ExportBookData, mean_cols: ['rating'] book_id_col: book_id dist_func: <function euclidean_distances at 0x7f51164633b0> n_most_similar: 5 mode: all


In [None]:
# Transform the validation features with the pipeline fitted on the training data
x_v = ppl.transform(x_val)

In [28]:
# Preview the transformed training features
x_t.head(3)

Unnamed: 0,rating,row_count,rating_book_avg,sim_0_row_count,sim_0_rating,sim_1_row_count,sim_1_rating,sim_2_row_count,sim_2_rating,sim_3_row_count,sim_3_rating,sim_4_row_count,sim_4_rating
1574935,0,0.0,0,1,3,1,3,1,3,1,4,1,4
1449120,3,0.0,3,1,3,1,3,1,4,1,4,1,4
536688,5,0.0,5,1,5,1,4,1,4,1,4,1,4


# Training

In [None]:
# Estimators, metrics and search utilities used in the training sections below
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def print_metrics(y_true, y_pred):
    """Print the MSE, RMSE and MAE of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    None
        The three metrics are written to standard output.
    """
    # Compute the MSE once and reuse it for the RMSE
    mse = mean_squared_error(y_true, y_pred)
    print("MSE:", mse)
    # No `sqrt` is imported in this notebook, so take the root with NumPy
    print("RMSE:", np.sqrt(mse))
    print("MAE:", mean_absolute_error(y_true, y_pred))

## linear regression

In [None]:
# Fit a linear regression baseline on the training split.
# NOTE(review): `X_train` is the raw feature frame, not the pipeline output `x_t` - confirm this is intended.
reg = LinearRegression().fit(X_train, y_train)

In [None]:
# Predict ratings for the validation features with the fitted linear model
y_pred = reg.predict(X_val)

In [None]:
print_metrics(y_val, y_pred)

## polynomial regression

In [None]:
# Expand the training features with polynomial terms up to degree 3
polynomial_features = PolynomialFeatures(degree=3)
x_poly = polynomial_features.fit_transform(X_train)

# NOTE: this rebinds `model`, which previously held the fitted preprocessing pipeline
model = LinearRegression()
model.fit(x_poly, y_train)

# Validation data only goes through `transform`: the expansion is fitted on the training set alone
x_poly = polynomial_features.transform(X_val)
y_pred = model.predict(x_poly)

In [None]:
print_metrics(y_val, y_pred)

## random forest regression

In [None]:
# Random forest with trees limited to depth 10 and a fixed seed
regr = RandomForestRegressor(max_depth=10, random_state=0)

In [None]:
# Fit the forest on the training split
regr.fit(X_train, y_train)

# Predict ratings for the validation features
y_pred = regr.predict(X_val)

In [None]:
print_metrics(y_val, y_pred)

# Feature selection

In [None]:
# Fresh forest with the same settings as in the random forest regression section;
# its feature importances are inspected after fitting
regr = RandomForestRegressor(max_depth=10, random_state=0)

In [None]:
regr.fit(X_train, y_train)

In [None]:
regr.feature_importances_

# Hyperparameter tuning

In [None]:
# Parameters held fixed for every candidate in the search
ind_params = {'random_state': 0}

# Search space sampled by the randomised search.
# Only parameters that RandomForestRegressor accepts are listed:
#  - 'criterion' is left at the estimator default ('gini' / 'entropy' are classifier criteria),
#  - 'splitter' is a single-tree option and not a forest parameter,
#  - `None` (use all features) replaces 'auto', which newer scikit-learn releases reject.
# `stats.randint(low, high)` draws integers from low (inclusive) to high (exclusive).
random_params = {
    'n_estimators': stats.randint(3, 30),
    'max_depth': stats.randint(1, 30),
    'min_samples_leaf': stats.randint(1, 5),
    'max_features': [None, 'sqrt', 'log2'],
}

# Estimator class; instantiated with `ind_params` when the search is built
clf = RandomForestRegressor

In [None]:
# Randomised search over `random_params` with 10-fold cross-validation.
# The estimator is a regressor, so candidates are scored with negated mean squared error;
# 'f1_macro' is a classification metric and cannot score continuous predictions.
random_optimization = RandomizedSearchCV(
    clf(**ind_params),
    param_distributions=random_params,
    n_iter=1000,
    scoring='neg_mean_squared_error',
    cv=10,
    verbose=True,
    random_state=42,
    n_jobs=-1,
)