In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline
import category_encoders as ce

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Enable module reloading
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', None)

plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

In [3]:
from sklearn.base import TransformerMixin
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Load datasets

In [4]:
# Read the three CSV inputs in the same order as before: books, reviews, authors.
# intr = pd.read_csv('../data/interactions.csv')  # interactions are left unloaded
books, reviews, authors = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/books.csv', '../data/reviews.csv', '../data/authors.csv')
)

# Create subsets of data

In [247]:
# top_n: how many of the most text-reviewed books are kept.
# min_review_count: minimum number of reviews a user needs to be kept.
top_n, min_review_count = 1000, 10

In [248]:
# List the columns available in the reviews table
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [249]:
# Per-book count of reviews that have both a rating (> 0) and review text.
# Despite its name, `top_books` holds counts indexed by book_id at this point.
has_rating = reviews['rating'] > 0
has_text = reviews['review_text'].notna()
top_books = (
    reviews.loc[has_rating & has_text, 'book_id']
    .value_counts()
    .sort_values(ascending=False)
)

In [250]:
# Attach to every book its number of rated, non-empty text reviews.
# `top_books` must still hold the per-book counts (indexed by book_id) when this runs.
# Series.map looks each book_id up in that index in one vectorized pass, replacing the
# per-row Python loop. Books without any qualifying review come back as NaN and are
# set to 0 before casting back to integers.
books['text_reviews_count'] = (
    books['book_id']
    .map(top_books)
    .fillna(0)
    .astype('int64')
)

In [251]:
# Keep English-language books that have a description, rank them by text review
# count (descending) and take the ids of the first `top_n`.
# NOTE: this rebinds `top_books` from per-book review counts to a Series of book ids.
is_english = books['language_code'] == 'eng'
has_description = books['description'].notna()
ranked_books = books[is_english & has_description].sort_values(by='text_reviews_count',
                                                               ascending=False)
top_books = ranked_books['book_id'][:top_n]

In [252]:
# All reviews (regardless of rating or text) written about one of the selected books
about_top_book = reviews['book_id'].isin(top_books)
top_book_reviews = reviews[about_top_book]

In [253]:
# Number of reviews each user wrote about the selected books, most active first.
# NOTE(review): these counts include reviews without a rating or text, while
# `top_reviews` later drops such rows, so a selected user may end up with fewer than
# min_review_count rows there. Confirm this is intended.
users = top_book_reviews['user_id'].value_counts().sort_values(ascending=False)

# Ids of the users who reach the minimum review count
is_active_enough = users >= min_review_count
top_users = users[is_active_enough].index

In [254]:
# Number of users that reached min_review_count
len(top_users)

26340

In [255]:
# Final review subset: rated, with review text, written by a selected user
# about a selected book.
rated = reviews['rating'] > 0
with_text = reviews['review_text'].notna()
by_top_user = reviews['user_id'].isin(top_users)
on_top_book = reviews['book_id'].isin(top_books)
top_reviews = reviews[rated & with_text & by_top_user & on_top_book]

In [256]:
# (rows, columns) of the filtered review subset
top_reviews.shape

(682861, 11)

# Train / validation / test split

In [257]:
# Hold out 1% of the selected reviews, then divide that hold-out 90/10 into
# validation and test sets. Both splits are seeded with random_state=42.
data = top_reviews

train, holdout = train_test_split(data, test_size=0.01, random_state=42)
val, test = train_test_split(holdout, test_size=0.1, random_state=42)

In [258]:
# Row counts of the train, validation and test sets
len(train), len(val), len(test)

(676032, 6146, 683)

# Data preprocessing

In this notebook, we preprocess our data so that it can be used during training.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Columns passed to ExportBookData as its first argument
mean_cols = ['rating']

# NOTE(review): scale_cols and scaler are defined here but no pipeline step below
# references them. Confirm whether a scaling step is still planned.
scale_cols = ['row_count']
scaler = MinMaxScaler()

# Columns removed by the final step: the raw review columns plus
# 'neg', 'neu', 'pos', 'compound' (presumably added by ExtractSentiment; verify in src.preprocessing)
drop_cols = [
    'user_id', 'book_id', 'review_id', 'review_text',
    'date_added', 'date_updated', 'read_at', 'started_at',
    'n_votes', 'n_comments',
    'neg', 'neu', 'pos', 'compound',
]

# Steps run in this order: sentiment extraction, book data export, column drop
steps = [
    ("ExtractSentiment", pp.ExtractSentiment()),
    ("ExportBookData", pp.ExportBookData(mean_cols, books, authors, mode='avg', dist_func=cosine_similarity)),
    ('DropUnusedCols', pp.DropColumns(drop_cols)),
]
ppl = Pipeline(steps)

# Fit on the training split only. Pipeline.fit returns the pipeline itself,
# so `model` and `ppl` refer to the same fitted object.
model = ppl.fit(train)

[nltk_data] Downloading package punkt to /home/vajk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(fit) ExtractSentiment
(transform) ExtractSentiment


In [None]:
# Run the fitted pipeline on the training split and save the result as CSV
# (the DataFrame index is written too, since to_csv is called with its defaults)
p_train = ppl.transform(train)
p_train.to_csv(path_or_buf="../data/train.csv")

In [None]:
# Display the preprocessed training data
p_train

In [None]:
# Run the fitted pipeline on the validation split and save the result as CSV
# (the DataFrame index is written too, since to_csv is called with its defaults)
p_val = ppl.transform(val)
p_val.to_csv(path_or_buf="../data/val.csv")

In [None]:
# Display the preprocessed validation data
p_val

In [None]:
# Run the fitted pipeline on the test split and save the result as CSV
# (the DataFrame index is written too, since to_csv is called with its defaults)
p_test = ppl.transform(test)
p_test.to_csv(path_or_buf="../data/test.csv")