In [2]:
# Enable module reloading
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline
import category_encoders as ce
from sklearn.base import TransformerMixin
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

pd.set_option('display.max_columns', None)

plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

# Load data

In [4]:
# Read the books, reviews and authors tables from CSV files in ../data
# (paths are relative to the notebook's working directory).
books = pd.read_csv('../data/books.csv')
reviews = pd.read_csv('../data/reviews.csv')
authors = pd.read_csv('../data/authors.csv')

In [5]:
# Only the first 100 rows of the interactions file are read (nrows=100).
intr = pd.read_csv('../data/interactions.csv', nrows=100)

# Create subset of data

We will work only with the most-reviewed books and with the users who wrote at least a minimum number of reviews for those books.

In [6]:
# Number of most-reviewed books to keep.
top_n = 1000
# Minimum number of reviews (of the kept books) a user must have written to be kept.
min_review_count = 10

In [7]:
# Count reviews per book, order the counts from most to least reviewed,
# and keep the ids of the first top_n books.
top_books = (
    reviews['book_id']
    .value_counts()
    .sort_values(ascending=False)
    .index[:top_n]
)

In [8]:
# Keep only the reviews written for one of the top books.
is_top_book_review = reviews['book_id'].isin(top_books)
top_book_reviews = reviews[is_top_book_review]

In [9]:
# Reviews per user among the top-book reviews, most active users first.
users = (
    top_book_reviews['user_id']
    .value_counts()
    .sort_values(ascending=False)
)

# Ids of the users who reviewed at least min_review_count of the top books.
has_enough_reviews = users >= min_review_count
top_users = users[has_enough_reviews].index

In [10]:
len(top_users)

30365

In [11]:
# Reviews that are both written by a top user and about a top book.
by_top_user = reviews['user_id'].isin(top_users)
about_top_book = reviews['book_id'].isin(top_books)
top_reviews = reviews[by_top_user & about_top_book]

## Train, validation, test split

Random split. We may need to replace it with a more careful split (for example, one that keeps every user and every book represented in the training set) to avoid the cold-start problem.

In [17]:
# Alias, not a copy: `data` refers to the same DataFrame object as `top_reviews`.
data = top_reviews

In [18]:
train, test = train_test_split(data, test_size=0.01, random_state=42)

In [19]:
val, test = train_test_split(test, test_size=0.1, random_state=42)

In [20]:
len(train), len(val), len(test)

(843221, 7666, 852)

# Preprocessing

## User-book matrix

In [22]:
def numbers(start):
    """Generate an endless sequence start, start + 1, start + 2, ...

    The generator never terminates on its own; callers pull as many values
    as they need with next().
    """
    current = start
    while True:
        yield current
        current += 1

In [23]:
# Map each top book id to its position (0, 1, 2, ...) in top_books order;
# the position is used as the book's row index in the matrix.
books_map = {book_id: position for position, book_id in enumerate(top_books)}

In [24]:
# Map each top user id to its position (0, 1, 2, ...) in top_users order;
# the position is used as the user's column index in the matrix.
users_map = {user_id: position for position, user_id in enumerate(top_users)}

In [31]:
# Size the matrix from the index maps, not from `train`: cells are addressed
# through books_map / users_map, so the matrix must have a row for every mapped
# book and a column for every mapped user even if the random split left some
# of them out of the training set.
rows = len(books_map)
cols = len(users_map)

In [32]:
rows, cols

(1000, 30365)

In [33]:
class Review:
    """A rating together with its review text.

    A freshly constructed instance holds rating 0 and an empty review text
    unless initial values are passed in.
    """

    def __init__(self, rating=0, review_text=""):
        """Create a review.

        Args:
            rating: initial rating; defaults to 0.
            review_text: initial review text; defaults to an empty string,
                the same default set_params uses, so an unset review does
                not hold a number where text is expected.
        """
        self.rating = rating
        self.review_text = review_text
        # TODO: add other params

    def set_params(self, rating=0, review_text=""):
        """Overwrite the stored rating and review text.

        Args:
            rating: new rating; defaults to 0.
            review_text: new review text; defaults to an empty string.
        """
        self.rating = rating
        self.review_text = review_text
        

In [34]:
# Allocate a rows x cols grid with a separate, empty Review in every cell.
mat = [
    [Review() for _ in range(cols)]
    for _ in range(rows)
]

In [35]:
# Convert the nested list of Review objects into a NumPy array
# (the name `mat` is rebound from the list to the array).
mat = np.array(mat)

In [36]:
mat.shape

(1000, 30365)

In [38]:
# Fill the matrix from the training reviews. The four needed columns are walked
# in lockstep, which avoids building a pandas Series for every row the way
# iterrows() does.
review_fields = zip(
    train['book_id'], train['user_id'], train['rating'], train['review_text']
)
for book, user, rating, review_text in review_fields:
    # Translate the ids into matrix positions: books are rows, users are columns.
    mat[books_map[book]][users_map[user]].set_params(rating, review_text)