In [1]:
# Enable automatic module reloading so that edits to imported local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import sys
from collections import defaultdict

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Show every column when a wide DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Plot defaults: figure size, retina inline figure format, larger font.
plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

In [3]:
# Load the two raw tables: user reviews and book metadata.
reviews = pd.read_csv('../data/reviews.csv')
books = pd.read_csv('../data/books.csv')

In [4]:
# Keep only books with more than MIN_TEXT_REVIEWS text reviews; the cutoff is
# named so it can be tuned in one place instead of being a bare literal.
MIN_TEXT_REVIEWS = 2000
books = books[books['text_reviews_count'] > MIN_TEXT_REVIEWS]

In [5]:
# (rows, columns) of the book table after the review-count filter.
books.shape

(678, 29)

In [7]:
# Drop reviews that belong to books removed by the book filter.
is_kept_book = reviews['book_id'].isin(books['book_id'])
reviews = reviews[is_kept_book]

In [9]:
# Shape of the remaining review table and the number of distinct reviewers.
reviews.shape, reviews.user_id.nunique()

((1068661, 11), 163177)

In [10]:
def numbers(start):
    """Endless counter: yield ``start``, then ``start + 1``, ``start + 2``, ...

    The generator never terminates on its own; callers pull as many values
    as they need with ``next()``.
    """
    current = start
    while True:
        yield current
        current += 1

In [11]:
# Map each distinct original book id to a dense 0-based index, numbered in the
# order the ids are returned by unique(). enumerate() supplies the counter
# directly, so no separate generator object is needed.
books_map = {book_id: idx for idx, book_id in enumerate(books.book_id.unique())}

In [12]:
# Re-key the book table onto the dense ids. Series.map does one dictionary
# lookup per row, whereas DataFrame.replace with a large mapping is far slower.
# Every id here is a key of books_map (the map was built from this very
# column); an id missing from the map would become NaN.
books = books.assign(book_id=books['book_id'].map(books_map))

In [13]:
# Re-key the reviews onto the same dense book ids. Series.map does one
# dictionary lookup per row instead of DataFrame.replace's much slower
# handling of a large mapping. Ids absent from books_map would become NaN;
# the earlier isin() filter is what keeps every remaining id in the map.
reviews = reviews.assign(book_id=reviews['book_id'].map(books_map))

In [14]:
# Map each distinct original user id to a dense 0-based index, numbered in the
# order the ids are returned by unique(). enumerate() supplies the counter
# directly, so no separate generator object is needed.
users_map = {user_id: idx for idx, user_id in enumerate(reviews.user_id.unique())}

In [18]:
# Replace original user ids with their dense indices. Series.map performs the
# lookup in a vectorised pass instead of a Python-level list comprehension
# over every review row.
reviews['user_id'] = reviews['user_id'].map(users_map)

In [20]:
# Dimensions of the review store: `cols` counts books, `rows` counts
# distinct users.
cols = len(books)
rows = reviews.user_id.nunique()

In [21]:
# Display the (users, books) dimensions.
rows, cols

(163177, 678)

In [23]:
class Review:
    """One review entry: a numeric rating plus the free-text review body.

    A freshly built instance holds the "empty" defaults (rating ``0`` and
    empty text), which are the same defaults ``set_params`` falls back to.

    Args:
        rating: Initial rating; defaults to 0.
        review_text: Initial review text; defaults to the empty string.
    """

    def __init__(self, rating=0, review_text=""):
        # Same defaults as set_params, so review_text is always a string
        # unless the caller explicitly stores something else.
        self.rating = rating
        self.review_text = review_text

    def set_params(self, rating=0, review_text=""):
        """Overwrite both fields; an omitted argument resets that field."""
        self.rating = rating
        self.review_text = review_text
        

In [24]:
# One independent, default-initialised Review per user index.
row = [Review() for _ in range(rows)]


In [25]:
# Review store indexed as mat[book][user]. Each book gets its own mapping that
# creates a fresh Review the first time a user index is accessed, so:
#   * no Review object is ever shared between two books (wrapping one shared
#     list of Review objects in every column would make a write for one book
#     visible under every other book), and
#   * only (book, user) pairs that are actually touched allocate an object,
#     instead of rows * cols objects up front.
# Note that merely reading mat[b][u] for an unseen pair inserts a default
# Review for it.
mat = [defaultdict(Review) for _ in range(cols)]

In [28]:
# List the columns available in the review table.
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

In [34]:
# Store every review's rating and text at mat[book][user]. The four columns
# are walked in lockstep rather than with DataFrame.iterrows(), which builds
# a Series per row and is very slow on a table of this size. The loop names
# are chosen so that the `row` list is not overwritten.
for book, user, rating, review_text in zip(
    reviews['book_id'],
    reviews['user_id'],
    reviews['rating'],
    reviews['review_text'],
):
    mat[book][user].set_params(rating, review_text)


146 0
373 1


In [37]:
# Spot-check: review text held at mat[0][0] (first index is the book, second
# the user, matching the fill loop).
mat[0][0].review_text

"I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is always interesting and full of friendships broken and betrayal. Each character approached the game in their own way. Some banded together in larger coalit