In [1]:
import gzip
from collections import defaultdict
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from nltk.stem.porter import *
import string
import copy
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def readGz(path):
    """Lazily yield one parsed record per non-blank line of a gzip text file.

    Parameters
    ----------
    path : str
        Path to a gzip-compressed file in which every line is a Python
        literal expression (the callers in this notebook pass files whose
        lines are dict literals).

    Yields
    ------
    object
        The value obtained by evaluating each line.
    """
    # The context manager closes the file handle once the generator is
    # exhausted or closed, instead of leaking it.
    with gzip.open(path, 'rt') as handle:
        for l in handle:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a valid expression and would raise SyntaxError in eval.
            if not l.strip():
                continue
            # SECURITY: eval() executes arbitrary code found in the file.
            # Only use this on trusted data; for untrusted input switch to
            # ast.literal_eval (after checking that every record is a pure
            # literal).
            yield eval(l)

In [3]:
# Load the labelled reviews into a DataFrame (one row per record).
data = list(readGz("train_Category.json.gz"))
df = pd.DataFrame(data)

# Split data into a training and a validation set.
# df_splitp is the fraction of rows kept for training; the validation
# fraction is derived from it so the two can no longer drift apart.
# round() removes the floating-point noise of `1 - 0.95` so the value is
# exactly the 0.05 that was previously hard-coded.
df_splitp = 0.95
# Fixed seed so that Restart & Run All reproduces the same split (and hence
# the same validation accuracy).
split_seed = 0
train, valid = train_test_split(
    df,
    test_size=round(1 - df_splitp, 10),
    random_state=split_seed,
)

# Load the test records (read from a separate file) into a DataFrame.
data_test = list(readGz("test_Category.json.gz"))
test = pd.DataFrame(data_test)

In [7]:
# English stop words that are dropped before stemming and counting.
# Stored as a set so membership tests are constant time.
stop_words = {
    'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
    'there', 'about', 'once', 'during', 'out', 'very', 'having',
    'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its',
    'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
    'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the',
    'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his',
    'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this',
    'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to',
    'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them',
    'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves',
    'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
    'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only',
    'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if',
    'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was',
    'here', 'than',
}

### Parse user information

In [8]:
# Per-user tally of how many training reviews fall into each genre.
# usersBook[user_id] is a length-5 integer array indexed by genreID.
usersBook = defaultdict(list)
train_sorted_by_user = train.sort_values(by=['user_id'])

user_genre_pairs = zip(train_sorted_by_user['user_id'],
                       train_sorted_by_user['genreID'])

for username, genre in user_genre_pairs:

    # First review seen for this user: start from an all-zero tally.
    if username not in usersBook:
        usersBook[username] = np.zeros(5, dtype=int)

    usersBook[username][genre] += 1

### Preprocessing

In [9]:
# Shared text-normalisation helpers used by the cells below.
stemmer = PorterStemmer()
punct = string.punctuation

# Translation table that deletes every punctuation character.
_punct_table = str.maketrans('', '', punct)

# Lower-cased, punctuation-free copy of every training review.
train_reviews = train['review_text'].apply(
    lambda review: review.lower().translate(_punct_table)
).values
n_reviews = train_reviews.shape[0]

In [12]:
# Spot-check: genre label of the first row of the training split.
train.iloc[0]['genreID']

3

### Generate unigram and bigram statistics

In [46]:
# Corpus statistics over stemmed tokens of the training reviews.
#   totalWords           - number of tokens kept after filtering
#   wordCount/pairCount  - total occurrences of each stem / adjacent stem pair
#   wordAppear/pairAppear- number of reviews containing each stem / pair
#   genre_word_countsU/B - per-genre occurrence counts (length-5 arrays)
totalWords = 0
wordCount, pairCount = defaultdict(int), defaultdict(int)
wordAppear, pairAppear = defaultdict(int), defaultdict(int)
genre_word_countsU, genre_word_countsB = defaultdict(list), defaultdict(list)

for genreID, text in zip(train['genreID'], train['review_text']):

    # Lower-case, drop punctuation characters, then tokenise on whitespace.
    cleaned = ''.join(ch for ch in text.lower() if ch not in punct)
    words = cleaned.strip().split()

    # Stems / pairs already seen in this review (for document frequencies).
    wordsinDoc, pairsinDoc = set(), set()

    # " " marks "no previous kept token yet" for the current review.
    prev_w = " "

    for token in words:

        # Skip tokens shorter than 3 characters and stop words.
        if len(token) < 3 or token in stop_words:
            continue

        w = stemmer.stem(token)
        totalWords += 1

        # --- unigram bookkeeping ---
        wordCount[w] += 1

        if w not in genre_word_countsU:
            genre_word_countsU[w] = np.zeros(5, dtype=int)
        genre_word_countsU[w][genreID] += 1

        if w not in wordsinDoc:
            wordsinDoc.add(w)
            wordAppear[w] += 1

        # --- bigram bookkeeping ---
        # A pair joins consecutive *kept* stems, so filtered-out tokens in
        # between do not break the pair.
        if prev_w != " ":

            pair = (prev_w, w)
            pairCount[pair] += 1

            if pair not in genre_word_countsB:
                genre_word_countsB[pair] = np.zeros(5, dtype=int)
            genre_word_countsB[pair][genreID] += 1

            if pair not in pairsinDoc:
                pairsinDoc.add(pair)
                pairAppear[pair] += 1

        prev_w = w

print("Number of word counts:", totalWords)

Number of word counts: 15028502


In [47]:
# Pearson chi-square statistic of each stem's per-genre counts against a
# uniform spread over the genres: sum((O - E)^2 / E).
#
# Both O and E must be on the same scale (raw counts). The previous version
# compared proportions (counts / total) against expected *counts*
# (total / 5), which made the statistic collapse to roughly `total`, i.e. a
# plain frequency score.
#
# NOTE(review): E assumes every genre is equally likely. If the genres are
# imbalanced in the training data, E should be scaled by each genre's share
# of all tokens - confirm which null hypothesis is intended.
chi_squareU = defaultdict(float)

for word in genre_word_countsU:

    # Observed occurrences of this stem in each genre (raw counts).
    observed = genre_word_countsU[word]
    total = wordCount[word]

    # Expected occurrences per genre if the stem were spread evenly.
    expected = np.full(observed.shape, total / observed.size)

    chi_squared = np.sum(np.square(observed - expected) / expected)

    chi_squareU[word] = chi_squared

In [49]:
# Pearson chi-square statistic of each stem pair's per-genre counts against
# a uniform spread over the genres: sum((O - E)^2 / E).
#
# Fixes relative to the previous version:
#   * totals come from pairCount; looking pairs up in wordCount returned 0
#     for every pair (division by zero) and, because wordCount is a
#     defaultdict, silently inserted every pair as a new key into it;
#   * O is kept as raw counts so it is on the same scale as E;
#   * the container uses a float default, matching chi_squareU.
#
# NOTE(review): E assumes every genre is equally likely - confirm that a
# uniform null hypothesis is intended.
chi_squareB = defaultdict(float)

for pair in genre_word_countsB:

    # Observed occurrences of this pair in each genre (raw counts).
    observed = genre_word_countsB[pair]
    total = pairCount[pair]

    # Expected occurrences per genre if the pair were spread evenly.
    expected = np.full(observed.shape, total / observed.size)

    chi_squared = np.sum(np.square(observed - expected) / expected)

    chi_squareB[pair] = chi_squared

  
  


In [50]:
# Chi-square score of each stem divided by the square root of its total
# occurrence count.
word_scale = defaultdict(float)
word_scale.update(
    (stem, score / np.sqrt(wordCount[stem]))
    for stem, score in chi_squareU.items()
)

In [75]:
# Rank the vocabulary by total occurrence count, most frequent first.
# Each entry is a (count, word) tuple; the vocabulary-selection cell reads
# the word from position 1 of each tuple.
#
# A ranking based on word_scale used to be computed here and was then
# overwritten by this one on the very next statement, so it never had any
# effect; that dead computation has been removed.
# NOTE(review): if the chi-square ranking was the intended one, build it as
# (score, word) tuples so that downstream indexing keeps working.
word_ranking = sorted(
    ((count, w) for w, count in wordCount.items()),
    reverse=True,
)

In [76]:
# Show only the head of the ranking; displaying the full list floods the
# notebook with one line per vocabulary entry.
word_ranking[:30]

[('book', 579.6309170676378),
 ('read', 431.894663125885),
 ('stori', 386.1282171065945),
 ('like', 371.96236375451963),
 ('one', 354.01412408964813),
 ('charact', 345.433061619385),
 ('love', 342.52737125718744),
 ('realli', 301.6040451918676),
 ('get', 274.0930500619037),
 ('seri', 270.7895864052921),
 ('time', 264.80181286922476),
 ('end', 264.3482553368171),
 ('would', 247.74583772832287),
 ('good', 245.58501606100097),
 ('first', 245.43634636512016),
 ('much', 241.91940831484195),
 ('want', 231.4303353733041),
 ('enjoy', 230.54717549974617),
 ('know', 230.53633147797362),
 ('thing', 229.49291956828415),
 ('make', 229.10696220032608),
 ('way', 229.0371151806997),
 ('think', 228.7990387443185),
 ('even', 222.1283415952386),
 ('well', 220.54251323227334),
 ('also', 218.81727569564663),
 ('feel', 214.90695696314452),
 ('see', 214.80456268637138),
 ('novel', 209.4540526164105),
 ('didnt', 206.67123690487523),
 ('great', 204.25474330179264),
 ('world', 200.80089691594173),
 ('dont', 199

### Main

In [68]:
# Number of top-ranked words kept as bag-of-words features.
rank = 1000

In [69]:
# Vocabulary: the word component (position 1) of the first `rank` entries.
words = [entry[1] for entry in word_ranking[:rank]]
# Column index of each vocabulary word in the feature vector.
wordId = {word: column for column, word in enumerate(words)}
# Set copy of the vocabulary for fast membership tests.
wordSet = set(words)

In [70]:
def feature(text):
    """Return the bag-of-words count vector of a review plus a constant 1.

    The text is lower-cased, stripped of punctuation characters and split on
    whitespace. Tokens shorter than 3 characters and stop words are dropped;
    the rest are stemmed and counted if the stem is in ``wordSet``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(wordSet) + 1``: per-word counts at the
        positions given by ``wordId``, followed by a trailing 1 (bias term).
    """
    counts = [0 for _ in range(len(wordSet))]

    cleaned = ''.join(ch for ch in text.lower() if ch not in punct)

    for token in cleaned.strip().split():
        # Same filtering as the counting pass: short tokens and stop words
        # are ignored before stemming.
        if len(token) < 3 or token in stop_words:
            continue
        stem = stemmer.stem(token)
        if stem in wordSet:
            counts[wordId[stem]] += 1

    # Bias term
    counts.append(1)

    return np.array(counts)

### Create feature matrices

In [71]:
# Training design matrix: for every training review, a 5-element indicator
# of the user's most frequent genre(s) followed by the bag-of-words vector.
X = []

for username, text in zip(train['user_id'], train['review_text']):

    if username in usersBook:
        # 1 for every genre that ties for the user's highest tally, else 0.
        tally = np.array(usersBook[username])
        user_predict = (tally == tally.max()).astype(int)
    else:
        # Unknown user: no genre preference information.
        user_predict = np.zeros(5, dtype=int)

    word_vector = feature(text)

    X.append(np.concatenate((user_predict, word_vector)))

X = np.array(X)
# Column-wise max scaling is left disabled:
# normalize(X, axis=0, norm='max', copy=False)
y = train['genreID'].values

In [72]:
# Validation design matrix, built with the same layout as the training one:
# 5-element user genre indicator followed by the bag-of-words vector.
Xva = []

for username, text in zip(valid['user_id'], valid['review_text']):

    if username not in usersBook:
        # User has no tally in usersBook: all-zero indicator.
        user_predict = np.zeros(5, dtype=int)
    else:
        # 1 for every genre that ties for the user's highest tally, else 0.
        tally = np.array(usersBook[username])
        user_predict = (tally == tally.max()).astype(int)

    word_vector = feature(text)

    Xva.append(np.concatenate((user_predict, word_vector)))

Xva = np.array(Xva)
# Column-wise max scaling is left disabled:
# normalize(Xva, axis=0, norm='max', copy=False)
yva = valid['genreID'].values

In [73]:
# Multinomial logistic regression on the training matrix; C=1000 means very
# weak regularisation. fit() returns the estimator itself, so the fitted
# model is bound to clf directly.
clf = LogisticRegression(
    C=1000,
    solver="lbfgs",
    multi_class="multinomial",
).fit(X, y)



In [85]:
# Inspect the coefficient matrix of the fitted classifier.
clf.coef_

array([[ 2.52884973, -0.13937378, -0.58251588, ...,  0.00995063,
        -0.07906772, -0.17233746],
       [-0.26116748,  2.71377385, -0.37871891, ..., -0.05450493,
         0.00605361, -0.18371277],
       [-0.84061008, -0.66507854,  1.79048439, ...,  0.11574403,
         0.04286056,  0.20891981],
       [-0.91464934, -1.02237062, -0.35696323, ..., -0.13261159,
         0.08661935,  0.11684677],
       [-0.51242284, -0.8869509 , -0.47228637, ...,  0.06142185,
        -0.0564658 ,  0.03028365]])

In [74]:
# Validation accuracy: fraction of validation rows whose predicted genre
# equals the true label.
pred = clf.predict(Xva)
correct = (pred == yva)

accuracy = correct.sum() / len(correct)
print("The accuracy of the regression model is", accuracy)

The accuracy of the regression model is 0.6949
