In [1]:
import numpy as np
import pandas as pd
import re

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
sns.set()
df = pd.read_csv("reviews.csv")
df.head()

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [3]:
df.tail()

Unnamed: 0,rating,review
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...
55999,positive,where else can you find all the parts and piec...


In [4]:
vocab = {}
def initialise_vocabulary():
    unknown_token = "<UNK>"
    vocab['t_2_i'] = {}
    vocab['i_2_t'] = {}
    vocab['unknownToken'] = unknown_token
    idx = add_token(unknown_token)
    vocab['unknownTokenIdx'] = idx

In [5]:
def add_token(token)-> int:
    if token in vocab['t_2_i']:
        idx = vocab['t_2_i'][token]
    else:
        idx = len(vocab['t_2_i'])
        vocab['t_2_i'][token] = idx
        vocab['i_2_t'][idx] = token
    return idx

In [6]:
def add_many_token(tokens)-> list:
    idxes = [add_token(token) for token in tokens]
    return idxes

In [7]:
def look_up_token(token)-> int:
    if vocab['unknownTokenIdx'] >= 0:
        return vocab['t_2_i'].get(token,vocab['unknownTokenIdx'])
    else:
        return vocab['t_2_i'][token] 

In [8]:
def look_up_index(idx):
    if idx not in vocab['i_2_t']:
        raise KeyError("The index (%d) is not there" %idx)
    return vocab['i_2_t'][idx]

In [9]:
def vocabulary_from_data_frame(df, cutoff=25):
    initialise_vocabulary()
    word_counts = Counter()
    for r in df.review:
        for word in re.split('\W+',r):
            word_counts[word] += 1

    for word,count in word_counts.items():
        if count > cutoff:
            add_token(word)

  for word in re.split('\W+',r):


In [10]:
def vocabulary_from_corpus(Corpus, cutoff=25):
    initialise_vocabulary()
    word_counts = Counter()
    for doc in Corpus:
        for word in re.split('\W+',doc):
            word_counts[word] += 1

    for word,count in word_counts.items():
        if count > cutoff:
            add_token(word)

  for word in re.split('\W+',doc):


In [11]:
# vocabulary_from_data_frame(df)
Corpus = np.asarray(df['review'])
vocabulary_from_corpus(Corpus)

In [12]:
print(look_up_token('the'))
print(look_up_token('book'))
print(len(vocab['t_2_i']))
print(len(vocab['i_2_t']))


38
215
8946
8946


In [13]:
vocab['i_2_t']

{0: '<UNK>',
 1: 'terrible',
 2: 'place',
 3: 'to',
 4: 'work',
 5: 'for',
 6: 'i',
 7: 'just',
 8: 'heard',
 9: 'a',
 10: 'story',
 11: 'of',
 12: 'them',
 13: 'find',
 14: 'girl',
 15: 'over',
 16: 'her',
 17: 'father',
 18: 'coming',
 19: 'in',
 20: 'there',
 21: 'who',
 22: 'she',
 23: 'hadn',
 24: 't',
 25: 'seen',
 26: 'years',
 27: 'said',
 28: 'hi',
 29: 'him',
 30: 'which',
 31: 'upset',
 32: 'his',
 33: 'wife',
 34: 'and',
 35: 'they',
 36: 'left',
 37: 'finished',
 38: 'the',
 39: 'rest',
 40: 'day',
 41: 'working',
 42: 'fine',
 43: 'next',
 44: 'when',
 45: 'went',
 46: 'into',
 47: 'fired',
 48: 'that',
 49: 'situation',
 50: 'one',
 51: 'texas',
 52: 'roadhouse',
 53: 'because',
 54: 'any',
 55: 'could',
 56: 'be',
 57: 'their',
 58: 'staff',
 59: 'does',
 60: 'not',
 61: 'deserve',
 62: 'my',
 63: 'business',
 64: 'yelp',
 65: 'wants',
 66: 'me',
 67: 'give',
 68: 'star',
 69: 'but',
 70: 'don',
 71: 'believe',
 72: 'it',
 73: '',
 74: 'hours',
 75: 'minutes',
 76: 'tot

In [14]:
# N is the length of the vocab
def one_hot_vector(token, N):
    one_hot = np.zeros((N,1))
    one_hot[look_up_token(token)] = 1
    return one_hot

In [15]:
N = len(vocab['t_2_i'])
print(one_hot_vector('worried', N))

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [16]:
def compute_features(doc,N):
    is_first = True
    for token in doc:
        one_hot = one_hot_vector(token,N)
        if is_first:
            xF = one_hot
            is_first = False
        else:
            xF = np.hstack((xF,one_hot))
    return np.mean(xF,axis=1)[:,np.newaxis]

In [17]:
def compute_features_fast(doc,N):
    feature_vector = np.zeros(N)
    num_tokens = 0
    for token in doc:
        feature_vector[look_up_token(token)] += 1
        num_tokens += 1
    return feature_vector/num_tokens

In [18]:
def corpus_to_feature_matrix(Corpus,N):
    is_first = True
    for doc in Corpus:
        feature_vector = compute_features(doc,N)
        if is_first:
            feature_matrix = feature_vector
            is_first = False
        else:
            feature_matrix = np.hstack((feature_matric,feature_vector))
    return feature_matrix.T 

In [19]:
def corpus_to_feature_matrix_fast(Corpus,N):
    feature_matrix = np.zeros((N,len(Corpus)))
    i = 0
    for doc in Corpus:
        feature_matrix[:,i] = compute_features_fast(doc,N)
        i += 1
    return feature_matrix.T

In [20]:
feature_vector = compute_features(Corpus[0],N)

In [21]:
feature_vector.shape

(8946, 1)

In [22]:
df

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...
...,...,...
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...


In [23]:
X = np.asarray(df['review'])
y = np.asarray(df['rating'])


In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,shuffle=True)

In [36]:
vocabulary_from_corpus(X_train)
N = len(vocab['i_2_t'])
X_train_feature_matrix = corpus_to_feature_matrix_fast(X_train,N)
X_test_feature_matrix = corpus_to_feature_matrix_fast(X_test,N)

In [None]:
X_train_feature_matrix.shape , X_test_feature_matrix.shape

In [38]:
model = LogisticRegression().fit(X_train_feature_matrix,y_train)

In [39]:
y_preds = model.predict(X_test_feature_matrix)

In [None]:
plot = confusion_matrix(y_test,y_preds)
sns.heatmap(plot.T,square = True,annot = True,fmt='d',cbar=False,xticklabels=np.unique(y),yticklabels=np.unique(y))
plt.xlabel("True Label")
plt.xlabel("Predicted Label")