In [2]:
import os
import string
import sys
from collections import Counter

import numpy as np
import pandas as pd
import torch

In [3]:
# Module-level vocabulary state shared by the helper functions in this
# notebook; its keys are filled in by initializeVocabulary().
vocab = {}

In [4]:
def initializeVocabulary():
    """Reset the global ``vocab`` so it holds only the unknown-word token.

    Both lookup tables (``'t_2_i'``: token -> index, ``'i_2_t'``: index ->
    token) are replaced with empty dicts, ``'<UNK>'`` is registered through
    ``addToken``, and the ``'addUnk'``, ``'unkToken'`` and ``'unkTokenIdx'``
    entries are stored. Returns nothing.
    """
    # The two tables must exist before addToken() can register anything.
    vocab['t_2_i'], vocab['i_2_t'] = {}, {}
    placeholder = '<UNK>'
    vocab.update(
        addUnk=True,
        unkToken=placeholder,
        unkTokenIdx=addToken(placeholder),
    )

In [5]:
def addToken(token):
    """Return the index of ``token``, registering it in ``vocab`` if unseen.

    A new token receives the next free index (the current size of the
    token -> index table) and is recorded in both ``vocab['t_2_i']`` and
    ``vocab['i_2_t']``. A token that is already known keeps its index.
    """
    tokenToIndex = vocab['t_2_i']
    if token not in tokenToIndex:
        freshIndex = len(tokenToIndex)
        tokenToIndex[token] = freshIndex
        vocab['i_2_t'][freshIndex] = token
    return tokenToIndex[token]

In [6]:
def addManyTokens(tokens):
    """Register every token in ``tokens`` via ``addToken``.

    Returns a list with the index of each token, in input order.
    """
    return list(map(addToken, tokens))

In [7]:
def lookUpToken(token):
    """Return the vocabulary index of ``token``.

    When ``vocab['unkTokenIdx']`` is non-negative, unknown tokens map to that
    index. Otherwise an unknown token raises ``KeyError``.
    """
    fallbackIndex = vocab['unkTokenIdx']
    if fallbackIndex < 0:
        # No unknown-token slot configured: let the missing key propagate.
        return vocab['t_2_i'][token]
    return vocab['t_2_i'].get(token, fallbackIndex)

In [8]:
def lookUpIndex(idx):
    """Return the token stored at vocabulary index ``idx``.

    Raises
    ------
    KeyError
        If ``idx`` is not a key of ``vocab['i_2_t']``.
    """
    indexToToken = vocab['i_2_t']
    if idx not in indexToToken:
        # %r rather than %d: a non-numeric idx must still surface as the
        # documented KeyError instead of a TypeError from string formatting.
        # The 1-tuple keeps a tuple-valued idx from being unpacked by %.
        raise KeyError("the index (%r) is not there" % (idx,))
    return indexToToken[idx]

In [9]:
def vocabularyFromDataFrame(df,cutoff=25):
    """Rebuild the global vocabulary from the ``review`` column of ``df``.

    The vocabulary is first reset with ``initializeVocabulary``. Each review
    is split on single spaces, tokens for which
    ``token in string.punctuation`` holds are ignored, and every remaining
    word seen strictly more than ``cutoff`` times is registered with
    ``addToken`` in first-seen order. Returns nothing.
    """
    initializeVocabulary()
    frequencies = Counter(
        piece
        for text in df.review
        for piece in text.split(" ")
        if piece not in string.punctuation
    )
    for piece, timesSeen in frequencies.items():
        if timesSeen > cutoff:
            addToken(piece)
    

In [10]:
# Location of the reviews CSV. Set the REVIEWS_CSV environment variable to
# point at a copy elsewhere; otherwise the original local path is used.
REVIEWS_CSV = os.environ.get('REVIEWS_CSV', r'C:\Users\DeLL\Desktop\ML\Data\reviews.csv')
df = pd.read_csv(REVIEWS_CSV)

In [11]:
# Build the vocabulary from the full DataFrame, keeping only words that
# occur more than 25 times.
vocabularyFromDataFrame(df,cutoff=25)

In [12]:
# Index assigned to the token 'this' in the current vocabulary.
lookUpToken('this')

128

In [13]:
# Reverse lookup: map index 128 back to its token.
lookUpIndex(128)

'this'

In [14]:
# Number of entries in the token -> index table (the '<UNK>' entry included).
len(vocab['t_2_i'])

8945

In [15]:
def vectorize(review):
    """One-hot encode a review as a (vocabulary size, token count) matrix.

    The review is split on single spaces. Tokens for which
    ``token in string.punctuation`` holds (this includes the empty string)
    are dropped, and each remaining token is mapped to a row index with
    ``lookUpToken``. Column ``t`` of the result holds a 1 in the row of the
    ``t``-th kept token and 0 everywhere else.

    Parameters
    ----------
    review : str
        Space-separated review text.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(vocab['t_2_i']), number of kept tokens)``.
        A review without any usable token yields a matrix with zero columns
        (the previous implementation raised ``UnboundLocalError`` here).
    """
    rowIndices = np.asarray(
        [
            lookUpToken(token)
            for token in review.split(" ")
            if token not in string.punctuation
        ],
        dtype=np.intp,
    )
    numTokens = rowIndices.size
    # Allocate the matrix once and fill it with a single indexed assignment;
    # hstack-ing one column per token re-copied the whole matrix every step.
    oneHot = np.zeros((len(vocab['t_2_i']), numTokens))
    oneHot[rowIndices, np.arange(numTokens)] = 1
    return oneHot

In [18]:
# One-hot encode the review stored under index label 1.
xF = vectorize(df['review'][1])

In [19]:
# (vocabulary size, number of tokens kept from the review)
xF.shape

(8945, 17)

In [20]:
# Display the reviews DataFrame (columns: rating, review).
df

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...
...,...,...
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...


In [21]:
# Tiny balanced subset for quick experiments: the first five positive and the
# first five negative reviews, stacked row-wise (original index labels kept).
smallDf_pos = df.loc[df['rating'] == 'positive'].head(5)
smallDf_neg = df.loc[df['rating'] == 'negative'].head(5)
df_small = pd.concat([smallDf_pos, smallDf_neg], axis='index')

In [22]:
# Display the 10-row subset (five positive rows followed by five negative).
df_small

Unnamed: 0,rating,review
28000,positive,my experience was by far the most pleasant i h...
28001,positive,i have been to this place a couple of times on...
28002,positive,very popular sushi bar in the heart of old tow...
28003,positive,the staff is nice . it s pretty clean . they u...
28004,positive,my co worker picked up lunch for us from this ...
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [24]:
# Rebuild the global vocabulary from the small subset only. cutoff=0 keeps
# every word that occurs at least once. This replaces the lookup tables
# built earlier from the full DataFrame.
vocabularyFromDataFrame(df_small,cutoff=0)

In [25]:
# Vocabulary size after rebuilding from df_small (the '<UNK>' entry included).
len(vocab['t_2_i'])

491

In [26]:
# Model dimensions: one input feature per vocabulary entry, 10 hidden units.
numFeatures = len(vocab['t_2_i'])
hiddenUnits = 10
# Dedicated, seeded generator so that Restart & Run All draws the same
# initial weights every time without touching NumPy's global RNG state.
SEED = 0
weightRng = np.random.RandomState(SEED)
# Initial hidden state: float64 column of zeros, no gradient tracked.
h0 = torch.tensor(np.zeros((hiddenUnits,1)))
# Trainable float64 weights drawn uniformly from [0, 1):
#   Wx: input -> hidden   (hiddenUnits, numFeatures)
#   Wh: hidden -> hidden  (hiddenUnits, hiddenUnits)
#   Wy: hidden -> output  (1, hiddenUnits)
Wx = torch.tensor(weightRng.uniform(0,1,(hiddenUnits,numFeatures)),requires_grad=True)
Wh = torch.tensor(weightRng.uniform(0,1,(hiddenUnits,hiddenUnits)),requires_grad=True)
Wy = torch.tensor(weightRng.uniform(0,1,(1,hiddenUnits)),requires_grad=True)

In [50]:
def stepForward(xt,Wx,Wh,Wy,prevMemory):
    """Advance the recurrent cell by one time step.

    ``xt`` is a 1-D NumPy vector; it is turned into a column tensor before
    being multiplied by ``Wx``. The new hidden state is
    ``tanh(Wx @ x + Wh @ prevMemory)`` and the step output is
    ``sigmoid(Wy @ ht)``.

    Returns
    -------
    tuple
        ``(ht, yt_hat)``: the new hidden state and the output of this step.
    """
    inputColumn = torch.from_numpy(xt[:, None])
    preActivation = Wx @ inputColumn + Wh @ prevMemory
    ht = preActivation.tanh()
    return ht, (Wy @ ht).sigmoid()

In [51]:
def fullForwardRNN(X,Wx,Wh,Wy,prevMemory):
    """Run the recurrent cell over every column of ``X`` in order.

    Each column ``X[:, t]`` is fed to ``stepForward`` together with the
    hidden state produced by the previous step (``prevMemory`` for the
    first one). Only the output of the final step is returned. If ``X`` has
    no columns, the integer ``0`` is returned.
    """
    memory, lastOutput = prevMemory, 0
    for step in range(X.shape[1]):
        memory, lastOutput = stepForward(X[:, step], Wx, Wh, Wy, memory)
    return lastOutput

In [52]:
def computeLoss(y,y_hat):
    """Mean binary cross-entropy, measured in bits (base-2 logarithm).

    For each (label, prediction) pair the penalty is ``-log2(prediction)``
    when the label equals 1 and ``-log2(1 - prediction)`` otherwise. The
    penalties are summed and divided by ``len(y)``.
    """
    penalties = [
        -torch.log2(predicted if actual == 1 else 1 - predicted)
        for actual, predicted in zip(y, y_hat)
    ]
    return sum(penalties) / len(y)

In [53]:
def updateParams(Wx,Wh,Wy,dWx,dWh,dWy,lr):
    """Apply one gradient-descent step to the three weight matrices.

    Each weight has ``lr`` times its gradient subtracted with ``-=`` inside
    ``torch.no_grad()``, so the update itself is not recorded by autograd.

    Returns
    -------
    tuple
        ``(Wx, Wh, Wy)`` after the update.
    """
    stepped = []
    with torch.no_grad():
        for weight, gradient in zip((Wx, Wh, Wy), (dWx, dWh, dWy)):
            weight -= lr * gradient
            stepped.append(weight)
    return tuple(stepped)

In [54]:
def trainRNN(train_df,Wx,Wh,Wy,prevMemory,lr,nepoch):
    """Train the RNN weights with full-batch gradient descent.

    In every epoch each review is vectorized and run through
    ``fullForwardRNN`` starting from ``prevMemory``. The mean loss over all
    reviews is back-propagated, the weights are updated once with
    ``updateParams``, and the accumulated gradients are cleared.

    Parameters
    ----------
    train_df : mapping with ``'review'`` and ``'rating'`` entries
        Reviews and their labels; a rating equal to ``'positive'`` is
        encoded as 1, anything else as 0.
    Wx, Wh, Wy : torch.Tensor
        Trainable weights (``requires_grad=True``); updated in place.
    prevMemory : torch.Tensor
        Initial hidden state used for every review.
    lr : float
        Learning rate.
    nepoch : int
        Number of passes over ``train_df``.

    Returns
    -------
    tuple
        ``(Wx, Wh, Wy, losses)`` where ``losses`` holds one loss tensor per
        epoch, detached from the autograd graph.
    """
    losses = []
    for epoch in range(nepoch):
        y,y_hat = [],[]
        for rv,rt in zip(train_df['review'],train_df['rating']):
            X = vectorize(rv)
            y_hat.append(fullForwardRNN(X,Wx,Wh,Wy,prevMemory))
            y.append(1 if rt == 'positive' else 0)

        loss = computeLoss(y,y_hat)
        loss.backward()
        # Keep a detached copy: storing the graph-attached loss would hold a
        # reference to every epoch's autograd history for the whole run.
        losses.append(loss.detach())
        print("Loss after epoch=%d: %f" %(epoch,loss.item()))
        sys.stdout.flush()
        Wx,Wh,Wy = updateParams(Wx,Wh,Wy,Wx.grad,Wh.grad,Wy.grad,lr)
        # backward() accumulates into .grad, so clear it before the next epoch.
        for weight in (Wx,Wh,Wy):
            weight.grad.zero_()
    return Wx,Wh,Wy,losses

In [57]:
# Train on the 10-review subset: learning rate 0.01, 50 epochs, starting each
# review from the zero hidden state h0.
Wx,Wh,Wy,losses = trainRNN(df_small,Wx,Wh,Wy,h0,0.01,50)

Loss after epoch=0: 2.592204
Loss after epoch=1: 2.546002
Loss after epoch=2: 2.500184
Loss after epoch=3: 2.454772
Loss after epoch=4: 2.409788
Loss after epoch=5: 2.365255
Loss after epoch=6: 2.321197
Loss after epoch=7: 2.277640
Loss after epoch=8: 2.234607
Loss after epoch=9: 2.192125
Loss after epoch=10: 2.150219
Loss after epoch=11: 2.108915
Loss after epoch=12: 2.068241
Loss after epoch=13: 2.028222
Loss after epoch=14: 1.988884
Loss after epoch=15: 1.950253
Loss after epoch=16: 1.912356
Loss after epoch=17: 1.875216
Loss after epoch=18: 1.838857
Loss after epoch=19: 1.803304
Loss after epoch=20: 1.768578
Loss after epoch=21: 1.734698
Loss after epoch=22: 1.701686
Loss after epoch=23: 1.669557
Loss after epoch=24: 1.638327
Loss after epoch=25: 1.608010
Loss after epoch=26: 1.578618
Loss after epoch=27: 1.550158
Loss after epoch=28: 1.522638
Loss after epoch=29: 1.496063
Loss after epoch=30: 1.470434
Loss after epoch=31: 1.445750
Loss after epoch=32: 1.422008
Loss after epoch=33:

In [75]:
# Pick the row at position 6 of df_small (positional, not label-based) as a
# spot-check example: its review text and its rating.
r = df_small['review'].iloc[6]
y = df_small['rating'].iloc[6]

In [76]:
# One-hot encode the selected review with the current vocabulary.
X = vectorize(r)

In [77]:
# Forward pass with the current weights, starting from the zero hidden state.
y_hat = fullForwardRNN(X,Wx,Wh,Wy,h0)

In [78]:
# Sigmoid output of the last time step; trainRNN encodes 'positive' as 1, so
# larger values lean towards the positive class.
y_hat

tensor([[0.7097]], dtype=torch.float64, grad_fn=<SigmoidBackward>)

In [79]:
# True rating of the selected review, for comparison with y_hat.
y

'negative'