In [3]:
import os
import string
import sys
from collections import Counter

import numpy as np
import pandas as pd
import torch

In [4]:
vocab = {} #global vocabulary store; initializeVocabulary() fills in 't_2_i' (token -> index), 'i_2_t' (index -> token) and the unknown-token settings

### Let's create our own tokenizer

In [5]:
def addToken(token):
    """Return the index of ``token``, registering it in ``vocab`` first if it is new.

    A new token receives the next free index (the current size of the
    token-to-index map) and is recorded in both lookup directions.
    """
    tokenToIndex = vocab['t_2_i']
    if token not in tokenToIndex:
        newIndex = len(tokenToIndex)
        tokenToIndex[token] = newIndex    # forward map: token -> index
        vocab['i_2_t'][newIndex] = token  # reverse map: index -> token
    return tokenToIndex[token]

In [6]:
def initializeVocabulary():
    """Reset both lookup maps in the global ``vocab`` and register the unknown-word token.

    After the call ``vocab`` holds empty-then-seeded 't_2_i' / 'i_2_t' maps
    plus the 'addUnk', 'unkToken' and 'unkTokenIdx' settings.
    """
    unknownWord = '<UNK>'
    # The maps must exist (and be empty) before addToken runs, so that the
    # unknown token is the first entry and therefore receives index 0.
    vocab.update({'t_2_i': {}, 'i_2_t': {}})
    unknownIndex = addToken(unknownWord)
    vocab.update({
        'addUnk': True,
        'unkToken': unknownWord,
        'unkTokenIdx': unknownIndex,
    })

In [7]:
def addManyTokens(tokens):
    """Register every token in ``tokens`` via addToken and return their indexes, in input order."""
    return list(map(addToken, tokens))

In [8]:
def lookUpToken(token):
    """Return the index stored for ``token``.

    When a non-negative unknown-token index is registered, tokens missing
    from the vocabulary map to that index. Otherwise a missing token raises
    KeyError.
    """
    unknownIndex = vocab['unkTokenIdx']
    tokenToIndex = vocab['t_2_i']
    if unknownIndex < 0:
        # No fallback configured: strict lookup.
        return tokenToIndex[token]
    return tokenToIndex.get(token, unknownIndex)

In [9]:
def lookUpIndex(idx):
    """Return the token stored at index ``idx``.

    Raises:
        KeyError: if ``idx`` is not a registered index.
    """
    indexToToken = vocab['i_2_t']
    if idx in indexToToken:
        return indexToToken[idx]
    raise KeyError("the index (%d) is not there" % idx)

In [10]:
#Use classes as it is neater
def vocabularyFromDataFrame(df,cutoff=25):
    """Rebuild the global vocabulary from the ``review`` column of ``df``.

    Reviews are split on single spaces and every word is counted, except
    words found inside ``string.punctuation``. That check is a substring
    test, so the empty strings produced by consecutive spaces are skipped
    as well. Only words that occur strictly more than ``cutoff`` times are
    added to the vocabulary, in order of first appearance.
    """
    initializeVocabulary()  # start from a vocabulary holding only the unknown token
    wordCounts = Counter(
        word
        for review in df.review
        for word in review.split(" ")
        if word not in string.punctuation
    )
    frequentWords = [word for word, count in wordCounts.items() if count > cutoff]
    for word in frequentWords:
        addToken(word)
    

In [11]:
# Location of the reviews CSV. The default is the original author's local path;
# set the REVIEWS_CSV environment variable to run the notebook on another machine.
REVIEWS_CSV = os.environ.get('REVIEWS_CSV', r'/home/rahul/WSL_Projects/ReferencesPython/Recurrent Neural Networks/RNNSentimentAnalysis/reviews.csv')
df = pd.read_csv(REVIEWS_CSV)

In [12]:
vocabularyFromDataFrame(df,cutoff=25)

In [13]:
lookUpToken('this')

128

In [14]:
lookUpIndex(3)

'to'

In [15]:
len(vocab['t_2_i']) #vocab included

8945

In [16]:
vocab

{'t_2_i': {'<UNK>': 0,
  'terrible': 1,
  'place': 2,
  'to': 3,
  'work': 4,
  'for': 5,
  'i': 6,
  'just': 7,
  'heard': 8,
  'a': 9,
  'story': 10,
  'of': 11,
  'them': 12,
  'find': 13,
  'girl': 14,
  'over': 15,
  'her': 16,
  'father': 17,
  'coming': 18,
  'in': 19,
  'there': 20,
  'who': 21,
  'she': 22,
  'hadn': 23,
  't': 24,
  'seen': 25,
  'years': 26,
  'said': 27,
  'hi': 28,
  'him': 29,
  'which': 30,
  'upset': 31,
  'his': 32,
  'wife': 33,
  'and': 34,
  'they': 35,
  'left': 36,
  'finished': 37,
  'the': 38,
  'rest': 39,
  'day': 40,
  'working': 41,
  'fine': 42,
  'next': 43,
  'when': 44,
  'went': 45,
  'into': 46,
  'fired': 47,
  'that': 48,
  'situation': 49,
  'one': 50,
  'texas': 51,
  'roadhouse': 52,
  'because': 53,
  'any': 54,
  'could': 55,
  'be': 56,
  'their': 57,
  'staff': 58,
  'does': 59,
  'not': 60,
  'deserve': 61,
  'my': 62,
  'business': 63,
  'yelp': 64,
  'wants': 65,
  'me': 66,
  'give': 67,
  'star': 68,
  'but': 69,
  'don':

In [17]:
def vectorize(review):
    """One-hot encode ``review`` into a matrix with one column per token.

    The review is split on single spaces. Tokens found inside
    ``string.punctuation`` (a substring test, so empty strings are dropped
    too) are skipped. Each remaining token becomes a column holding a single
    1 at the row given by lookUpToken(token).

    Returns:
        A float64 NumPy array of shape (len(vocab['t_2_i']), number of kept
        tokens). A review with no kept tokens yields a matrix with zero
        columns instead of failing on an unbound result variable.
    """
    tokens = [token for token in review.split(" ") if token not in string.punctuation]
    # Allocate the whole matrix once; stacking one column per token re-copies
    # the growing array every time and is quadratic in the review length.
    oneHot = np.zeros((len(vocab['t_2_i']), len(tokens)))
    if tokens:
        rows = [lookUpToken(token) for token in tokens]
        oneHot[rows, np.arange(len(tokens))] = 1  # column j gets its 1 at row rows[j]
    return oneHot

In [18]:
xF = vectorize(df['review'][1])

In [19]:
xF.shape #this review has 17 words and different review have different words

(8945, 17)

In [20]:
df #data set is huge we need to trim it

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...
...,...,...
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...


In [21]:
# Keep the first five reviews of each rating and stack them, positives first,
# into a tiny frame that is quick to experiment on.
smallDf_pos, smallDf_neg = (
    df.loc[df['rating'].eq(label)].head(5) for label in ('positive', 'negative')
)
df_small = pd.concat([smallDf_pos, smallDf_neg], axis=0)

In [22]:
df_small #make a small dataset

Unnamed: 0,rating,review
28000,positive,my experience was by far the most pleasant i h...
28001,positive,i have been to this place a couple of times on...
28002,positive,very popular sushi bar in the heart of old tow...
28003,positive,the staff is nice . it s pretty clean . they u...
28004,positive,my co worker picked up lunch for us from this ...
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [23]:
vocabularyFromDataFrame(df_small,cutoff=0) #rebuild vocab no cutoff

In [24]:
len(vocab['t_2_i'])

491

In [25]:
SEED = 42 # fixed seed so the random initial weights are the same on every run
rng = np.random.default_rng(SEED)

numFeatures = len(vocab['t_2_i']) # network architecture: one input row per vocabulary entry
hiddenUnits = 10 # size of the recurrent hidden state
h0 = torch.zeros((hiddenUnits,1),dtype=torch.float64) # initial hidden state, float64 like the weights below
# NOTE(review): every weight is drawn from uniform(0, 1), i.e. all positive; this
# presumably pushes tanh/sigmoid towards saturation - consider a small symmetric range.
Wx = torch.tensor(rng.uniform(0,1,(hiddenUnits,numFeatures)),requires_grad=True) # input -> hidden
Wh = torch.tensor(rng.uniform(0,1,(hiddenUnits,hiddenUnits)),requires_grad=True) # hidden -> hidden
Wy = torch.tensor(rng.uniform(0,1,(1,hiddenUnits)),requires_grad=True) # hidden -> output

In [26]:
def stepForward(xt,Wx,Wh,Wy,prevMemory):
    """Run one recurrent step and return ``(ht, yt_hat)``.

    ``xt`` is a NumPy array that gets a trailing axis added before the
    matrix product (callers in this notebook pass a single column of the
    one-hot matrix). The new hidden state is
    tanh(Wx @ x + Wh @ prevMemory) and the output is sigmoid(Wy @ ht).
    """
    inputColumn = torch.from_numpy(xt[:, None])
    preActivation = Wx @ inputColumn + Wh @ prevMemory
    ht = preActivation.tanh()
    yt_hat = (Wy @ ht).sigmoid()
    return ht, yt_hat

In [27]:
def fullForwardRNN(X,Wx,Wh,Wy,prevMemory):
    """Feed the columns of ``X`` through stepForward one by one and return the final output.

    The hidden state produced by each step is the memory for the next one.
    Only the last step's output is kept; intermediate outputs are discarded.
    If ``X`` has no columns the integer 0 is returned.
    """
    lastOutput = 0
    memory = prevMemory
    numSteps = X.shape[1]
    for t in range(numSteps):
        memory, lastOutput = stepForward(X[:, t], Wx, Wh, Wy, memory)
    return lastOutput

In [28]:
def computeLoss(y,y_hat):
    """Mean binary cross-entropy between targets and predictions, in bits (log base 2).

    Args:
        y: sequence of targets; an entry equal to 1 is the positive class,
            anything else is treated as the negative class.
        y_hat: sequence of floating-point tensors holding the predicted
            probability of the positive class, one per target.

    Returns:
        The summed per-example loss divided by ``len(y)``, as a tensor that
        stays attached to the autograd graph.

    Raises:
        ValueError: if ``y`` is empty or ``y`` and ``y_hat`` differ in length.
    """
    if len(y) == 0:
        raise ValueError("computeLoss needs at least one target")
    if len(y) != len(y_hat):
        # zip() would silently drop the extra entries while still dividing by len(y)
        raise ValueError("y and y_hat must have the same length (%d != %d)" % (len(y), len(y_hat)))
    loss = 0
    for yi,yi_hat in zip(y,y_hat):
        # Keep probabilities away from exactly 0 and 1: log2(0) is -inf, which
        # makes the loss infinite and the gradients NaN.
        eps = torch.finfo(yi_hat.dtype).eps
        prob = torch.clamp(yi_hat, eps, 1 - eps)
        if yi == 1:
            loss = loss - torch.log2(prob)
        else:
            loss = loss - torch.log2(1 - prob)
    return loss/len(y)

In [29]:
def updateParams(Wx,Wh,Wy,dWx,dWh,dWy,lr):
    """Apply one gradient-descent step to each weight in place and return the three weights.

    Every weight is decreased by ``lr`` times its gradient. The update runs
    under ``torch.no_grad()`` so it is not recorded by autograd.
    """
    with torch.no_grad():
        for weight, gradient in ((Wx, dWx), (Wh, dWh), (Wy, dWy)):
            weight.sub_(lr * gradient)
    return Wx, Wh, Wy

In [30]:
def trainRNN(train_df,Wx,Wh,Wy,prevMemory,lr,nepoch):
    """Train the RNN weights with full-batch gradient descent.

    In every epoch each review in ``train_df['review']`` is vectorized and
    run through the network starting from ``prevMemory``. The loss over all
    reviews is computed once, back-propagated, and followed by a single
    weight update.

    Args:
        train_df: mapping/frame with parallel 'review' and 'rating' columns;
            a rating of 'positive' is target 1, anything else is target 0.
        Wx, Wh, Wy: weight tensors that track gradients; updated in place.
        prevMemory: initial hidden state used for every review.
        lr: learning rate passed to updateParams.
        nepoch: number of epochs to run.

    Returns:
        ``(Wx, Wh, Wy, losses)`` where ``losses`` holds one detached loss
        tensor per epoch.
    """
    losses = []
    for epoch in range(nepoch): #apply batch gradient descent
        y,y_hat = [],[]
        for rv,rt in zip(train_df['review'],train_df['rating']):
            y.append(1 if rt == 'positive' else 0) #convert negative or positive to 0 or 1
            y_hat.append(fullForwardRNN(vectorize(rv),Wx,Wh,Wy,prevMemory))

        loss = computeLoss(y,y_hat)
        loss.backward()
        # Store a detached copy: keeping the attached tensor would hold every
        # epoch's autograd graph in memory for as long as the list lives.
        losses.append(loss.detach())
        print("Loss after epoch=%d: %f" %(epoch,loss.item()))
        sys.stdout.flush()
        Wx,Wh,Wy = updateParams(Wx,Wh,Wy,Wx.grad,Wh.grad,Wy.grad,lr)
        # backward() accumulates into .grad, so clear it before the next epoch
        for weight in (Wx,Wh,Wy):
            weight.grad.zero_()
    return Wx,Wh,Wy,losses

In [31]:
Wx,Wh,Wy,losses = trainRNN(df_small,Wx,Wh,Wy,h0,0.01,50)

Loss after epoch=0: 3.964508
Loss after epoch=1: 3.913349
Loss after epoch=2: 3.862255
Loss after epoch=3: 3.811228
Loss after epoch=4: 3.760276
Loss after epoch=5: 3.709401
Loss after epoch=6: 3.658612
Loss after epoch=7: 3.607912
Loss after epoch=8: 3.557309
Loss after epoch=9: 3.506810
Loss after epoch=10: 3.456421
Loss after epoch=11: 3.406151
Loss after epoch=12: 3.356008
Loss after epoch=13: 3.306000
Loss after epoch=14: 3.256137
Loss after epoch=15: 3.206429
Loss after epoch=16: 3.156887
Loss after epoch=17: 3.107520
Loss after epoch=18: 3.058343
Loss after epoch=19: 3.009366
Loss after epoch=20: 2.960603
Loss after epoch=21: 2.912069
Loss after epoch=22: 2.863777
Loss after epoch=23: 2.815743
Loss after epoch=24: 2.767985
Loss after epoch=25: 2.720518
Loss after epoch=26: 2.673360
Loss after epoch=27: 2.626531
Loss after epoch=28: 2.580050
Loss after epoch=29: 2.533938
Loss after epoch=30: 2.488215
Loss after epoch=31: 2.442904
Loss after epoch=32: 2.398027
Loss after epoch=33:

In [35]:
r = df['review'].iloc[6]
y = df['rating'].iloc[6]

In [36]:
print(r,y)

i had an appointment that was made months in advance and when i turned up they told me that the person i had the appointment with quit weeks ago and they didn t have anyone available to take me . they made no attempt to reschedule me with someone else or call me to cancel . to make things worse , the battleax at the front desk had such an attitude . here s a tip , when you are in the beauty salon business , staff the front desk with someone who looks like they have actually been to a salon . not with rude women that have hairdos from years ago . will never go back .  negative


In [37]:
X = vectorize(r)

In [38]:
y_hat = fullForwardRNN(X,Wx,Wh,Wy,h0)

In [39]:
y_hat

tensor([[0.8929]], dtype=torch.float64, grad_fn=<SigmoidBackward0>)

In [40]:
y

'negative'