In [1]:
import os
import string
import sys
from collections import Counter

import numpy as np
import pandas as pd
import torch

In [2]:
# Module-level vocabulary state shared by the helper functions in this notebook.
# initializeVocabulary() fills it with the 't_2_i' (token -> index) and
# 'i_2_t' (index -> token) maps plus the unknown-token bookkeeping entries.
vocab = {}

In [3]:
def addToken(token):
    """Return the index of ``token``, registering it in ``vocab`` if it is new.

    A new token receives the next free index (the current size of the
    token -> index map) and is recorded in both lookup directions.
    """
    tokenToIndex = vocab['t_2_i']
    if token not in tokenToIndex:
        newIndex = len(tokenToIndex)
        tokenToIndex[token] = newIndex
        vocab['i_2_t'][newIndex] = token
    return tokenToIndex[token]

In [4]:
def initializeVocabulary():
    """Reset the global ``vocab`` and seed it with the unknown-token entry.

    Both lookup maps are replaced with empty dicts, so previously registered
    tokens are discarded. The '<UNK>' token is then added through addToken and
    its index is stored under 'unkTokenIdx'.
    """
    unkToken = '<UNK>'
    vocab.update({'t_2_i': {}, 'i_2_t': {}})
    unkIdx = addToken(unkToken)
    vocab.update({'addUnk': True, 'unkToken': unkToken, 'unkTokenIdx': unkIdx})

In [5]:
def addManyTokens(tokens):
    """Register every token in ``tokens`` and return their indices in input order."""
    return list(map(addToken, tokens))

In [6]:
def lookuptoken(token):
    """Return the vocabulary index for ``token``.

    When the stored unknown-token index is non-negative, tokens missing from
    the vocabulary map to that index; otherwise a missing token raises KeyError.
    """
    unkIdx = vocab['unkTokenIdx']
    tokenToIndex = vocab['t_2_i']
    if unkIdx >= 0:
        return tokenToIndex.get(token, unkIdx)
    return tokenToIndex[token]

In [7]:
def lookupidx(idx):
    """Return the token stored at vocabulary index ``idx``.

    Args:
        idx: key into the index -> token map of the global ``vocab``.

    Returns:
        The token registered under ``idx``.

    Raises:
        KeyError: if ``idx`` is not present in the index -> token map.
    """
    if idx not in vocab['i_2_t']:
        # Must be the built-in KeyError (capital K); %s with a 1-tuple keeps the
        # message formatting from failing on non-integer or tuple indices.
        raise KeyError("the index (%s) is not there" % (idx,))
    return vocab['i_2_t'][idx]

In [8]:
# Only tokens that occur more than `cutoff` times are added to the vocabulary.
def vocabularyFromDataFrame(df,cutoff=25):
    """Rebuild the global vocabulary from the ``review`` column of ``df``.

    Each review is split on single spaces and the tokens are counted across
    all reviews. A token is skipped when ``token in string.punctuation`` holds;
    because that is a substring test it also skips the empty strings produced
    by consecutive spaces. Tokens counted more than ``cutoff`` times are then
    registered in first-seen order.
    """
    initializeVocabulary()
    wordCounts = Counter()
    for review in df.review:
        wordCounts.update(
            token for token in review.split(" ") if token not in string.punctuation
        )
    frequentTokens = [token for token, count in wordCounts.items() if count > cutoff]
    for token in frequentTokens:
        addToken(token)

In [9]:
# Location of the reviews dataset. Set the REVIEWS_CSV environment variable to
# point at a local copy; when it is unset the original machine-specific path
# is used, so existing setups keep working.
REVIEWS_CSV = os.environ.get(
    "REVIEWS_CSV",
    r"C:\Users\Sub\CodeSpace\AI\mastering_recurrent_neural_networks\Data\reviews.csv",
)
df = pd.read_csv(REVIEWS_CSV)

In [10]:
# Build the global vocabulary from the full dataset with the default cutoff (25).
vocabularyFromDataFrame(df)

In [11]:
# Spot-check the token -> index direction for a common word.
lookuptoken('this')

128

In [12]:
# Spot-check the index -> token direction using the index shown for 'this'.
lookupidx(128)

'this'

In [13]:
def vectorize(review):
    """One-hot encode a review into a (vocabulary size, token count) matrix.

    The review is split on single spaces. Tokens for which
    ``token in string.punctuation`` holds are skipped; that covers punctuation
    marks and the empty strings produced by consecutive spaces. Each remaining
    token becomes one column holding a 1 in the row returned by ``lookuptoken``.

    Args:
        review: review text as a string.

    Returns:
        A float numpy array with one row per vocabulary entry and one column
        per kept token. A review with no kept tokens yields zero columns.
    """
    tokens = [token for token in review.split(" ") if token not in string.punctuation]
    # Allocate the full matrix once: growing it with np.hstack per token copies
    # the whole matrix on every step and has no result for token-less reviews.
    oneHot = np.zeros((len(vocab['t_2_i']), len(tokens)))
    for column, token in enumerate(tokens):
        oneHot[lookuptoken(token), column] = 1
    return oneHot

In [14]:
# One-hot encode the review at index 1 of the full dataset as a quick check.
xF = vectorize(df['review'][1])

In [15]:
# Rows = vocabulary size, columns = number of tokens kept from the review.
xF.shape

(8945, 17)

In [16]:
# Small balanced subset: the first five positive and the first five negative reviews.
smallDef_pos = df.loc[df['rating'] == 'positive'].head(5)
smallDef_neg = df.loc[df['rating'] == 'negative'].head(5)
df_small = pd.concat([smallDef_pos, smallDef_neg])

In [17]:
# Display the subset: positive rows first, then negative rows.
df_small

Unnamed: 0,rating,review
28000,positive,my experience was by far the most pleasant i h...
28001,positive,i have been to this place a couple of times on...
28002,positive,very popular sushi bar in the heart of old tow...
28003,positive,the staff is nice . it s pretty clean . they u...
28004,positive,my co worker picked up lunch for us from this ...
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [18]:
# Rebuild the global vocabulary from the small subset only. cutoff=0 keeps every
# token that occurs at least once. This replaces the vocabulary built from the
# full dataset, so encodings computed before this cell no longer match it.
vocabularyFromDataFrame(df_small,cutoff=0)

In [19]:
# Vocabulary size, including the '<UNK>' entry.
len(vocab['t_2_i'])

491

In [64]:
# Input width is the vocabulary size (one row per token in the one-hot encoding).
numFeatures = len(vocab['t_2_i'])
numhiddenUnits = 10
# Seeded local generator so the random initial weights are reproducible after a
# kernel restart, without touching NumPy's global random state.
SEED = 0
rng = np.random.default_rng(SEED)
# Initial hidden state: a float64 column of zeros.
h0 = torch.tensor(np.zeros((numhiddenUnits,1)))
# Weights are drawn uniformly from [0, 1); requires_grad=True lets autograd
# accumulate gradients for them during training.
Wx = torch.tensor(rng.uniform(0,1,(numhiddenUnits,numFeatures)),requires_grad=True)
Wh = torch.tensor(rng.uniform(0,1,(numhiddenUnits,numhiddenUnits)),requires_grad=True)
Wy = torch.tensor(rng.uniform(0,1,(1,numhiddenUnits)),requires_grad=True)

In [65]:
def stepForward(xt,Wx,Wh,Wy,prevMemory):
    """Run one recurrent step and return ``(ht, y_hat)``.

    Args:
        xt: numpy array for the current input; it is expanded to a column with
            ``np.newaxis`` before use (expected to be 1-D).
        Wx: input-to-hidden weight tensor.
        Wh: hidden-to-hidden weight tensor.
        Wy: hidden-to-output weight tensor.
        prevMemory: hidden state tensor from the previous step.

    Returns:
        ht: ``tanh(Wx @ x + Wh @ prevMemory)``, the new hidden state.
        y_hat: ``sigmoid(Wy @ ht)``, the output for this step.
    """
    inputColumn = torch.from_numpy(xt[:, np.newaxis])
    preActivation = Wx @ inputColumn + Wh @ prevMemory
    ht = torch.tanh(preActivation)
    y_hat = torch.sigmoid(Wy @ ht)
    return ht, y_hat

In [66]:
def fullForward(X,Wx,Wh,Wy,prevMemory):
    """Feed the columns of ``X`` through ``stepForward`` in order.

    The hidden state returned by each step is passed to the next one. The
    output of the final step is returned; if ``X`` has no columns the integer
    0 is returned instead.
    """
    prediction = 0
    memory = prevMemory
    # Iterating over X.T visits the columns of X one at a time.
    for xt in X.T:
        memory, prediction = stepForward(xt, Wx, Wh, Wy, memory)
    return prediction

In [67]:
def computeLoss(y,y_hat):
    """Average binary cross-entropy in bits (log base 2) over paired labels and predictions.

    A label equal to 1 contributes ``-log2(prediction)``; any other label
    contributes ``-log2(1 - prediction)``. The sum is divided by ``len(y)``.
    """
    total = 0
    for label, predicted in zip(y, y_hat):
        likelihood = predicted if label == 1 else 1 - predicted
        total += -torch.log2(likelihood)
    return total/len(y)

In [68]:
def updateParams(Wx,Wh,Wy,dWx,dWh,dWy,lr):
    """Apply one gradient-descent step to the three weight tensors.

    Each weight is decreased by ``lr`` times its gradient. The subtraction runs
    under ``torch.no_grad()`` so the update is not recorded by autograd.
    Returns ``(Wx, Wh, Wy)``.
    """
    with torch.no_grad():
        stepX = lr*dWx
        Wx -= stepX
        stepY = lr*dWy
        Wy -= stepY
        stepH = lr*dWh
        Wh -= stepH
    return Wx, Wh, Wy

In [69]:
def trainRNN(train_df,Wx,Wh,Wy,prevMemory,lr,nepochs):
    """Train the RNN weights with full-batch gradient descent.

    In every epoch each review in ``train_df`` is vectorized and passed through
    ``fullForward`` starting from ``prevMemory``. The rating 'positive' maps to
    label 1 and any other rating to 0. The loss over all reviews is
    backpropagated once, the weights are updated with learning rate ``lr``,
    and the accumulated gradients are reset to zero.

    Returns:
        ``(Wx, Wh, Wy, losses)`` where ``losses`` holds the loss tensor of
        every epoch in order.
    """
    losses = []
    for epoch in range(nepochs):
        y, y_hat = [], []
        for rating, review in zip(train_df['rating'], train_df['review']):
            y_hat.append(fullForward(vectorize(review), Wx, Wh, Wy, prevMemory))
            y.append(1 if rating == 'positive' else 0)
        loss = computeLoss(y, y_hat)
        loss.backward()
        losses.append(loss)
        print("Loss after epoch %d : %f" %(epoch,loss))
        sys.stdout.flush()
        Wx, Wh, Wy = updateParams(
            Wx, Wh, Wy, Wx.grad.data, Wh.grad.data, Wy.grad.data, lr
        )
        # Clear the accumulated gradients so the next epoch starts from zero.
        for weight in (Wx, Wh, Wy):
            weight.grad.data.zero_()
    return Wx, Wh, Wy, losses

In [74]:
# Train on the small subset. The weights are updated in place and reassigned
# here, so re-running this cell continues from the already-trained weights.
Wx, Wh, Wy, losses = trainRNN(df_small, Wx, Wh, Wy, prevMemory=h0, lr=0.01, nepochs=50)

Loss after epoch 0 : 2.620737
Loss after epoch 1 : 2.574303
Loss after epoch 2 : 2.528239
Loss after epoch 3 : 2.482568
Loss after epoch 4 : 2.437312
Loss after epoch 5 : 2.392493
Loss after epoch 6 : 2.348134
Loss after epoch 7 : 2.304260
Loss after epoch 8 : 2.260896
Loss after epoch 9 : 2.218067
Loss after epoch 10 : 2.175799
Loss after epoch 11 : 2.134117
Loss after epoch 12 : 2.093048
Loss after epoch 13 : 2.052619
Loss after epoch 14 : 2.012855
Loss after epoch 15 : 1.973783
Loss after epoch 16 : 1.935428
Loss after epoch 17 : 1.897816
Loss after epoch 18 : 1.860972
Loss after epoch 19 : 1.824919
Loss after epoch 20 : 1.789679
Loss after epoch 21 : 1.755275
Loss after epoch 22 : 1.721726
Loss after epoch 23 : 1.689051
Loss after epoch 24 : 1.657266
Loss after epoch 25 : 1.626386
Loss after epoch 26 : 1.596424
Loss after epoch 27 : 1.567391
Loss after epoch 28 : 1.539293
Loss after epoch 29 : 1.512138
Loss after epoch 30 : 1.485928
Loss after epoch 31 : 1.460665
Loss after epoch 3

In [105]:
# Pick the row at position 9 of the subset: its review text and its rating label.
v = df_small['review'].iloc[9]
y = df_small['rating'].iloc[9]

In [106]:
# One-hot encode the selected review with the current vocabulary.
X = vectorize(v)

In [107]:
# Forward pass with the current weights, starting from the initial hidden state h0.
y_hat = fullForward(X,Wx,Wh,Wy,h0)

In [108]:
# Sigmoid output for the review; training used label 1 for 'positive' ratings.
y_hat

tensor([[0.7136]], dtype=torch.float64, grad_fn=<SigmoidBackward0>)

In [109]:
# Ground-truth rating of the same review, for comparison with y_hat.
y

'negative'