## Connecting to S3 Bucket 

In [1]:
# codes for s3 retrieval
# import boto3
# import io
# bucket = 'textgenerationbucket'
# key = 'text_data/Reviews.csv'
# s3_client = boto3.client('s3')
# obj = s3_client.get_object(Bucket=bucket, Key=key)

In [29]:
# Importing Nessesary Packages:

import pandas as pd
import os
import re
import numpy as np
import torch
import torch.nn as nn

## Reading The Data

In [3]:
# Reading the Data:

df = pd.read_csv('../Input/Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
data = df[['Text']]

In [5]:
# Making all the words to lower case:

data["Text"] = [re.sub("[^a-z' ]", "", i.lower()) for i in data["Text"]]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Text"] = [re.sub("[^a-z' ]", "", i.lower()) for i in data["Text"]]


Unnamed: 0,Text
0,i have bought several of the vitality canned d...
1,product arrived labeled as jumbo salted peanut...
2,this is a confection that has been around a fe...
3,if you are looking for the secret ingredient i...
4,great taffy at a great price there was a wide...
...,...
568449,great for sesame chickenthis is a good if not ...
568450,i'm disappointed with the flavor the chocolate...
568451,these stars are small so you can give of thos...
568452,these are the best treats for training and rew...


In [6]:
# Printing a sample:

data["Text"][0]

'i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than  most'

## Creating the Sequence

In [7]:
# Function to create a sequence of length 10 Tokens:
def create_seq(text, seq_len = 10):
    
    sequences = []
    
    #if the number of tokens in text is greater than 5
    if len(text.split()) > seq_len:
        for i in range(seq_len, len(text.split())):
            # Select sequence of tokens
            seq = text.split()[i-seq_len:i+1]
            #add to the list
            sequences.append(" ".join(seq))
        return sequences
    else:
        return[text]
        

In [8]:
sentence ="i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meatand it smells better my labrador is finicky and she appreciates this product better than most."

In [9]:
create_seq(sentence)

['i have bought several of the vitality canned dog food products',
 'have bought several of the vitality canned dog food products and',
 'bought several of the vitality canned dog food products and have',
 'several of the vitality canned dog food products and have found',
 'of the vitality canned dog food products and have found them',
 'the vitality canned dog food products and have found them all',
 'vitality canned dog food products and have found them all to',
 'canned dog food products and have found them all to be',
 'dog food products and have found them all to be of',
 'food products and have found them all to be of good',
 'products and have found them all to be of good quality',
 'and have found them all to be of good quality the',
 'have found them all to be of good quality the product',
 'found them all to be of good quality the product looks',
 'them all to be of good quality the product looks more',
 'all to be of good quality the product looks more like',
 'to be of good

In [10]:
# Creating a list of text:

seq = []
text = data["Text"].values
for i in range(10000):
    seqi = create_seq(text[i])
    seq.extend([s for s in seqi if len(s.split(" ")) == 11])
    

In [11]:
len(seq)

652591

In [12]:
for i in range(652581,652591):
    print(seq[i])

one of the more expensive places target has the best price
of the more expensive places target has the best price so
the more expensive places target has the best price so for
more expensive places target has the best price so for now
expensive places target has the best price so for now it
places target has the best price so for now it works
target has the best price so for now it works and
has the best price so for now it works and i
the best price so for now it works and i recommend
best price so for now it works and i recommend it


In [13]:
# create inputs and targets (x and y)
x = []
y = []

for s in seq:
      if len(s.split()) == 11:
        x.append(" ".join(s.split()[:-1]))
        y.append(" ".join(s.split()[1:]))

In [14]:
# Printing Last 5 Texts of  x:

for i in range(652581,652591):
    print(x[i])

one of the more expensive places target has the best
of the more expensive places target has the best price
the more expensive places target has the best price so
more expensive places target has the best price so for
expensive places target has the best price so for now
places target has the best price so for now it
target has the best price so for now it works
has the best price so for now it works and
the best price so for now it works and i
best price so for now it works and i recommend


In [15]:
#Printing Last 5 Texts of y:

for i in range(652581,652591):
    print(y[i])

of the more expensive places target has the best price
the more expensive places target has the best price so
more expensive places target has the best price so for
expensive places target has the best price so for now
places target has the best price so for now it
target has the best price so for now it works
has the best price so for now it works and
the best price so for now it works and i
best price so for now it works and i recommend
price so for now it works and i recommend it


In [16]:
# create integer-to-token mapping
int2token = {}
cnt = 0

for w in set(" ".join(seq).split()):
    int2token[cnt] = w
    cnt+= 1

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

In [17]:
#Creating 2 dictionary that maps token

print(token2int["the"]) # Token-to-Integer

print(int2token[7171])  # Integer-to-Token

9669
mush


## Saving the Dictionary as Json File to s3

In [18]:
# import json 
# dict1 = token2int
# dict2 = int2token
# s3 = boto3.resource('s3') 
# obj1 = s3.Object('textgenerationbucket','inputs/token2int.json')
# obj = s3.Object('textgenerationbucket','inputs/int2token.json') 
# obj1.put(Body=json.dumps(dict1))
# obj.put(Body=json.dumps(dict2))

In [19]:
# set vocabulary size
vocab_size = len(int2token)
vocab_size

24301

In [22]:
def get_integer_seq(word):
    temp = []
    for w in word.split():
        temp.append(token2int[w])

    return temp

In [23]:
# converting text sequences to integer sequences:

x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

In [24]:
np.array(x_int).shape

(652591, 10)

## Saving the processed Input to S3

In [25]:
upload_dir = 'inputs/'
if not os.path.exists(upload_dir): # Make sure that the folder exists
    os.makedirs(upload_dir)

np.save(os.path.join(upload_dir, 'y_int.npy'), y_int)
np.save(os.path.join(upload_dir, 'x_int.npy'), x_int)

In [26]:
# convert lists to numpy arrays
x_int = torch.tensor(np.array(x_int))
y_int = torch.tensor(np.array(y_int))

In [27]:
x_int[0]

tensor([10074, 17397, 24262,  3372,  1252,  9669,  1463,  1988,  5382, 15097],
       dtype=torch.int32)

## Defining the Model 

In [32]:
class WordLSTM(nn.Module):
    
    def __init__(self, n_hidden=128, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, 100)

        ## define the LSTM
        self.lstm = nn.LSTM(100, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

# Making the Model Use GPU

In [34]:
net = WordLSTM()

# push the model to GPU (avoid it if you are not using the GPU)
# net.cuda()

print(net)

WordLSTM(
  (emb_layer): Embedding(24301, 100)
  (lstm): LSTM(100, 128, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=24301, bias=True)
)


##  Function to Training the Model

In [44]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to GPU
    # net.cuda()

    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # convert numpy arrays to PyTorch arrays
            # inputs, targets = torch.tensor(x, dtype=torch.float), torch.tensor(y, dtype=torch.float)
            inputs, targets = x, y
            
            # push tensors to GPU
            # inputs, targets = inputs.cuda(), targets.cuda()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter))

In [45]:
def get_batches(arr_x, arr_y, batch_size):
         
    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
      # print(arr_x)
      x = arr_x[prv:n]
      y = arr_y[prv:n]
      prv = n
      yield x, y

## Training the Model

In [None]:
train(net, batch_size = 100, epochs=20, print_every=512)

## Function to Make Prediction

In [None]:
# predict next token
def predict(net, tkn, h=None):
         
  # tensor inputs
  x = np.array([[token2int[tkn]]])
  inputs = torch.from_numpy(x)
  
  # push to GPU
  inputs = inputs.cuda()

  # detach hidden state from history
  h = tuple([each.data for each in h])

  # get the output of the model
  print(inputs, h)
  out, h = net(inputs)

  # get the token probabilities
  p = F.softmax(out, dim=1).data

  p = p.cpu()

  p = p.numpy()
  p = p.reshape(p.shape[1],)

  # get indices of top 3 values
  top_n_idx = p.argsort()[-3:][::-1]

  # randomly select one of the three indices
  sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]

  # return the encoded value of the predicted char and the hidden state
  return int2token[sampled_token_index], h


# function to generate text
def sample(net, size, prime='it is'):
        
    # push to GPU
    # net.cuda()
    
    net.eval()

    # batch size is 1
    h = net.init_hidden(1)
    
    toks = prime.split()

    # predict next token
    for t in prime.split():
      token, h = predict(net, t, h)
    
    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

## Making the Model Predict New Data

In [None]:
sample(net, 5, prime = "amazing product")