In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from torch.utils.data import DataLoader, TensorDataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from tqdm import tqdm

In [4]:
#Reading csv file
train = pd.read_csv("/content/drive/MyDrive/DL_DialogDataset/train.csv")
test = pd.read_csv("/content/drive/MyDrive/DL_DialogDataset/test.csv")
train

Unnamed: 0,utterance,act
0,"Say , Jim , how about going for a few beers af...",3
1,You know that is tempting but is really not g...,4
2,What do you mean ? It will help us to relax .,2
3,Do you really think so ? I don't . It will ju...,2
4,I guess you are right.But what shall we do ? ...,2
...,...,...
5085,Tom and Helen got married at last .,1
5086,How did you know that ? I heart Tom ’ s fathe...,2
5087,I was invited to attend their wedding .,1
5088,It ’ s great . Although his father didn ’ t a...,1


In [5]:
Xtrain = train.utterance
ytrain = train.act
Xtest = test.utterance
ytest = test.act
  

## Preprocessing

In [6]:
!pip install contractions 

Collecting contractions
  Downloading contractions-0.1.68-py2.py3-none-any.whl (8.1 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 8.0 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 57.7 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.0 contractions-0.1.68 pyahocorasick-1.4.4 textsearch-0.0.21


In [7]:
def lowerCase(df):
  df= df.apply(lambda x: x.lower())
  return df

Xtrain = lowerCase(Xtrain)
Xtest = lowerCase(Xtest)

In [8]:
import contractions
def contractionExpand(text):
  return contractions.fix(text)

Xtrain = Xtrain.apply(lambda x: contractionExpand(x))

In [9]:
Xtest = Xtest.apply(lambda x: contractionExpand(x))

In [10]:
import nltk
nltk.download('punkt')

def tokenize(text):
  tokens = nltk.word_tokenize(text)
  return tokens

Xtrain = Xtrain.apply(lambda x: tokenize(x))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
Xtest = Xtest.apply(lambda x: tokenize(x))

In [12]:
Xtrain

0       [say, ,, jim, ,, how, about, going, for, a, fe...
1       [you, know, that, is, tempting, but, is, reall...
2       [what, do, you, mean, ?, it, will, help, us, t...
3       [do, you, really, think, so, ?, i, do, not, .,...
4       [i, guess, you, are, right.but, what, shall, w...
                              ...                        
5085         [tom, and, helen, got, married, at, last, .]
5086    [how, did, you, know, that, ?, i, heart, tom, ...
5087     [i, was, invited, to, attend, their, wedding, .]
5088    [it, ’, s, great, ., although, his, father, di...
5089                           [how, moving, love, is, .]
Name: utterance, Length: 5090, dtype: object

In [13]:
Xtest

0      [i, am, better, now, ., want, to, play, again, ?]
1      [i, will, let, you, break, the, balls, this, t...
2      [let, us, get, all, the, balls, out, of, the, ...
3      [ok, ., how, much, do, you, want, to, bet, on,...
4      [you, are, crazy, ., gambling, is, against, my...
                             ...                        
717    [yeah, ,, i, got, to, eat, as, much, pizza, as...
718                       [did, you, like, the, play, ?]
719    [not, really, ., it, is, a, dull, one, ,, and,...
720    [you, are, absolutely, right, ., the, acting, ...
721    [to, be, fair, ,, though, ,, both, the, costum...
Name: utterance, Length: 722, dtype: object

In [14]:
max_seq_len = len(max(Xtrain, key=len))

In [15]:
max_seq_len_test = len(max(Xtest, key=len))

### Sentence Embedding for each utterance: Using Glove and then pooling

In [16]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!ls -lat

--2022-03-28 18:45:59--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-03-28 18:45:59--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-03-28 18:45:59--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [17]:
vocab,embeddings = [],[]
with open('glove.6B.50d.txt','rt') as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)

In [18]:
vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

In [19]:
#insert '<pad>' and '<unk>' tokens at start of vocab_npa.
vocab_npa = np.insert(vocab_npa, 0, '<pad>')
vocab_npa = np.insert(vocab_npa, 1, '<unk>')
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1,embs_npa.shape[1]))   #embedding for '<pad>' token.
unk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)    #embedding for '<unk>' token.

#insert embeddings for pad and unk tokens at top of embs_npa.
embs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))
print(embs_npa.shape)

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']
(400002, 50)


In [20]:
my_embedding_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float())

assert my_embedding_layer.weight.shape == embs_npa.shape
print(my_embedding_layer.weight.shape)

torch.Size([400002, 50])


In [21]:
def convert_text_to_id(df,vocab,unk_token,pad_to_len,pad_token):
  word2idx = {term:idx for idx,term in enumerate(vocab)}
  idx2word = {idx:word for word,idx in word2idx.items()}

  for i in range(len(df)):
    deficit = pad_to_len - len(df[i])
    df[i].extend([pad_token]*deficit)
    for j in range(len(df[i])):
      if df[i][j] not in word2idx:
        df[i][j] = word2idx[unk_token]
      else:
        df[i][j] = word2idx[df[i][j]]
  return df


Xtrain_id = convert_text_to_id(Xtrain,vocab_npa,'<unk>',max_seq_len,'<pad>')


In [22]:
Xtest_id = convert_text_to_id(Xtest,vocab_npa,'<unk>',max_seq_len_test,'<pad>')

In [23]:
len(Xtrain_id[0])

102

In [24]:
Xtrain_id

0       [205, 3, 2017, 3, 199, 61, 224, 12, 9, 308, 13...
1       [83, 348, 14, 16, 21655, 36, 16, 590, 38, 221,...
2       [104, 90, 83, 1704, 190, 22, 45, 277, 97, 6, 1...
3       [90, 83, 590, 271, 102, 190, 43, 90, 38, 4, 22...
4       [43, 5022, 83, 34, 1, 104, 5286, 55, 90, 190, ...
                              ...                        
5085    [1616, 7, 7216, 407, 1169, 24, 78, 4, 0, 0, 0,...
5086    [199, 121, 83, 348, 14, 190, 43, 1060, 1616, 3...
5087    [43, 17, 2862, 6, 2057, 46, 4320, 4, 0, 0, 0, ...
5088    [22, 3073, 1536, 355, 4, 378, 28, 631, 73332, ...
5089    [199, 1235, 837, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0...
Name: utterance, Length: 5090, dtype: object

In [25]:
#Load dataset
Xtrain_id = np.array(Xtrain_id)
Xtrain_id= np.vstack(Xtrain_id).astype('int')
ytrain = np.array(ytrain)
ytrain = ytrain - 1
train_data = TensorDataset(torch.from_numpy(Xtrain_id), torch.from_numpy(ytrain))


In [26]:
Xtest_id = np.array(Xtest_id)
Xtest_id= np.vstack(Xtest_id).astype('int')
ytest = np.array(ytest)
ytest = ytest - 1
test_data = TensorDataset(torch.from_numpy(Xtest_id), torch.from_numpy(ytest))

In [27]:
ytrain

array([2, 3, 1, ..., 0, 0, 0])

In [28]:
trainDataLoader = DataLoader(train_data,batch_size=32)

In [29]:
testDataLoader = DataLoader(test_data, batch_size=32)

In [30]:
class LSTMClass(nn.Module):
    def __init__(self,embs_nps,hidden_size,num_layers,drop_prob):
        super().__init__()
        self.vocab_size = embs_npa.shape[0]
        self.embedding_dim = embs_npa.shape[1]
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float(),freeze=True)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.pool = nn.AdaptiveAvgPool2d((1,self.embedding_dim))
        self.lstm = nn.LSTM(input_size=self.embedding_dim,hidden_size=self.hidden_size,num_layers=self.num_layers,batch_first=True,bidirectional=False)
        self.dropout = nn.Dropout(drop_prob)
        self.linear = nn.Linear(hidden_size, 4)
        # self.softmax = nn.Softmax()
        

    def forward(self, x):
      embed_out = self.embedding(x)
      # print("emb",embed_out.shape)
      sentence_embed_out = self.pool(embed_out)
      # print("Senemb",sentence_embed_out.shape)
      hnot = torch.zeros(self.num_layers,x.size(0),self.hidden_size).to(device)
      cnot = torch.zeros(self.num_layers,x.size(0),self.hidden_size).to(device)
      out,_ = self.lstm(sentence_embed_out,(hnot,cnot))
      out = out[:,-1,:]
      # print("lstm",out.shape)
      # out = self.dropout(out)
      out = self.linear(out)
      # print("lin",out.shape)
      # out = self.softmax(out)
      return out

In [31]:
lstm = LSTMClass(embs_npa,256,1,0.2).to(device)
criterion = nn.CrossEntropyLoss()
# criterion = nn.L1Loss()
optimizer = optim.Adam(lstm.parameters(), lr=0.1)

In [32]:
totalSteps = len(trainDataLoader)
numEpochs = 2
X = 2  # X previous utterances for t th utterance
for epoch in range(numEpochs):  # loop over the dataset multiple times

    running_loss = 0.0
    
    for i, data in tqdm(enumerate(trainDataLoader)):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        # print("l",labels.shape)
        # print(labels)
        labels = labels.reshape((-1,)).to(device)
        # print("l",labels.shape)
        # print(labels)
        
        outputs = lstm(inputs)
          
        # outputs = torch.squeeze(outputs)
        # outputs = outputs.reshape((-1, 1))
        # print(outputs.shape)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    
        if (i+1) % 10 == 0:
            print(f'Epoch {epoch + 1} / {numEpochs}, Step {i+1} / {totalSteps}, Loss: {loss.item():.4f}')
            # print(outputs)

print('Finished Training')

27it [00:00, 91.90it/s]

Epoch 1 / 2, Step 10 / 160, Loss: 1.2260
Epoch 1 / 2, Step 20 / 160, Loss: 0.6718
Epoch 1 / 2, Step 30 / 160, Loss: 0.7292


54it [00:00, 114.44it/s]

Epoch 1 / 2, Step 40 / 160, Loss: 1.1773
Epoch 1 / 2, Step 50 / 160, Loss: 1.2134
Epoch 1 / 2, Step 60 / 160, Loss: 1.0491


95it [00:00, 125.57it/s]

Epoch 1 / 2, Step 70 / 160, Loss: 0.9515
Epoch 1 / 2, Step 80 / 160, Loss: 0.8669
Epoch 1 / 2, Step 90 / 160, Loss: 1.0670


122it [00:01, 127.42it/s]

Epoch 1 / 2, Step 100 / 160, Loss: 0.8975
Epoch 1 / 2, Step 110 / 160, Loss: 0.8577
Epoch 1 / 2, Step 120 / 160, Loss: 0.8935


148it [00:01, 120.66it/s]

Epoch 1 / 2, Step 130 / 160, Loss: 0.7036
Epoch 1 / 2, Step 140 / 160, Loss: 0.6445
Epoch 1 / 2, Step 150 / 160, Loss: 0.9972


160it [00:01, 113.71it/s]


Epoch 1 / 2, Step 160 / 160, Loss: 0.3381


13it [00:00, 126.43it/s]

Epoch 2 / 2, Step 10 / 160, Loss: 0.9576
Epoch 2 / 2, Step 20 / 160, Loss: 0.5656


26it [00:00, 126.40it/s]

Epoch 2 / 2, Step 30 / 160, Loss: 0.6433


53it [00:00, 129.82it/s]

Epoch 2 / 2, Step 40 / 160, Loss: 1.1423
Epoch 2 / 2, Step 50 / 160, Loss: 1.0493


66it [00:00, 126.73it/s]

Epoch 2 / 2, Step 60 / 160, Loss: 0.8986


80it [00:00, 128.28it/s]

Epoch 2 / 2, Step 70 / 160, Loss: 0.6818
Epoch 2 / 2, Step 80 / 160, Loss: 0.8373


94it [00:00, 127.18it/s]

Epoch 2 / 2, Step 90 / 160, Loss: 0.8553


107it [00:00, 124.72it/s]

Epoch 2 / 2, Step 100 / 160, Loss: 0.7660
Epoch 2 / 2, Step 110 / 160, Loss: 0.7828


120it [00:00, 124.91it/s]

Epoch 2 / 2, Step 120 / 160, Loss: 0.7428


133it [00:01, 122.84it/s]

Epoch 2 / 2, Step 130 / 160, Loss: 0.6102
Epoch 2 / 2, Step 140 / 160, Loss: 0.6596


146it [00:01, 123.98it/s]

Epoch 2 / 2, Step 150 / 160, Loss: 0.9388


160it [00:01, 126.37it/s]

Epoch 2 / 2, Step 160 / 160, Loss: 0.2727
Finished Training





In [33]:
with torch.no_grad():
    nCorrect = 0
    nSamples = 0
    nClassCorrect = [0 for i in range(4)]
    nClassTotal = [0 for i in range(4)]
    
        
    for i,data in tqdm(enumerate(testDataLoader)):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        # print("l",labels.shape)
        # print(labels)
        labels = labels.reshape((-1,)).to(device)
        # print("l",labels.shape)
        # print(labels)
    
        outputs = lstm(inputs)
        # outputs = torch.squeeze(outputs)
        # outputs = outputs.reshape((-1, 1))
        # print(outputs.shape)
        _, predictions = torch.max(outputs, 1)
        nSamples += labels.shape[0]
        # nCorrect += (predictions == labels).sum().item()
        for i in range(len(predictions)):
            if predictions[i] == labels[i]:
                nCorrect += 1

    
    acc = 100.0 * nCorrect / nSamples
    print(f'Accuracy on test: {acc:.2f}%')

    

23it [00:00, 478.54it/s]

Accuracy on test: 69.25%





In [34]:
outputs.shape

torch.Size([18, 4])

In [35]:
labels.shape

torch.Size([18])

In [36]:
ytrain.shape

(5090,)