In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from torch.utils.data import DataLoader, TensorDataset, Dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [201]:
#Reading csv file
train = pd.read_csv("./train_a2.csv")
test = pd.read_csv("test_a2.csv")
train

Unnamed: 0,utterance,act
0,"Say , Jim , how about going for a few beers af...",3
1,You know that is tempting but is really not g...,4
2,What do you mean ? It will help us to relax .,2
3,Do you really think so ? I don't . It will ju...,2
4,I guess you are right.But what shall we do ? ...,2
...,...,...
5085,Tom and Helen got married at last .,1
5086,How did you know that ? I heart Tom ’ s fathe...,2
5087,I was invited to attend their wedding .,1
5088,It ’ s great . Although his father didn ’ t a...,1


In [202]:
Xtrain = train.utterance
ytrain = train.act - 1
Xtest = test.utterance
ytest = test.act - 1 

In [203]:
def lowerCase(df):
  df= df.apply(lambda x: x.lower())
  return df

Xtrain = lowerCase(Xtrain)
Xtest = lowerCase(Xtest)

In [204]:
Xtrain

0       say , jim , how about going for a few beers af...
1        you know that is tempting but is really not g...
2          what do you mean ? it will help us to relax . 
3        do you really think so ? i don't . it will ju...
4        i guess you are right.but what shall we do ? ...
                              ...                        
5085                 tom and helen got married at last . 
5086     how did you know that ? i heart tom ’ s fathe...
5087             i was invited to attend their wedding . 
5088     it ’ s great . although his father didn ’ t a...
5089                                how moving love is . 
Name: utterance, Length: 5090, dtype: object

In [205]:
import contractions
def contractionExpand(text):
  return contractions.fix(text)

Xtrain = Xtrain.apply(lambda x: contractionExpand(x))

In [206]:
Xtest = Xtest.apply(lambda x: contractionExpand(x))

In [207]:
Xtrain

0       say , jim , how about going for a few beers af...
1        you know that is tempting but is really not g...
2          what do you mean ? it will help us to relax . 
3        do you really think so ? i do not . it will j...
4        i guess you are right.but what shall we do ? ...
                              ...                        
5085                 tom and helen got married at last . 
5086     how did you know that ? i heart tom ’ s fathe...
5087             i was invited to attend their wedding . 
5088     it ’ s great . although his father didn ’ t a...
5089                                how moving love is . 
Name: utterance, Length: 5090, dtype: object

In [208]:
import nltk
nltk.download('punkt')

def tokenize(text):
  tokens = nltk.word_tokenize(text)
  return tokens

Xtrain = Xtrain.apply(lambda x: tokenize(x))

[nltk_data] Downloading package punkt to /home/mann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [209]:
Xtest = Xtest.apply(lambda x: tokenize(x))


In [210]:
max_seq_len = len(max(Xtrain, key=len))


In [211]:
max_seq_len_test = len(max(Xtest, key=len))


In [212]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip
# !ls -lat

In [213]:
from nltk.corpus import stopwords
import string
sw = stopwords.words('english')

In [214]:
def remove_stopwords(data):
    for i in tqdm(range(len(data))):
        data[i] = [word for word in data[i] if word not in sw and word not in string.punctuation]
    return data

In [215]:
Xtrain = remove_stopwords(Xtrain)
Xtest = remove_stopwords(Xtest)

100%|██████████| 5090/5090 [00:00<00:00, 22030.61it/s]
100%|██████████| 722/722 [00:00<00:00, 19928.32it/s]


In [152]:
sbert_model = SentenceTransformer('all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [216]:
Xtrain = [[' '.join(i)] for i in Xtrain]

In [217]:
Xtrain = [i[0] for i in Xtrain]

In [218]:
Xtest = [[' '.join(i)] for i in Xtest]

In [219]:
Xtest = [i[0] for i in Xtest]

In [200]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/nli-bert-base')

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [228]:
Xtrain2 = []
ytrain2 = []
Xtest2 = []
ytest2 = []
for i in tqdm(range(2, len(Xtrain))):
    newStr = ""
#     Xtrain2.append(Xtrain[i] + tokenizer.sep_token + Xtrain[i-4] + ' '+Xtrain[i-3] + ' '+Xtrain[i-2] + ' '+Xtrain[i-1] + ' ' + Xtrain[i])
    Xtrain2.append(Xtrain[i-2] + ' '+Xtrain[i-1] + ' ' + Xtrain[i])
    ytrain2.append(ytrain[i])
# for i in tqdm(range(1, len(Xtest))):
#     Xtest2.append(Xtest[i-1] + ' ' + Xtest[i])
#     ytest2.append(ytest[i])

100%|██████████| 5088/5088 [00:00<00:00, 167665.39it/s]


In [240]:
Xtrain

['say jim going beers dinner',
 'know tempting really good fitness',
 'mean help us relax',
 'really think make us fat act silly remember last time',
 'guess right.but shall feel like sitting home',
 'suggest walk gym play singsong meet friends',
 'good idea hear mary sally often go play pingpong.perhaps make foursome',
 'sounds great willing could ask go dancing us.that excellent exercise fun',
 'good.let go',
 'right',
 'push-ups',
 'course piece cake believe 30 push-ups minute',
 'really think impossible',
 'mean 30 push-ups',
 'yeah',
 'easy exercise everyday make',
 'study radio',
 'listen background music',
 'difference',
 'radio many comerials',
 'true buy record player',
 'right',
 'right soon terrified watched fall wire',
 'worry.he acrobat 。',
 'see',
 'hey john nice skates new',
 'yeah got started playing ice hockey community league finally got new skates',
 'position play',
 '’ defender ’ lot fun ’ able skate fast defense',
 'yeah ’ pretty big guy play goalie',
 'oh yeah te

In [221]:
len(Xtrain)

5090

In [229]:
len(Xtrain2)

5088

In [230]:
train_sentence_embeddings = sbert_model.encode(Xtrain2)

In [231]:
test_sentence_embeddings = sbert_model.encode(Xtest)

In [232]:
train_sentence_embeddings.shape, test_sentence_embeddings.shape

((5088, 768), (722, 768))

In [233]:
train_sentence_embeddings.shape, len(ytrain2)

((5088, 768), 5088)

In [241]:
import torchtext.vocab

glove = torchtext.vocab.GloVe(name = '6B', dim = 100)

100%|█████████▉| 399999/400000 [00:20<00:00, 19403.23it/s]


In [244]:
glove.vectors[glove.stoi['the']]

tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  0.8278,  0.27

In [251]:
Xtrain2

['say jim going beers dinner know tempting really good fitness mean help us relax',
 'know tempting really good fitness mean help us relax really think make us fat act silly remember last time',
 'mean help us relax really think make us fat act silly remember last time guess right.but shall feel like sitting home',
 'really think make us fat act silly remember last time guess right.but shall feel like sitting home suggest walk gym play singsong meet friends',
 'guess right.but shall feel like sitting home suggest walk gym play singsong meet friends good idea hear mary sally often go play pingpong.perhaps make foursome',
 'suggest walk gym play singsong meet friends good idea hear mary sally often go play pingpong.perhaps make foursome sounds great willing could ask go dancing us.that excellent exercise fun',
 'good idea hear mary sally often go play pingpong.perhaps make foursome sounds great willing could ask go dancing us.that excellent exercise fun good.let go',
 'sounds great willi

In [253]:
trainData = []
testData = []
for i in tqdm(range(len(train_sentence_embeddings))):
    trainData.append((Xtrain2[i],ytrain2[i]))
for i in tqdm(range(len(test_sentence_embeddings))):
    testData.append((Xtest[i],ytest[i]))

100%|██████████| 5088/5088 [00:00<00:00, 586441.85it/s]
100%|██████████| 722/722 [00:00<00:00, 63801.78it/s]


In [254]:
train_loader = torch.utils.data.DataLoader(dataset=trainData, 
                                           batch_size=32, 
                                           shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=testData, 
                                          batch_size=32, 
                                          shuffle=False)

In [246]:
para_encoder = SentenceTransformer('sentence-transformers/nli-bert-base', device='cuda')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [255]:
class LSTMClass(nn.Module):
    def __init__(self,para_encoder, hidden_size,num_layers,drop_prob):
        super().__init__()
        self.para_encoder = para_encoder
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size=768,hidden_size=self.hidden_size,num_layers=self.num_layers,batch_first=True,bidirectional=False)
        self.dropout = nn.Dropout(drop_prob)
        self.linear = nn.Linear(hidden_size, 4)
        # self.softmax = nn.Softmax()
        

    def forward(self, x):
      # print("emb",embed_out.shape)
      # print("Senemb",sentence_embed_out.shape)
        para_encoding = self.para_encoder.encode(x, show_progress_bar=False, convert_to_tensor=True) 
        para_encoding = para_encoding.unsqueeze(1)
        hnot = torch.zeros(self.num_layers,x.size(0),self.hidden_size).to(device)
        cnot = torch.zeros(self.num_layers,x.size(0),self.hidden_size).to(device)
        out,_ = self.lstm(x,(hnot,cnot))
        out = out[:,-1,:]
        # print("lstm",out.shape)
        # out = self.dropout(out)
        out = self.linear(out)
        # print("lin",out.shape)
        # out = self.softmax(out)
        return out

In [256]:
lstm = LSTMClass(para_encoder,128,2,0.2).to(device)
criterion = nn.CrossEntropyLoss()
# criterion = nn.L1Loss()
optimizer = optim.Adam(lstm.parameters(), lr=0.0001)

In [258]:

numEpochs = 20
for epoch in range(numEpochs):  # loop over the dataset multiple times

    running_loss = 0.0
    print("epoch:", epoch)
    for i, data in tqdm(enumerate(train_loader)):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        print(inputs)
        inputs = inputs.to(device)
        labels = labels.to(device)

        inputs = torch.unsqueeze(inputs, 1)
#         print(inputs.shape, labels.shape)
        # print("l",labels.shape)
        # print(labels)
#         inputs = torch.squeeze(inputs,0)
        # print(inputs.shape)
        # print(inputs)
        
        outputs = lstm(inputs)
        
          
        # outputs = torch.squeeze(outputs)
        # outputs = outputs.reshape((-1, 1))
        # print(outputs.shape)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    
#         if (i+1) % 10 == 0:
#         print(f'Epoch {epoch + 1} / {numEpochs}, Step {i+1} / {totalSteps}, Loss: {loss.item():.4f}')
            # print(outputs)

print('Finished Training')

0it [00:00, ?it/s]

epoch: 0
('say jim going beers dinner know tempting really good fitness mean help us relax', 'know tempting really good fitness mean help us relax really think make us fat act silly remember last time', 'mean help us relax really think make us fat act silly remember last time guess right.but shall feel like sitting home', 'really think make us fat act silly remember last time guess right.but shall feel like sitting home suggest walk gym play singsong meet friends', 'guess right.but shall feel like sitting home suggest walk gym play singsong meet friends good idea hear mary sally often go play pingpong.perhaps make foursome', 'suggest walk gym play singsong meet friends good idea hear mary sally often go play pingpong.perhaps make foursome sounds great willing could ask go dancing us.that excellent exercise fun', 'good idea hear mary sally often go play pingpong.perhaps make foursome sounds great willing could ask go dancing us.that excellent exercise fun good.let go', 'sounds great wil




AttributeError: 'tuple' object has no attribute 'to'

In [239]:
with torch.no_grad():
    nCorrect = 0
    nSamples = 0
    nClassCorrect = [0 for i in range(4)]
    nClassTotal = [0 for i in range(4)]
    
        
    for i,data in tqdm(enumerate(test_loader)):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        inputs = torch.unsqueeze(inputs, 1)
        # print("l",labels.shape)
        # print(labels)
        labels = labels.to(device)
        # print("l",labels.shape)
        # print(labels)
#         inputs = torch.squeeze(inputs, 0)
        outputs = lstm(inputs)

        # outputs = outputs.reshape((-1, 1))
        # print(outputs.shape)
        _, predictions = torch.max(outputs,1)
#         print(predictions)
#         print("l",labels)
        nSamples += labels.shape[0]
        # nCorrect += (predictions == labels).sum().item()
        for i in range(len(predictions)):
            if predictions[i] == labels[i]:
                nCorrect += 1

    
    acc = 100.0 * nCorrect / nSamples
    print(f'Accuracy on test: {acc:.2f}%')

23it [00:00, 681.00it/s]

Accuracy on test: 50.83%



