In [None]:
# Colab-only setup: mount the user's Google Drive at /content/drive so the
# files stored there become reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change the working directory to the assignment folder; later cells open
# their files ("glove.6B.200d.txt", 'Test_Dataset.csv') by relative name.
cd drive/MyDrive/dlnlp/assign2

/content/drive/MyDrive/dlnlp/assign2


In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.utils.data as data_utils

import string
import numpy as np
import pandas as pd
import re

In [None]:
# Fixed number of tokens kept per review: to_lo_vecs pads shorter reviews and
# truncates longer ones to exactly this length.
max_words = 300

def remove_tags(x):
    """Return ``x`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own rather than
    everything between the first ``<`` and the last ``>``. ``.`` does not
    match a newline here, so a tag split across lines is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', x)

def to_lo_vecs(sentence, max_len=None, vec_map=None):
  """Convert a sentence into a fixed-length list of word vectors.

  The sentence is split on whitespace, truncated or padded to ``max_len``
  tokens, and every token is replaced by its vector from ``vec_map``.
  Tokens missing from ``vec_map`` are replaced by ``vec_map['unk']``.

  Args:
    sentence: Text whose tokens are separated by whitespace.
    max_len: Number of vectors to return. Defaults to the module-level
      ``max_words``.
    vec_map: Mapping from token to vector; it needs an ``'unk'`` entry
      whenever an unknown token or padding occurs. Defaults to the
      module-level ``word_to_vec_map``.

  Returns:
    A list of exactly ``max_len`` vectors taken from ``vec_map``.

  Note:
    Padding uses the integer ``0`` as a placeholder token. Because it is not
    a string key of the embedding table, padded positions end up with the
    ``'unk'`` vector (not an all-zero vector). This is kept as-is because a
    model trained on these features would see different inputs otherwise.
  """
  # Resolve the defaults lazily so callers that pass both arguments do not
  # depend on notebook-level globals at all.
  if max_len is None:
    max_len = max_words
  if vec_map is None:
    vec_map = word_to_vec_map

  lo_words = sentence.strip().split()[:max_len]
  lo_words = lo_words + [0] * (max_len - len(lo_words))

  # 'unk' is only looked up when actually needed, exactly like the original
  # conditional expression.
  return [vec_map[x] if x in vec_map else vec_map['unk'] for x in lo_words]


In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# ---- Model hyper-parameters ----
embed_len = 200                  # width of one word vector
hidden_dim = 100                 # GRU hidden size
n_layers = 1                     # number of stacked GRU layers
fc_dim = 100                     # width of the first fully connected layer
fc_dim2 = 50                     # width of the second fully connected layer
in_channels = 1                  # the embedded sentence is treated as a 1-channel image
out_channels = 200               # number of convolution filters (= GRU input size)
kernel_size = (3, embed_len)     # each filter spans 3 consecutive tokens and the full vector width
kernel_size_pool = (10, 1)       # max-pool window along the token axis
dropout = 0.5


class Net(nn.Module):
    """Conv2d -> max-pool -> GRU -> 3-layer MLP -> sigmoid binary classifier.

    The network consumes sentences that are already embedded, i.e. a float
    tensor of shape ``(batch, tokens, embed_len)``, and returns one
    probability per sentence with shape ``(batch, 1)`` (suitable for
    ``nn.BCELoss``).
    """

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0)
        self.maxp = nn.MaxPool2d(kernel_size_pool, stride=2)
        # nn.GRU applies dropout only *between* stacked layers; with a single
        # layer the argument does nothing except raise a warning, so it is
        # passed only when there is more than one layer.
        gru_dropout = dropout if n_layers > 1 else 0.0
        self.gru = nn.GRU(input_size=out_channels, hidden_size=hidden_dim,
                          num_layers=n_layers, batch_first=True, dropout=gru_dropout)
        self.linear = nn.Linear(hidden_dim, fc_dim)
        self.linear1 = nn.Linear(fc_dim, fc_dim2)
        self.linear2 = nn.Linear(fc_dim2, 1)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, X_batch):
        """Return class-1 probabilities for a batch of embedded sentences.

        Args:
            X_batch: Float tensor of shape ``(batch, tokens, embed_len)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        features = X_batch.unsqueeze(1)                  # (B, 1, T, E): add the channel axis
        features = self.dropout(self.conv(features))     # (B, C, T-2, 1)
        features = self.maxp(features)                   # (B, C, T', 1)
        features = features.squeeze(dim=3)               # (B, C, T')
        features = features.swapaxes(1, 2)               # (B, T', C): sequence-first for the GRU

        # No explicit initial state: nn.GRU then starts from zeros on the same
        # device/dtype as its input. The previous random initial state made
        # predictions non-deterministic, even in eval mode, and tied the
        # module to a global `device`.
        output, _ = self.gru(features)

        y1 = torch.relu(self.dropout(self.linear(output[:, -1])))   # last time step only
        y1 = torch.relu(self.linear1(y1))
        y2 = self.linear2(y1)
        return self.sigmoid(y2)



In [None]:
# Build the network and move its parameters to the device chosen earlier.
model = Net().to(device)

  "num_layers={}".format(dropout, num_layers))


In [None]:
# Bare expression: the notebook renders the module's repr (its layer summary).
model

Net(
  (conv): Conv2d(1, 200, kernel_size=(3, 200), stride=(1, 1))
  (maxp): MaxPool2d(kernel_size=(10, 1), stride=2, padding=0, dilation=1, ceil_mode=False)
  (gru): GRU(200, 100, batch_first=True, dropout=0.5)
  (linear): Linear(in_features=100, out_features=100, bias=True)
  (linear1): Linear(in_features=100, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sigmoid): Sigmoid()
)

In [None]:
def validation_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and loss.

    The model is switched to eval mode for the duration of the call (so
    dropout is disabled while measuring) and its previous train/eval mode is
    restored afterwards. Gradients are not tracked.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing ``.dataset``.
        model: Module mapping ``X`` to probabilities shaped like ``y``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.

    Returns:
        The loss averaged over batches, as a Python float.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    validation_loss, correct, incorrect = 0, 0, 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y in dataloader:
                pred = model(X)
                validation_loss += loss_fn(pred, y).item()
                # A prediction counts as class 1 when its probability is >= 0.5.
                predicted_positive = pred >= 0.5
                correct += (predicted_positive == y).type(torch.float).sum().item()
                incorrect += (predicted_positive != y).type(torch.float).sum().item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    validation_loss /= num_batches
    correct /= size
    incorrect /= size
    print(f"validation Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {validation_loss:>8f} \n")
    print(f"Incorrect percentage : \n Accuracy: {(100*incorrect):>0.1f}%\n")
    return validation_loss

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass (one epoch) over ``dataloader``.

    The model is put into train mode first so that dropout is active even if
    an earlier evaluation left it in eval mode. Progress is printed every
    100 batches.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing ``.dataset``.
        model: Module mapping ``X`` to predictions shaped like ``y``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Separate names: keep the tensor `loss` distinct from its float value.
            loss_value, current = loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [None]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    validation_loss, correct, incorrect = 0, 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            validation_loss += loss_fn(pred, y).item()
            correct += ((pred>=0.5) == y).type(torch.float).sum().item()
            incorrect += ((pred>=0.5) != y).type(torch.float).sum().item()

    validation_loss /= num_batches
    correct /= size
    incorrect /= size
    print(f"Test Stats: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {validation_loss:>8f} \n")
    print(f"Incorrect percentage : \n Accuracy: {(100*incorrect):>0.1f}%\n")

In [None]:
def getTrainLoseAndError(dataloader, model, loss_fn):
    """Measure and print loss and accuracy of ``model`` on the training data.

    No parameters are updated. The model is switched to eval mode for the
    duration of the call (so dropout is disabled while measuring) and its
    previous train/eval mode is restored afterwards. Nothing is returned.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing ``.dataset``.
        model: Module mapping ``X`` to probabilities shaped like ``y``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss, correct, incorrect = 0, 0, 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y in dataloader:
                pred = model(X)
                train_loss += loss_fn(pred, y).item()
                # A prediction counts as class 1 when its probability is >= 0.5.
                predicted_positive = pred >= 0.5
                correct += (predicted_positive == y).type(torch.float).sum().item()
                incorrect += (predicted_positive != y).type(torch.float).sum().item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    train_loss /= num_batches
    correct /= size
    incorrect /= size
    print(f"Train Error : \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
    print(f"Incorrect percentage : \n Accuracy: {(100*incorrect):>0.1f}%\n")

In [None]:
# Training driver: optimise `model` for `epochs` passes and keep the checkpoint
# with the lowest validation loss.
# NOTE(review): `train_loader` and `validation_loader` are not defined in the
# visible part of this notebook; they must already exist in the kernel.
# NOTE(review): `batch_size` is not referenced again in this cell; presumably
# it is meant for the loaders' construction -- confirm.
learning_rate = 0.003
batch_size = 128
epochs = 20
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Baseline metrics of the untrained model.
print("In the begining :")
getTrainLoseAndError(train_loader, model, loss_fn)
validation_loop(validation_loader, model, loss_fn)
print("------------------")
# Best validation loss seen so far; 100 acts as the "nothing saved yet" sentinel.
vLossMin = 100
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer)
    vLoss = validation_loop(validation_loader, model, loss_fn)
    # Save only when the validation loss improves on the best value so far.
    if(vLossMin>vLoss):
      # torch.save on the module itself serialises the whole object, not just
      # its state_dict; the evaluation cell loads this same file.
      torch.save(model, "a2_lstm_fc2_do_gru.p")
      vLossMin = vLoss

print("Done!")

# Metrics of the model as it stands after the last epoch (not necessarily the
# saved best checkpoint).
print("In the End :")
getTrainLoseAndError(train_loader, model, loss_fn)
validation_loop(validation_loader, model, loss_fn)
print("------------------")

# learning_rate = 1e-3
# batch_size = 64
# epochs = 100
# loss_fn = nn.BCELoss()
# optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate) #try adagrad too

# for t in range(epochs):
#     print(f"Epoch {t+1}\n-------------------------------")
#     train_loop(train_loader, model, loss_fn, optimizer)
#     test_loop(test_loader, model, loss_fn)
# print("Done!")

In [None]:
# model = torch.load("a2_lstm_fc2_do_gru.p").to(device)
# getTrainLoseAndError(train_loader, model, loss_fn)
# validation_loop(validation_loader, model, loss_fn)

In [None]:
def read_glove_vector(glove_vec):
  """Load a GloVe-style text file into a ``{word: vector}`` dictionary.

  Each non-blank line is expected to hold a token followed by its
  whitespace-separated components. Blank lines (for example a trailing empty
  line at the end of the file) are skipped instead of raising ``IndexError``.

  Args:
    glove_vec: Path of the UTF-8 encoded embedding file.

  Returns:
    dict mapping each word to a 1-D ``numpy.float64`` array. If a word occurs
    more than once, the last occurrence wins.
  """
  word_to_vec_map = {}
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    for line in f:
      w_line = line.split()
      if not w_line:
        # Nothing on this line: no token to index.
        continue
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

  return word_to_vec_map

# Load the GloVe table (the file name suggests 200-dimensional vectors) into
# the global `word_to_vec_map`, which to_lo_vecs reads by name.
gloveFile = "glove.6B.200d.txt"
word_to_vec_map = read_glove_vector(gloveFile)

In [None]:
# max_words = 300

# def remove_tags(x):
#     result = re.sub('<.*?>','',x)
#     return result

# def to_lo_vecs(sentence):
#   lo_words =  sentence.strip().split()
#   if len(lo_words)<max_words:
#     lo_words = lo_words + [0]* (max_words-len(lo_words))
#   else:
#     lo_words = lo_words[:max_words]
#   lo_vecs_glove = [word_to_vec_map['unk'] if x not in word_to_vec_map.keys() else word_to_vec_map[x] for x in lo_words]
#   # lo_vecs_w2v = [w2vmodel['unk'] if x not in w2vmodel.wv.vocab else w2vmodel[x] for x in lo_words]
#   # print(lo_vecs_glove.shape)
#   # lo_vecs = np.append([lo_vecs_glove],[lo_vecs_w2v], axis = 1)
#   return lo_vecs_glove #lo_vecs

In [None]:
# CSV file read by the test-set preparation cell (relative to the working directory).
filenameTest = 'Test_Dataset.csv'

In [None]:
# Build `test_loader`: read the test CSV, clean and embed every review, and
# pack the result with 1/0 labels (positive/negative) into a DataLoader.
df_test = pd.read_csv(filenameTest)
print(len(df_test))

# Preprocessing: lower-case, strip HTML tags, then keep only runs of word
# characters joined by single spaces (this drops punctuation).
df_test["review"] = (
    df_test["review"]
    .str.lower()
    .apply(remove_tags)
    .apply(lambda text: " ".join(re.findall(r'\w+', text)))
)
### check if you want to remove stopwords
# Replace each review by its fixed-length list of word vectors.
df_test["review"] = df_test["review"].apply(to_lo_vecs)

# Split by label. Rows whose sentiment is neither 'positive' nor 'negative'
# match neither mask and are left out of the test set; report how many.
is_positive = df_test["sentiment"] == 'positive'
is_negative = df_test["sentiment"] == 'negative'
n_unrecognised = len(df_test) - int(is_positive.sum()) - int(is_negative.sum())
if n_unrecognised:
    print(f"Ignoring {n_unrecognised} rows with an unrecognised sentiment label")

# Non-in-place drop: returns fresh frames and avoids pandas'
# SettingWithCopyWarning that in-place drops on filtered slices trigger.
df_testp = df_test[is_positive].drop('sentiment', axis=1)
df_testn = df_test[is_negative].drop('sentiment', axis=1)
lenPosT = len(df_testp)
print(lenPosT)
lenNegT = len(df_testn)
print(lenNegT)

# Labels follow the same order as the features below: positives, then negatives.
shape1 = (lenPosT,1)
y1t = torch.ones(shape1)
shape0 = (lenNegT,1)
y0t = torch.zeros(shape0)
yt = torch.cat((y1t,y0t)).to(device)
print(yt.size())

vecDataset = df_testp['review'].tolist()
vecDatasetN = df_testn['review'].tolist()
vecDataset.extend(vecDatasetN)
# Free the frames early: they hold a full copy of the embedded reviews.
del df_test
del df_testn
del df_testp
# Go through a single float32 NumPy array first; building a tensor directly
# from a nested list of ndarrays is extremely slow.
x_data_test = torch.from_numpy(np.asarray(vecDataset, dtype=np.float32)).to(device)
testDataset = data_utils.TensorDataset(x_data_test, yt)
# Evaluation does not need shuffling; a fixed order keeps the reported
# per-batch average loss reproducible.
test_loader = data_utils.DataLoader(testDataset, batch_size=64, shuffle=False)

10000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


5110
4878
torch.Size([9988, 1])




In [None]:
# Evaluate the best checkpoint written by the training cell on the test set.
loss_fn = nn.BCELoss()
# SECURITY: torch.load unpickles a complete Python object and can execute
# arbitrary code -- only load checkpoint files you created yourself.
# NOTE(review): recent PyTorch releases may require `weights_only=False` to
# load a whole pickled module -- confirm against the installed version.
# map_location puts the weights on the device the test tensors live on, even
# if the checkpoint was saved from a different device.
model = torch.load("a2_lstm_fc2_do_gru.p", map_location=device)
# Inference mode: disable dropout so the reported numbers are not perturbed.
model.eval()
test_loop(test_loader, model, loss_fn)

Test Stats: 
 Accuracy: 51.2%, Avg loss: 0.693128 

Incorrect percentage : 
 Accuracy: 48.8%

