In [None]:
# Colab-specific: mount Google Drive at /content/drive/ so later cells can
# reach the dataset and checkpoint files stored there.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import torch
from torchtext import data
from torchtext.vocab import Vectors
import spacy
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import os
import time
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import torchtext as text

In [None]:
# Work from the assignment folder on the mounted Drive so the relative paths
# used later ('train.csv', 'test.csv', 'saved_weights.pt') resolve there.
%cd /content/drive/My Drive/Classroom/Ass10

/content/drive/My Drive/Classroom/Ass10


In [None]:
# Seed every random number generator the pipeline can touch, not only torch,
# so a Restart & Run All reproduces the same numbers.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

# Ask cuDNN for deterministic kernels and keep its autotuner off, since the
# autotuner may pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Text column: spaCy tokenisation. Batches are laid out batch-first, and
# include_lengths=True makes each batch.text a (token_ids, lengths) pair.
TEXT = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)

# Label column: class names become float targets, as the BCE loss expects.
LABEL = data.LabelField(
    dtype=torch.float,
    batch_first=True,
)

In [None]:
# (attribute name, Field) pairs handed to TabularDataset.
# NOTE(review): torchtext appears to assign these to the CSV columns by
# position, so this order presumably mirrors the column order of
# train.csv/test.csv -- verify against the files.
fields = [('text',TEXT),('label', LABEL)]

In [None]:
#loading custom dataset
# Both CSV files are read the same way: header row skipped, columns mapped
# through `fields`.
_csv_options = dict(format='csv', fields=fields, skip_header=True)
training_data = data.TabularDataset(path='train.csv', **_csv_options)
valid_data = data.TabularDataset(path='test.csv', **_csv_options)

#print preprocessed text
print(vars(training_data.examples[0]))

{'text': ['21', 'Times', 'British', 'People', 'Confused', 'The', 'Hell', 'Out', 'Of', 'Everyone', 'On', 'Twitter'], 'label': 'clickbait'}


In [None]:
import random
# Alias only: the whole of train.csv is used for training (no hold-out split
# is carved out of it here).
train_data = training_data

In [None]:
# Build both vocabularies from the training split only. Tokens seen fewer
# than 3 times are dropped from the text vocabulary, and the remaining tokens
# are paired with 50-d GloVe vectors.
TEXT.build_vocab(train_data, min_freq=3, vectors="glove.6B.50d")
LABEL.build_vocab(train_data)

print("Size of TEXT vocabulary: ",len(TEXT.vocab))
print("Size of LABEL vocabulary:",len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(10))

# Show only the first few token -> index pairs; printing the whole mapping
# floods the notebook with thousands of entries.
print({token: TEXT.vocab.stoi[token] for token in TEXT.vocab.itos[:10]})
# The label mapping is tiny and tells the reader which class is encoded as 1.
print(dict(LABEL.vocab.stoi))

Size of TEXT vocabulary:  9569
Size of LABEL vocabulary: 2
[('You', 4438), ('"', 4274), ('The', 4099), ('in', 3464), (',', 3233), ('to', 2743), ('To', 2601), ("'s", 2375), ('A', 2301), ('of', 2142)]


In [None]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _token_count(example):
    """Sort key for bucketing: number of tokens in the example's text."""
    return len(example.text)


# Bucket examples of similar length together and sort each batch by length,
# which is the order the LSTM's packed-sequence step relies on.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=_token_count,
    sort_within_batch=True,
    device=device,
)

In [None]:
import torch.nn as nn

class classifier(nn.Module):
    """LSTM sentence classifier that returns sigmoid probabilities.

    Pipeline: token ids -> embedding -> (bi)LSTM over packed sequences ->
    final hidden state of the top layer -> linear layer -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied between stacked LSTM layers.
    """

    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super().__init__()

        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value to a single-layer LSTM just triggers a warning.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bool(bidirectional),
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)
        #dense layer: its input width follows the number of LSTM directions
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        #activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Score a padded batch of token ids.

        Args:
            text: integer tensor of shape [batch size, sent len].
            text_lengths: true (unpadded) length of every row of `text`.

        Returns:
            Tensor of shape [batch size, output dim] with values in [0, 1].
        """
        #text = [batch size, sent len]
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]

        # pack_padded_sequence wants the lengths as an int64 tensor on the CPU,
        # whatever device the batch itself lives on.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()

        #packed sequence; enforce_sorted=False lets callers pass batches that
        #are not sorted by decreasing length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False)

        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]

        if self.lstm.bidirectional:
            #concat the final forward and backward hidden state of the top layer
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            #a unidirectional LSTM has a single final state for the top layer
            hidden = hidden[-1, :, :]

        #hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)

        #Final activation function
        outputs = self.act(dense_outputs)

        return outputs

In [None]:
# Hyperparameters of the LSTM classifier.
size_of_vocab = len(TEXT.vocab)
embedding_dim = 50       # matches the 50-d GloVe vectors attached to the vocabulary
num_hidden_nodes = 32
num_output_nodes = 1     # a single probability per sentence
n_layers = 2
bidirection = True
dropout = 0.2

# Pass `bidirection` through rather than a literal True, so editing the
# setting above actually changes the model.
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, n_layers,
                    bidirectional = bidirection, dropout = dropout)

In [None]:
#architecture
# Print the module tree (layer types and sizes) of the model built above.
print(model)

#No. of trainable parameters
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Parameters with requires_grad=False are skipped.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
# Overwrite the randomly initialised embedding table with the vectors that
# were attached to the vocabulary; done outside autograd so the copy itself
# is not recorded.
pretrained_embeddings = TEXT.vocab.vectors
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(9569, 50)
  (lstm): LSTM(50, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 525,107 trainable parameters
torch.Size([9569, 50])


In [None]:
import torch.optim as optim

#define optimizer and loss
# Adam over every model parameter with a fixed learning rate of 0.01.
optimizer = optim.Adam(model.parameters(),lr = 0.01)
# Binary cross-entropy on probabilities: the classifier already ends in a
# sigmoid, so its outputs are in [0, 1].
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Fraction of predictions that match the 0/1 targets after rounding.

    Args:
        preds: tensor of probabilities in [0, 1].
        y: tensor of 0/1 targets, broadcastable against `preds`.

    Returns:
        0-d float tensor holding the share of matching elements.
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    # numel() rather than len(): len() raises on a 0-d tensor, which is what a
    # single-example batch becomes after a bare .squeeze().
    acc = correct.sum() / correct.numel()
    return acc
    
#push to cuda if available
# `device` is CUDA when available and CPU otherwise.
# NOTE(review): the optimizer is created before this move; the PyTorch optim
# docs recommend moving the model to its device first -- confirm the ordering
# is intentional.
model = model.to(device)
criterion = criterion.to(device)


In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Args:
        model: module called as model(text, text_lengths).
        iterator: iterable of batches exposing `.text` (a (token_ids, lengths)
            pair) and `.label`; must support len().
        optimizer: optimizer holding the model's parameters.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean loss, mean accuracy), each averaged over the batches.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    #set the model in training phase
    model.train()

    for batch in iterator:

        #resets the gradients after every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text

        #convert to 1D tensor. Only the trailing axis is squeezed: a bare
        #.squeeze() would also drop the batch axis of a single-example batch
        #and leave a 0-d tensor that no longer matches batch.label.
        predictions = model(text, text_lengths).squeeze(-1)

        #compute the loss
        loss = criterion(predictions, batch.label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
def evaluate(model, iterator, criterion):
    """Measure loss and accuracy over `iterator` without updating the model.

    Args:
        model: module called as model(text, text_lengths).
        iterator: iterable of batches exposing `.text` (a (token_ids, lengths)
            pair) and `.label`; must support len().
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean loss, mean accuracy), each averaged over the batches.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.text

            #convert to 1d tensor. Only the trailing axis is squeezed so a
            #single-example batch keeps its batch axis and still lines up
            #with batch.label.
            predictions = model(text, text_lengths).squeeze(-1)

            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # one optimisation pass over the training data, then a gradient-free
    # pass over the validation data
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # checkpoint whenever the validation loss reaches a new minimum
    if valid_loss < best_valid_loss:
        torch.save(model.state_dict(), 'saved_weights.pt')
        best_valid_loss = valid_loss

    print(
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )


	Train Loss: 0.044 | Train Acc: 98.32%
	 Val. Loss: 0.017 |  Val. Acc: 99.44%
	Train Loss: 0.005 | Train Acc: 99.86%
	 Val. Loss: 0.023 |  Val. Acc: 99.41%
	Train Loss: 0.002 | Train Acc: 99.95%
	 Val. Loss: 0.020 |  Val. Acc: 99.38%
	Train Loss: 0.002 | Train Acc: 99.95%
	 Val. Loss: 0.023 |  Val. Acc: 99.36%
	Train Loss: 0.001 | Train Acc: 99.96%
	 Val. Loss: 0.027 |  Val. Acc: 99.33%


In [None]:
#load weights
path='/content/drive/My Drive/Classroom/Ass10/saved_weights.pt'
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
model.load_state_dict(torch.load(path, map_location=device));
model.eval();

#inference
import spacy
nlp = spacy.load('en')

def predict(model, sentence):
    """Score one raw sentence with the model.

    The sentence is tokenised with the notebook-level spaCy pipeline `nlp`,
    mapped to ids through `TEXT.vocab`, and fed to the model as a batch of one.

    Args:
        model: module called as model(token_ids, lengths).
        sentence: raw text to classify.

    Returns:
        The model's single output value as a Python float.

    Raises:
        ValueError: if the sentence contains no tokens.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence
    if not tokenized:
        raise ValueError("predict() needs a sentence with at least one token")

    # Look tokens up with .get(): indexing the vocabulary's defaultdict with an
    # unseen token would insert that token and silently grow the mapping.
    stoi = TEXT.vocab.stoi
    unk_index = stoi[TEXT.unk_token]
    indexed = [stoi.get(t, unk_index) for t in tokenized]      #convert to integer sequence

    tensor = torch.LongTensor(indexed).to(device).unsqueeze(0)  #shape: [1, no. of words]
    length_tensor = torch.LongTensor([len(indexed)])            #lengths are integer counts

    # Score in eval mode without autograd, then restore the caller's mode.
    was_training = model.training
    model.eval()
    with torch.no_grad():
        prediction = model(tensor, length_tensor)
    model.train(was_training)
    return prediction.item()


In [None]:
from sklearn.metrics import f1_score
def f1(model, iterator):
    """Compute the F1 score of `model` over every batch of `iterator`.

    Model outputs are treated as probabilities and rounded to 0/1 before
    being compared with the batch labels.

    Args:
        model: module called as model(text, text_lengths).
        iterator: iterable of batches exposing `.text` (a (token_ids, lengths)
            pair) and `.label`.

    Returns:
        The value returned by sklearn's f1_score(labels, predictions).
    """
    p = []
    y = []
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            # Squeeze only the trailing axis so a single-example batch stays
            # 1-D and can still be iterated.
            predictions = model(text, text_lengths).squeeze(-1)
            p.extend(predictions.cpu().numpy().round())
            y.extend(batch.label.cpu().numpy())

    return f1_score(y,p)

In [None]:
# F1 of the current model on the validation iterator; as the last expression
# of the cell, the value is displayed as its output.
f1(model,valid_iterator)

0.994408201304753

ConvNet Implementation


In [None]:
class CNN(nn.Module):
    """Convolutional sentence classifier with max-over-time pooling.

    The embedded sentence, of shape (num_seq, embedding_length), is convolved
    once per kernel height with a kernel as wide as the embedding. Each result
    goes through ReLU and is max-pooled over the sequence axis. The pooled
    features of all kernel heights are concatenated, passed through dropout
    and fed to a linear layer that returns raw logits.

    Arguments
    ---------
    batch_size : Size of each batch (stored only; not used in the computation)
    output_size : Number of output units of the final linear layer
    in_channels : Number of input channels; 1 here, because forward() adds a single channel axis
    out_channels : Number of output channels of every convolution
    kernel_heights : List of kernel heights. One convolution is created per entry and the pooled results are concatenated.
    stride : Stride passed to every convolution
    padding : Padding passed to every convolution
    keep_probab : Probability of retaining an activation during dropout
    vocab_size : Size of the vocabulary containing unique words
    embedding_length : Embedding dimension of the word embeddings
    weights : Pre-trained word embeddings used as a frozen look-up table
    """

    def __init__(self, batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, weights):
        super(CNN, self).__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_heights = kernel_heights
        self.stride = stride
        self.padding = padding
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        # One convolution per kernel height, registered as conv1, conv2, ...
        # The names match the previous fixed conv1..conv3 attributes, so
        # existing checkpoints keep loading. The final linear layer is sized
        # from len(kernel_heights), so the number of convolutions must follow
        # the list as well.
        for position, height in enumerate(kernel_heights, start=1):
            setattr(self, 'conv%d' % position,
                    nn.Conv2d(in_channels, out_channels, (height, embedding_length), stride, padding))
        # nn.Dropout expects the probability of DROPPING a unit, while
        # keep_probab is the probability of keeping it.
        self.dropout = nn.Dropout(1 - keep_probab)
        self.label = nn.Linear(len(kernel_heights)*out_channels, output_size)

    def conv_block(self, input, conv_layer):
        """Apply one convolution, ReLU and max-over-time pooling.

        Args:
            input: tensor of shape (batch_size, in_channels, num_seq, embedding_length).
            conv_layer: the Conv2d module to apply.

        Returns:
            Tensor of shape (batch_size, out_channels).
        """
        # A sentence shorter than the kernel would make the convolution fail.
        # Append all-zero rows at the end of the sequence axis until the
        # kernel fits once.
        kernel_height = conv_layer.kernel_size[0]
        row_padding = conv_layer.padding[0] if isinstance(conv_layer.padding, tuple) else 0
        shortfall = kernel_height - (input.size(2) + 2 * row_padding)
        if shortfall > 0:
            input = F.pad(input, (0, 0, 0, shortfall))

        conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1)
        activation = F.relu(conv_out.squeeze(3))# activation.size() = (batch_size, out_channels, dim)
        max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)# max_out.size() = (batch_size, out_channels)

        return max_out

    def forward(self, input_sentences, batch_size=None):
        """Return the logits for a batch of token-id sequences.

        Parameters
        ----------
        input_sentences: token ids of shape (batch_size, num_sequences)
        batch_size : default = None. Accepted for call compatibility; not used.

        Returns
        -------
        Output of the linear layer (raw logits, no sigmoid/softmax applied).
        logits.size() = (batch_size, output_size)
        """
        embedded = self.word_embeddings(input_sentences)
        # embedded.size() = (batch_size, num_seq, embedding_length)
        embedded = embedded.unsqueeze(1)
        # embedded.size() = (batch_size, 1, num_seq, embedding_length)
        pooled = [
            self.conv_block(embedded, getattr(self, 'conv%d' % position))
            for position in range(1, len(self.kernel_heights) + 1)
        ]

        all_out = torch.cat(pooled, 1)
        # all_out.size() = (batch_size, num_kernels*out_channels)
        fc_in = self.dropout(all_out)
        # fc_in.size() = (batch_size, num_kernels*out_channels)
        logits = self.label(fc_in)

        return logits

In [None]:
# Plain-dict snapshot of the vocabulary's token -> index mapping, used when
# building the embedding matrix for the CNN.
word_idx = dict(TEXT.vocab.stoi)

In [None]:
def create_embedding_matrix(filepath, word_idx, embedding_dim):
    """Build an embedding matrix from a text file of word vectors.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace.

    Args:
        filepath: path of the word-vector text file (read as UTF-8).
        word_idx: mapping from word to its row index in the matrix.
        embedding_dim: number of vector components to keep per word.

    Returns:
        numpy array of shape (max index + 1, embedding_dim). Rows of words
        that do not occur in the file stay all-zero.
    """
    # Size the table from the largest index actually in use. This gives
    # len(word_idx) rows for 0-based mappings (such as torchtext's stoi) and
    # len(word_idx) + 1 rows for 1-based mappings that reserve index 0.
    vocab_size = max(word_idx.values(), default=-1) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines
            word, vector = parts[0], parts[1:]
            if word in word_idx:
                idx = word_idx[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix
# Build the 50-d embedding table for the vocabulary from the GloVe text file
# in the .vector_cache folder.
# NOTE(review): absolute Drive path; the notebook already works from this
# folder, so a relative path would presumably do -- confirm before changing.
embedding_matrix = create_embedding_matrix('/content/drive/My Drive/Classroom/Ass10/.vector_cache/glove.6B.50d.txt',
                                            word_idx,
                                            embedding_dim = 50)

In [None]:
# Single-logit CNN with one filter for each of the kernel heights 3, 2 and 1.
model = CNN(
    batch_size=BATCH_SIZE,
    output_size=1,
    in_channels=1,
    out_channels=1,
    kernel_heights=[3, 2, 1],
    stride=1,
    padding=0,
    keep_probab=0.5,
    vocab_size=size_of_vocab,
    embedding_length=50,
    # Follow the notebook-wide `device` instead of a hard .cuda(), which
    # fails on a CPU-only runtime.
    weights=torch.tensor(embedding_matrix, dtype=torch.float, device=device),
)

In [None]:
# Print the module tree (layer types and sizes) of the CNN built above.
print(model)

CNN(
  (word_embeddings): Embedding(9569, 50)
  (conv1): Conv2d(1, 1, kernel_size=(3, 50), stride=(1, 1))
  (conv2): Conv2d(1, 1, kernel_size=(2, 50), stride=(1, 1))
  (conv3): Conv2d(1, 1, kernel_size=(1, 50), stride=(1, 1))
  (dropout): Dropout(p=0.5, inplace=False)
  (label): Linear(in_features=3, out_features=1, bias=True)
)


In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Parameters with requires_grad=False are skipped.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
# Only parameters with requires_grad=True are counted, so the CNN's frozen
# embedding table is not part of this number.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 307 trainable parameters


In [None]:
import torch.optim as optim

#push to cuda if available
# Done before the optimizer is built, as the PyTorch optim docs recommend.
model = model.to(device)

#define optimizer and loss
optimizer = optim.Adam(model.parameters(),lr = 0.01)
# CNN.forward returns raw logits (no sigmoid), so the sigmoid has to live in
# the loss. Plain BCELoss on unbounded logits is what yields the NaN loss.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

#define metric
def binary_accuracy(preds, y):
    """Fraction of logit predictions that match the 0/1 targets.

    Args:
        preds: tensor of raw logits.
        y: tensor of 0/1 targets, broadcastable against `preds`.

    Returns:
        0-d float tensor holding the share of matching elements.
    """
    #turn logits into probabilities, then round to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    # numel() rather than len(): len() raises on a 0-d tensor.
    acc = correct.sum() / correct.numel()
    return acc


In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Args:
        model: module called as model(text, text_lengths). The CNN accepts
            the second positional argument as its unused `batch_size`.
        iterator: iterable of batches exposing `.text` (a (token_ids, lengths)
            pair) and `.label`; must support len().
        optimizer: optimizer holding the model's parameters.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean loss, mean accuracy), each averaged over the batches.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    #set the model in training phase
    model.train()

    for batch in iterator:

        #resets the gradients after every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text

        #convert to 1D tensor. Only the trailing axis is squeezed: a bare
        #.squeeze() would also drop the batch axis of a single-example batch
        #and leave a 0-d tensor that no longer matches batch.label.
        predictions = model(text, text_lengths).squeeze(-1)

        #compute the loss
        loss = criterion(predictions, batch.label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    #save the best model
    # The CNN gets its own checkpoint file: writing to 'saved_weights.pt'
    # would overwrite the LSTM checkpoint saved earlier in the notebook, and
    # the LSTM load cell would then fail on mismatched keys.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'cnn_saved_weights.pt')

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: nan | Train Acc: 50.81%
	 Val. Loss: 0.946 |  Val. Acc: 49.73%
	Train Loss: nan | Train Acc: 49.74%
	 Val. Loss: 0.946 |  Val. Acc: 49.73%


In [None]:
from sklearn.metrics import f1_score
def f1(model, iterator, from_logits=False):
    """Compute the F1 score of `model` over every batch of `iterator`.

    Args:
        model: module called as model(text, text_lengths).
        iterator: iterable of batches exposing `.text` (a (token_ids, lengths)
            pair) and `.label`.
        from_logits: set to True when the model returns raw logits; a sigmoid
            is then applied before rounding. With the default (False) the
            outputs are treated as probabilities.

    Returns:
        The value returned by sklearn's f1_score(labels, predictions).
    """
    p = []
    y = []
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            # Squeeze only the trailing axis so a single-example batch stays
            # 1-D and can still be iterated.
            predictions = model(text, text_lengths).squeeze(-1)
            if from_logits:
                predictions = torch.sigmoid(predictions)
            p.extend(predictions.cpu().numpy().round())
            y.extend(batch.label.cpu().numpy())

    return f1_score(y,p)

# The CNN returns logits, so they must pass through a sigmoid before rounding.
f1(model, valid_iterator, from_logits=True)

0.0