## English ( LSTM )

Reference: https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/

In [1]:
import random

import torch

# handling text data.
from torchtext import data

# Seed every RNG this notebook relies on, so that a fresh
# "Restart & Run All" reproduces the same numbers.
SEED = 2020
# torchtext's legacy iterators take their shuffle state from Python's
# `random` module (see torchtext's RandomShuffler), so seeding torch alone
# leaves the batch order unseeded.
random.seed(SEED)
# Ask cuDNN for deterministic kernels on GPU runs.
torch.backends.cudnn.deterministic = True
# Kept as the last expression so the cell still displays the Generator
# object returned by torch.manual_seed.
torch.manual_seed(SEED)

<torch._C.Generator at 0x114d86790>

### 1. load dataset:

In [2]:
from torchtext.vocab import Vectors
from torch.nn import init
from tqdm import tqdm

# Text column: whitespace tokenisation; tensors are batch-first and each
# batch carries the sequence lengths alongside the token ids.
TEXT = data.Field(
    tokenize=str.split,
    batch_first=True,
    include_lengths=True,
)
# Label column: targets are stored as floats.
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

In [3]:
# Map CSV columns to fields by position: columns 0-3 and 5 are ignored,
# column 4 is the tweet text and column 6 the label.
fields = [(None, None)] * 4 + [('text', TEXT), (None, None), ('label', LABEL)]

# Both splits share the same column layout; the header row is skipped.
train_data, valid_data = (
    data.TabularDataset(path=csv_path, format='csv', fields=fields, skip_header=True)
    for csv_path in ('preprocess_train.csv', 'preprocess_dev.csv')
)

# Show the first parsed training example (token list and raw label).
print(vars(train_data.examples[0]))

{'text': ['since', 'never', 'get', 'report', 'medium', 'want', 'share', 'copy', 'check', 'realdonaldtrump', 'donate', 'salary', 'back', 'united', 'states', 'government', '—', 'quarter', 'donate', 'hhsgov', 'confront', 'contain', 'combat', 'coronavirus', 'flag', 'united', 'statesflag', 'united', 'states'], 'label': '1'}


### 2. build vocab:

In [4]:
# Build the text vocabulary from the training split only and attach the
# pretrained vectors requested by name ("glove.6B.300d").
# min_freq = 3 is forwarded to build_vocab; presumably tokens seen fewer than
# three times are left out of the vocabulary — see the torchtext docs.
# NOTE(review): some of the most frequent tokens printed below (e.g.
# 'covid-19', 'covid19') may have no GloVe vector; how their rows are
# initialised depends on unk_init, which is left at its default here — confirm.
TEXT.build_vocab(train_data, min_freq = 3, vectors = "glove.6B.300d")  

# Label vocabulary, also built from the training split.
LABEL.build_vocab(train_data)

# Number of entries in the text vocabulary.
print("Size of TEXT vocabulary:",len(TEXT.vocab))

# Number of entries in the label vocabulary.
print("Size of LABEL vocabulary:",len(LABEL.vocab))

# Ten most frequent training tokens with their counts.
print(TEXT.vocab.freqs.most_common(10))  
# Word dictionary (token -> index), left disabled.
# print(TEXT.vocab.stoi)

Size of TEXT vocabulary: 741
Size of LABEL vocabulary: 2
[('coronavirus', 184), ('virus', 124), ('corona', 112), ('covid-19', 110), ('covid19', 104), ('case', 78), ('people', 76), ('test', 65), ('get', 57), ('amp', 55)]


In [5]:
# Use the GPU when torch reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

# Number of examples per batch.
BATCH_SIZE = 64

# One iterator per split, both placing their tensors on `device`.
# sort_key measures an example by its token count and sort_within_batch=True
# is requested, presumably so every batch arrives ordered by length (the
# model packs the padded sequences).
# NOTE(review): with length-based sorting, the order in which valid_iterator
# yields examples presumably differs from the row order of preprocess_dev.csv
# — confirm before pairing its predictions with tweet ids read from the CSV.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

### 2. define neural network architecture:

<img src="architecture.png" width="400" height="400">

In [6]:
import torch.nn as nn

class classifier(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
        hidden_dim: hidden size of the LSTM, per direction.
        output_dim: number of units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability between stacked LSTM layers.
    """

    # define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):

        # Constructor
        super().__init__()

        # remembered so forward() knows how to read the final hidden state
        self.bidirectional = bidirectional

        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            # nn.LSTM only applies dropout between layers; with a
                            # single layer a non-zero value has no effect but warns
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        # dense layer: its input is one hidden vector per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        # activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return per-example probabilities of shape [batch size, output dim].

        Args:
            text: token ids, shape [batch size, sent len].
            text_lengths: number of real (unpadded) tokens of each example.
        """
        # text = [batch size, sent len]
        embedded = self.embedding(text)
        # embedded = [batch size, sent len, emb dim]

        # pack_padded_sequence wants the lengths as a CPU int64 tensor even when
        # the batch lives on the GPU; enforce_sorted=False lets it accept batches
        # in any order (already-sorted batches behave as before).
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False)

        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden = [num layers * num directions, batch size, hid dim]
        # (batch_first only affects the input/output tensors, not the states)

        if self.bidirectional:
            # last layer: concat the final forward and backward hidden states
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # last layer, single direction
            hidden = hidden[-1, :, :]
        # hidden = [batch size, hid dim * num directions]

        dense_outputs = self.fc(hidden)

        # Final activation function
        return self.act(dense_outputs)

In [7]:
# Hyperparameters of the LSTM classifier.
size_of_vocab = len(TEXT.vocab)
embedding_dim = 300    # has to match the width of the "glove.6B.300d" vectors copied in later
num_hidden_nodes = 32
num_output_nodes = 1   # a single output unit for the binary label
num_layers = 2
bidirection = True
dropout = 0.2

# Instantiate the model. The `bidirection` hyperparameter is passed through
# (rather than a literal) so that changing it above actually takes effect.
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers,
                   bidirectional = bidirection, dropout = dropout)

In [8]:
# Print the model's layer summary (its repr) to inspect the architecture.
print(model)

# Number of trainable parameters.
def count_parameters(model):
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')

# Initialise the embedding table with the pretrained vectors attached to the
# text vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
# copy_ broadcasts its source, so a table of the wrong shape could be accepted
# silently; fail loudly instead.
assert pretrained_embeddings.shape == model.embedding.weight.shape, (
    f'embedding table is {tuple(model.embedding.weight.shape)} but the '
    f'pretrained vectors are {tuple(pretrained_embeddings.shape)}')
# Copy in place under no_grad rather than through `.data`, so the write is
# explicitly kept out of autograd.
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(741, 300)
  (lstm): LSTM(300, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 332,957 trainable parameters
torch.Size([741, 300])


In [9]:
import torch.optim as optim

# Optimizer and loss.
# Adam is created with its default settings over all of the model's parameters.
# NOTE(review): the optimizer is built before the model is moved to `device`
# further down in this cell; PyTorch's optim notes recommend moving the model
# first — confirm this ordering is intended.
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy on probabilities (the model's last layer is a sigmoid).
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Fraction of predictions that equal the targets after rounding.

    Args:
        preds: tensor of probabilities; each value is rounded to the nearest
            integer before the comparison.
        y: tensor of targets, compared element-wise with the rounded ``preds``.

    Returns:
        0-dim float tensor holding ``matches / number of elements``.
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float()
    # numel() rather than len(): len() raises on a 0-dim tensor (a squeezed
    # single-example batch) and counts only the first axis of >1-D input.
    acc = correct.sum() / correct.numel()
    return acc
    
# Move the model and the loss module to the selected device (the GPU when
# available, otherwise this keeps them on the CPU).
model = model.to(device)
criterion = criterion.to(device)

### 3. define training function: 

In [10]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator``.

    Args:
        model: module called as ``model(text, text_lengths)``; expected to
            return one column of probabilities per example.
        iterator: sized iterable of batches exposing ``batch.text`` (a
            ``(tokens, lengths)`` pair) and ``batch.label``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(loss, accuracy)``, each averaged over the number of batches.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for batch in iterator:

        #reset the gradients before every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text

        #drop only the output axis: a bare squeeze() would also remove the
        #batch axis of a single-example batch, and the loss would then reject
        #the prediction/label shape mismatch
        predictions = model(text, text_lengths).squeeze(1)

        #compute the loss
        loss = criterion(predictions, batch.label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### 4. define evaluate function:

In [11]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: module called as ``model(text, text_lengths)``; expected to
            return one column of probabilities per example.
        iterator: sized iterable of batches exposing ``batch.text`` (a
            ``(tokens, lengths)`` pair) and ``batch.label``.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(loss, accuracy, pred_prob_list)``: loss and accuracy averaged over
        the number of batches, plus one 1-D CPU tensor of predicted
        probabilities per batch, in the order the iterator yielded them.
    """
    pred_prob_list = list()

    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.text

            #drop only the output axis: a bare squeeze() would also remove the
            #batch axis of a single-example batch and break the loss call
            predictions = model(text, text_lengths).squeeze(1)

            #keep a CPU copy so callers can hand the probabilities to numpy
            #even when the model runs on the GPU
            pred_prob_list.append(predictions.cpu())

            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator), pred_prob_list

In [12]:
N_EPOCHS = 10
best_valid_loss = float('inf')
# Validation probabilities of the best epoch so far. Defined up front so the
# name exists even if no epoch ever beats the initial loss (e.g. a NaN loss).
prob_list = []

for epoch in range(N_EPOCHS):

    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    #evaluate the model
    valid_loss, valid_acc, pred_prob = evaluate(model, valid_iterator, criterion)

    #save the best model (lowest validation loss seen so far)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
        # probabilities of the best model: one tensor per validation batch
        prob_list = pred_prob

    print('Epoch: ', epoch+1)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print('-----------------------------------------------')

Epoch:  1
	Train Loss: 0.703 | Train Acc: 46.03%
	 Val. Loss: 0.690 |  Val. Acc: 51.42%
-----------------------------------------------
Epoch:  2
	Train Loss: 0.693 | Train Acc: 45.90%
	 Val. Loss: 0.684 |  Val. Acc: 56.11%
-----------------------------------------------
Epoch:  3
	Train Loss: 0.682 | Train Acc: 62.04%
	 Val. Loss: 0.678 |  Val. Acc: 74.20%
-----------------------------------------------
Epoch:  4
	Train Loss: 0.668 | Train Acc: 71.22%
	 Val. Loss: 0.668 |  Val. Acc: 58.24%
-----------------------------------------------
Epoch:  5
	Train Loss: 0.644 | Train Acc: 68.83%
	 Val. Loss: 0.648 |  Val. Acc: 59.80%
-----------------------------------------------
Epoch:  6
	Train Loss: 0.602 | Train Acc: 74.79%
	 Val. Loss: 0.608 |  Val. Acc: 72.68%
-----------------------------------------------
Epoch:  7
	Train Loss: 0.528 | Train Acc: 77.79%
	 Val. Loss: 0.557 |  Val. Acc: 71.16%
-----------------------------------------------
Epoch:  8
	Train Loss: 0.431 | Train Acc: 83.69%

In [13]:
# load model:
# path = 'saved_weights.pt'
# model.load_state_dict(torch.load(path))
# model.eval()

# evaluate(model, iterator, criterion)

In [14]:
import numpy as np
   
# Flatten the per-batch probability tensors of the best epoch into one list.
# Every batch is included, however many the validation iterator produced
# (indexing batches 0-2 by hand only works for exactly three batches), and
# each tensor is moved to the CPU first because numpy cannot read CUDA memory.
preds_proba = [
    prob
    for batch_probs in prob_list
    for prob in batch_probs.detach().cpu().numpy().reshape(-1)
]

len(preds_proba)

150

In [15]:
import pandas as pd

# Tweet ids of the dev split, in CSV row order. They are read as strings: the
# ids are 19-digit integers, and any float conversion along the way (e.g.
# triggered by a missing value) would silently corrupt their last digits.
dev = pd.read_csv('preprocess_dev.csv', usecols=['tweet_id'], dtype={'tweet_id': str})

dev.shape

(150, 1)

In [16]:
# Submission table: one row per dev tweet with its predicted score.
# NOTE(review): the tweet ids are in CSV row order, while preds_proba follows
# the order in which valid_iterator yielded examples (length-sorted batches);
# confirm the two line up, otherwise scores are attached to the wrong tweets.
tweet_ids = list(dev['tweet_id'])
assert len(tweet_ids) == len(preds_proba), (
    f'{len(tweet_ids)} tweet ids but {len(preds_proba)} predictions')

results = pd.DataFrame({
    'topic_id': 'covid-19',
    'tweet_id': tweet_ids,
    'score': list(preds_proba),
    'run_id': 'Model_1',
})

# First ten rows (.loc[:10] is label-inclusive and would show eleven).
results.head(10)

Unnamed: 0,topic_id,tweet_id,score,run_id
0,covid-19,1235714275752267776,0.461557,Model_1
1,covid-19,1235256530728972290,0.513492,Model_1
2,covid-19,1235648554338791427,0.840732,Model_1
3,covid-19,1235674258858061825,0.882272,Model_1
4,covid-19,1235663306246860800,0.909752,Model_1
5,covid-19,1235436227140055040,0.914788,Model_1
6,covid-19,1235602629247537154,0.849051,Model_1
7,covid-19,1235566351093137408,0.856797,Model_1
8,covid-19,1235620307534258176,0.351739,Model_1
9,covid-19,1235758466784014337,0.033685,Model_1


In [17]:
# Write the run file: tab-separated, without a header row or index column.
results.to_csv('golf_system_results_2.tsv', sep='\t', header=False, index=False)

INFO : ================ RESULTS for golf_system_results_2.tsv ====================================

INFO : AVERAGE PRECISION:            0.5540    
INFO : ==================================================================================

INFO : RECIPROCAL RANK:              1.0000    
INFO : ===================================================================================

INFO : R-PRECISION (R=72):           0.5278    
INFO : ===================================================================================

INFO : PRECISION@N:                  @1        @3        @5        @10       @20       @50       
INFO :                               1.0000    0.6667    0.6000    0.5000    0.5500    0.6000    
INFO : ====================================================================================