We are using PyTorch as our deep learning framework. 
Importing necessary libraries to pre-processing, tokenizing, train, writing model states and evaluation.

In [1]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset

from tqdm.notebook import tqdm
from tqdm.auto import tqdm

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [2]:
import os
import numpy as np
import pandas as pd
import gensim
import zipfile

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import random


Checking the device. We will proceed if there is a GPU available.

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(torch.cuda.device_count())
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
else:
    exit(0)
torch.cuda.empty_cache()

2
Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


Download the train set and load into a DataFrame

In [4]:
#if not os.path.exists("./Dataset"):
#    os.makedirs("./Dataset")

#if not os.path.isfile("./Dataset/github-labels-top3-803k-train.csv"):
#    !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz 
#    !mv github-labels-top3-803k-train.csv ./Dataset/

dataset_path = '/kaggle/input/tweet-classification-dataset/Corona_NLP_train.csv'

df = pd.read_csv(dataset_path, encoding='latin-1')

print(df.head())

#df = pd.read_csv('/kaggle/input/tweet-classification-dataset')
#print(df.head())

   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral  
1  advice Talk to your neighbours family to excha...            Positive  
2  Coronavirus Australia: Woolworths to give elde...            Positive  
3  My food stock is not the only one which is emp...            Positive  
4  Me, ready to go at supermarket during the #COV...  Extremely Negative  


Check the labels and map to a index value. 

In [5]:
print(df['Sentiment'].value_counts())
possible_labels = df.Sentiment.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
print(label_dict)


Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64
{'Neutral': 0, 'Positive': 1, 'Extremely Negative': 2, 'Negative': 3, 'Extremely Positive': 4}


Replace the label colomn with label index.

In [6]:
df['label'] = df.Sentiment.replace(label_dict)

print(df.head())

   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  \
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral   
1  advice Talk to your neighbours family to excha...            Positive   
2  Coronavirus Australia: Woolworths to give elde...            Positive   
3  My food stock is not the only one which is emp...            Positive   
4  Me, ready to go at supermarket during the #COV...  Extremely Negative   

   label  
0      0  
1      1  
2      1  
3      1  
4      2  


  df['label'] = df.Sentiment.replace(label_dict)


Pre-preocessing function for removing whitespace and creating new feature.

In [7]:
# preprocessing can be customized by participants
def preprocess(row):
  # concatenate title and body, then remove whitespaces
  doc = ""
  doc += str(row.TweetAt)
  doc += " "
  doc += str(row.OriginalTweet)
  # https://radimrehurek.com/gensim/parsing/preprocessing.html
  doc = gensim.parsing.preprocessing.strip_multiple_whitespaces(doc)
  return doc

Applying preporcessing step on the dataframe.

In [8]:
df['tweet_info'] = df.apply(preprocess, axis=1)
print(df.head())

newDF = df[['Sentiment','OriginalTweet','label']]
df = newDF.copy()
print(df.head())

   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  \
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral   
1  advice Talk to your neighbours family to excha...            Positive   
2  Coronavirus Australia: Woolworths to give elde...            Positive   
3  My food stock is not the only one which is emp...            Positive   
4  Me, ready to go at supermarket during the #COV...  Extremely Negative   

   label                                         tweet_info  
0      0  16-03-2020 @MeNyrbie @Phil_Gahan @Chrisitv htt...  
1      1  16-03-2020 advice Talk to your neighbours fami...  
2      1  16-03-2020 Coronavirus Australia: Woolworths t

Split the dataset into train and validation set.

In [9]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

Marking train and validation data.

In [10]:
df['data_type'] = ['not_set']*df.shape[0]

print(df.head())
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

print(df.head())


            Sentiment                                      OriginalTweet  \
0             Neutral  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...   
1            Positive  advice Talk to your neighbours family to excha...   
2            Positive  Coronavirus Australia: Woolworths to give elde...   
3            Positive  My food stock is not the only one which is emp...   
4  Extremely Negative  Me, ready to go at supermarket during the #COV...   

   label data_type  
0      0   not_set  
1      1   not_set  
2      1   not_set  
3      1   not_set  
4      2   not_set  
            Sentiment                                      OriginalTweet  \
0             Neutral  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...   
1            Positive  advice Talk to your neighbours family to excha...   
2            Positive  Coronavirus Australia: Woolworths to give elde...   
3            Positive  My food stock is not the only one which is emp...   
4  Extremely Negative  Me, ready to g

Initiating BertTokenizer from 'bert-base-uncased' model.

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Encoding train set.

In [12]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].OriginalTweet.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='longest',
    truncation=True, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

Encoding validation set.

In [13]:
encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].OriginalTweet.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='longest',
    truncation=True,
    return_tensors='pt'
)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

Creating TensorDataset from encoded and masked train and validation set.

In [14]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
print(len(dataset_train), len(dataset_val))


34983 6174


Initiating model from 'bert-base-uncased' pretrained model.

In [15]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fixing batch_size and creating dataloader for training and validating.

In [16]:

batch_size = 4

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)


Initiating optimizer.

In [17]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)




Fixing epochs number and initiating scheduler.

In [18]:
epochs = 4

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

Declaring functions for evaluting.

In [19]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')
    
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

Fixing seed value for random sampling.

In [20]:

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

Moving the model to GPU.

In [21]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Starting the training process.

In [22]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'/kaggle/working/model.pth')   
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/8746 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.8424105559132411
Validation loss: 0.6738682794782762
F1 Score (Weighted): 0.8112004897794232


Epoch 2:   0%|          | 0/8746 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.571240645499295
Validation loss: 0.7156030633565976
F1 Score (Weighted): 0.84216169339436


Epoch 3:   0%|          | 0/8746 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.4110366538661865
Validation loss: 0.838454674407321
F1 Score (Weighted): 0.8344256728717292


Epoch 4:   0%|          | 0/8746 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.29352923692163796
Validation loss: 0.7490406930984216
F1 Score (Weighted): 0.8645572614873447
