In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The dataset CSVs read later in this notebook are addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# installing dependencies
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m93.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.2/224.2 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# importing the required libraries
import numpy as np 
import pandas as pd
import re
import string
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed, TrainingArguments, Trainer, GPT2Config, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup, GPT2ForSequenceClassification)
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
# Read the training dataset from the mounted Drive folder.
# The preview below shows a `headline` text column and a `label` column
# (labels look like small integer class ids -- TODO confirm the class meaning).
train_df = pd.read_csv('/content/drive/MyDrive/SEM6/NLP/Project/data/clean_train.csv')
train_df.head()

Unnamed: 0,headline,label
0,CWG18 ಕುಸ್ತಿಯಲ್ಲಿ ಚಿನ್ನಗಳಿಸಿದ ರಾಹುಲ್ ಅವಾರೆ ಸುಶ...,1
1,ಏಷ್ಯಾ ಕಪ್ 2018 ಪಾಕ್ ವಿರುದ್ಧ ಘರ್ಜಿಸಲು ರೋಹಿತ್ ಸೈ...,1
2,ಸಮಂತಾ ವಿಷಯದಲ್ಲಿ ಯೂ ಟರ್ನ್ ಹೊಡೆದ ನಾಗ ಚೈತನ್ಯ,0
3,ಐಶ್ ಬೇಬಿ ಸೌಂದರ್ಯದ ಗುಟ್ಟು ರಟ್ಟು 40 ದಾಟಿದರೂ ಹಾಟ...,0
4,ಟೀಂ ಇಂಡಿಯಾ ಆಯ್ಕೆ ಸಮಿತಿ ಸದಸ್ಯರ ಸಂಭಾವನೆ ಎಷ್ಟು ಗೊ...,1


In [None]:
# splitting the data into training (80%) and validation (20%) sets
#
# The shuffle is seeded so the same rows land in each split on every run, and it is
# done on the DataFrame itself (not on `.values`) so that the `label` column keeps
# its numeric dtype and the columns are selected by name instead of by position.
split_seed = 42
shuffled_df = (
    train_df[['headline', 'label']]
    .sample(frac = 1, random_state = split_seed)   # frac = 1 -> a full random permutation of the rows
    .rename(columns = {'headline': 'text'})        # downstream code reads the 'text' column
    .reset_index(drop = True)
)

# first 80% of the shuffled rows for training, the remainder for validation
num_of_rows = int(len(shuffled_df) * 0.8)
train = shuffled_df.iloc[:num_of_rows].reset_index(drop = True)
validation = shuffled_df.iloc[num_of_rows:].reset_index(drop = True)

In [None]:
# defining the hyperparameters
seed = 42         # fixed seed so weight initialisation, batch shuffling and dropout are repeatable
max_len = None    # forwarded to the tokenizer as `max_length`; None leaves the limit to the tokenizer's default
batch_size = 8
epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `set_seed` (already imported from transformers) seeds Python's `random`, NumPy and PyTorch in one call
set_seed(seed)

In [None]:
# PyTorch dataset wrapper around a table of samples
class DatasetCreator(Dataset):
    """Serve the rows of a table as ``{'text': ..., 'label': ...}`` dictionaries.

    Args:
        processed_data: object supporting ``len()`` and ``.iloc[index]`` whose rows
            expose a 'text' entry (and a 'label' entry when ``train`` is truthy).
        train: when truthy each item carries the row's own label; otherwise every
            item carries the placeholder label 0.
    """

    def __init__(self, processed_data, train):
        self.train = train
        self.data = processed_data

    def __len__(self):
        """Number of samples, i.e. the length of the wrapped data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at positional ``index``."""
        row = self.data.iloc[index]
        # the stored label is only read in training mode
        label = row['label'] if self.train else 0
        return {'text': row['text'], 'label': label}

# collate callable: tokenizes a batch of samples and attaches their labels for the dataloader
class GPT2_collator(object):
    """Turn a list of ``{'text', 'label'}`` samples into one tokenized batch.

    Args:
        tokenizer: callable invoked with ``text``, ``return_tensors``, ``padding``,
            ``truncation`` and ``max_length`` keyword arguments; it must return a
            mutable mapping.
        max_seq_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, max_seq_len = None):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __call__(self, sequences):
        """Tokenize the samples' texts and add their labels under the 'labels' key."""
        texts = []
        for sample in sequences:
            texts.append(sample['text'])

        # labels are coerced to plain ints before being packed into a tensor
        labels = []
        for sample in sequences:
            labels.append(int(sample['label']))

        tokenizer_kwargs = {
            'text': texts,
            'return_tensors': 'pt',
            'padding': True,
            'truncation': True,
            'max_length': self.max_seq_len,
        }
        encoded = self.tokenizer(**tokenizer_kwargs)
        encoded['labels'] = torch.tensor(labels)
        return encoded


# one full training pass over a dataloader
def epoch_train(dataloader, optimizer, scheduler, device):
    """Train the module-level ``model`` for one epoch.

    For every batch: all tensors are cast to ``torch.long`` and moved to ``device``,
    a forward and backward pass is run, gradients are clipped to a maximum norm
    of 1.0, and both ``optimizer`` and ``scheduler`` are stepped.

    Args:
        dataloader: iterable of batches (mappings of tensors containing a 'labels' entry).
        optimizer: optimizer updating the model parameters.
        scheduler: learning-rate scheduler, stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        tuple: (predicted class ids, true class ids, mean of the per-batch losses).
    """
    global model
    model.train()
    predicted, expected = [], []
    running_loss = 0

    for batch in tqdm(dataloader, total = len(dataloader)):
        # the reference labels are read before the batch is moved to `device`
        expected.extend(batch['labels'].numpy().flatten().tolist())
        batch = {name: tensor.type(torch.long).to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        # the first two outputs are unpacked as (loss, logits)
        loss, logits = outputs[:2]
        running_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        predicted.extend(logits.argmax(axis=-1).flatten().tolist())

    return predicted, expected, running_loss / len(dataloader)

# one full evaluation pass over a labelled dataloader
def epoch_validate(dataloader, device):
    """Evaluate the module-level ``model`` on ``dataloader`` without updating it.

    The model is put in eval mode and every forward pass runs with gradients disabled.

    Args:
        dataloader: iterable of batches (mappings of tensors containing a 'labels' entry).
        device: device the batch tensors are moved to.

    Returns:
        tuple: (predicted class ids, true class ids, mean of the per-batch losses).
    """
    global model
    model.eval()
    predicted, expected = [], []
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, total = len(dataloader)):
            # the reference labels are read before the batch is moved to `device`
            expected.extend(batch['labels'].numpy().flatten().tolist())
            batch = {name: tensor.type(torch.long).to(device) for name, tensor in batch.items()}

            outputs = model(**batch)
            # the first two outputs are unpacked as (loss, logits)
            loss, logits = outputs[:2]
            running_loss += loss.item()
            predicted.extend(logits.argmax(axis = -1).flatten().tolist())

    return predicted, expected, running_loss / len(dataloader)

def epoch_predict(dataloader, device):
    """Predict a class id for every sample yielded by ``dataloader``.

    Runs the module-level ``model`` in eval mode with gradients disabled. Any
    'labels' entry in a batch is dropped before the forward pass: inference needs
    no loss, and the logits are then located the same way whether or not the
    collator attached (placeholder) labels. Unpacking ``outputs[:2]`` as
    ``(loss, logits)`` is only valid when a loss was actually computed.

    Args:
        dataloader: iterable of batches (mappings of tensors).
        device: device the batch tensors are moved to.

    Returns:
        list: argmax class index per sample, in the order the dataloader yields them.
    """
    global model
    model.eval()
    predictions_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, total = len(dataloader)):
            batch = {k: v.type(torch.long).to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**batch)
            # With no labels passed there is no loss, so the logits come first:
            # `.logits` covers the structured output, `[0]` the plain-tuple output.
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
            predictions_labels += logits.argmax(axis = -1).flatten().tolist()
    return predictions_labels

In [None]:
# loading the model and the tokenizer
print('Loading gpt-2 model')
# pretrained 'gpt2' configuration with the number of output labels overridden to 3
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path = 'gpt2', num_labels = 3)

print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path = 'gpt2')
# pad on the left and reuse the end-of-sequence token as the padding token
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path = 'gpt2', config = model_config)
# size the embedding matrix to the tokenizer's vocabulary length
model.resize_token_embeddings(len(tokenizer)) 
# tell the model which token id is padding (the same id as end-of-sequence)
model.config.pad_token_id = model.config.eos_token_id

# move the model to the selected device; as the cell's last expression this also displays the model
model.to(device)

Loading gpt-2 model


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loading tokenizer...


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Loading model...


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

Prepare the training and validation dataloaders

In [None]:
# collate callable shared by the dataloaders: tokenizes and pads every batch
gpt2_collator = GPT2_collator(tokenizer=tokenizer, max_seq_len=max_len)

# preparing training data (reshuffled on every epoch)
train_data = DatasetCreator(train, train = True)
train_dataloader = DataLoader(train_data, batch_size = batch_size, shuffle = True, collate_fn = gpt2_collator)

# preparing validation data
# `train = True` because the validation rows carry real labels, needed for the loss and accuracy.
# No shuffling: evaluation gains nothing from it, and a fixed order keeps the batch
# composition (and so the per-batch averaged validation loss) comparable across epochs.
val_data = DatasetCreator(validation, train = True)
val_dataloader = DataLoader(val_data, batch_size = batch_size, shuffle = False, collate_fn = gpt2_collator)

In [None]:
# training the model
# torch's own AdamW is used instead of the deprecated `transformers.AdamW`;
# it accepts the same lr / eps / weight_decay arguments.
optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5, eps = 1e-8, weight_decay = 0.01)
# the scheduler is stepped once per batch, so the schedule spans batches-per-epoch * epochs steps
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

# per-epoch history, read by the plotting cells
loss = []
accuracy = []
val_loss_list = []
val_accuracy_list = []

for epoch in tqdm(range(epochs)):
    # `train_labels` holds the model's predictions, `true_labels` the references
    train_labels, true_labels, train_loss = epoch_train(train_dataloader, optimizer, scheduler, device)
    train_acc = accuracy_score(true_labels, train_labels)
    # the epoch index is an integer, so it is printed with %d rather than %.2f
    print('epoch: %d train accuracy %.2f' % (epoch, train_acc))
    loss.append(train_loss)
    accuracy.append(train_acc)

    # `val_labels` holds the model's predictions, `val_true_labels` the references
    val_labels, val_true_labels, val_loss = epoch_validate(val_dataloader, device)
    val_acc = accuracy_score(val_true_labels, val_labels)
    print('epoch: %d validation accuracy %.2f' % (epoch, val_acc))
    val_loss_list.append(val_loss)
    val_accuracy_list.append(val_acc)



  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 0.00 train accuracy 0.53


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 0.00 validation accuracy 0.60


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 1.00 train accuracy 0.63


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 1.00 validation accuracy 0.68


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 2.00 train accuracy 0.67


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 2.00 validation accuracy 0.69


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 3.00 train accuracy 0.69


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 3.00 validation accuracy 0.73


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 4.00 train accuracy 0.71


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 4.00 validation accuracy 0.73


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 5.00 train accuracy 0.73


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 5.00 validation accuracy 0.69


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 6.00 train accuracy 0.75


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 6.00 validation accuracy 0.74


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 7.00 train accuracy 0.76


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 7.00 validation accuracy 0.74


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 8.00 train accuracy 0.77


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 8.00 validation accuracy 0.74


  0%|          | 0/517 [00:00<?, ?it/s]

epoch: 9.00 train accuracy 0.78


  0%|          | 0/130 [00:00<?, ?it/s]

epoch: 9.00 validation accuracy 0.74


In [None]:
# plotting train and validation loss (one point per epoch)
loss_epochs = list(range(len(loss)))
fig_loss = go.Figure()
fig_loss.add_trace(go.Scatter(x = loss_epochs, y = loss, mode = 'lines', name = 'train loss'))
fig_loss.add_trace(go.Scatter(x = loss_epochs, y = val_loss_list, mode = 'lines', name = 'validation loss'))
# title and axis labels so the figure can be read on its own
fig_loss.update_layout(title = 'Training vs. validation loss', xaxis_title = 'epoch', yaxis_title = 'loss')
# displayed explicitly, the same way as the accuracy figure
fig_loss.show()

In [None]:
# plotting train and validation accuracy (one point per epoch)
# `accuracy` must be the per-epoch list filled by the training loop
accuracy_epochs = list(range(len(accuracy)))
fig_acc = go.Figure()
fig_acc.add_trace(go.Scatter(x = accuracy_epochs, y = accuracy, mode = 'lines', name = 'train accuracy'))
fig_acc.add_trace(go.Scatter(x = accuracy_epochs, y = val_accuracy_list, mode = 'lines', name = 'validation accuracy'))
# title and axis labels so the figure can be read on its own
fig_acc.update_layout(title = 'Training vs. validation accuracy', xaxis_title = 'epoch', yaxis_title = 'accuracy')
fig_acc.show()

TypeError: ignored

In [None]:
# Read the held-out test dataset from the mounted Drive folder.
# The preview below shows the same `headline` / `label` columns as the training file.
test = pd.read_csv('/content/drive/MyDrive/SEM6/NLP/Project/data/clean_test.csv')
test.head()

Unnamed: 0,headline,label
0,ಬಜಾರ್ ಅಂಗಳದಲ್ಲಿ ಸಿಕ್ಕ ಧನ್ವೀರ್​,0
1,ಬಿಡುಗಡೆಯಾಯಿತು ಕಿಚ್ಚ ಸುದೀಪ್​ ಕಂಠದಾನ ಮಾಡಿರುವ ಸಿನ...,0
2,"ಸ್ಮಿತ್, ವಾರ್ನರ್​, ಬ್ಯಾಂಕ್ರಾಫ್ಟ್​ಗೆ ಶಿಕ್ಷೆ ಕಡಿತ...",1
3,ಬೆಂಗಳೂರು ಅಂತರ ರಾಷ್ಟ್ರೀಯ ಚಲನಚಿತ್ರೋತ್ಸವದಲ್ಲಿ ಏಷ್...,0
4,ಜಿಯೋ ಭರ್ಜರಿ ಗಿಫ್ಟ್​: ಇಂದಿನಿಂದಲೇ 5 ವರ್ಷಗಳ ಕಾಲ ಉ...,2


In [None]:
# give the test frame the same 'text' column name that the dataset wrapper reads
test = test.rename(columns = {'headline': 'text'})
test.head()

Unnamed: 0,text,label
0,ಬಜಾರ್ ಅಂಗಳದಲ್ಲಿ ಸಿಕ್ಕ ಧನ್ವೀರ್​,0
1,ಬಿಡುಗಡೆಯಾಯಿತು ಕಿಚ್ಚ ಸುದೀಪ್​ ಕಂಠದಾನ ಮಾಡಿರುವ ಸಿನ...,0
2,"ಸ್ಮಿತ್, ವಾರ್ನರ್​, ಬ್ಯಾಂಕ್ರಾಫ್ಟ್​ಗೆ ಶಿಕ್ಷೆ ಕಡಿತ...",1
3,ಬೆಂಗಳೂರು ಅಂತರ ರಾಷ್ಟ್ರೀಯ ಚಲನಚಿತ್ರೋತ್ಸವದಲ್ಲಿ ಏಷ್...,0
4,ಜಿಯೋ ಭರ್ಜರಿ ಗಿಫ್ಟ್​: ಇಂದಿನಿಂದಲೇ 5 ವರ್ಷಗಳ ಕಾಲ ಉ...,2


In [None]:
# Wrap the test frame for inference: with `train = False` every item carries the
# placeholder label 0, and `shuffle = False` keeps the batches in row order.
test_dataset = DatasetCreator(test, train = False)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False, collate_fn = gpt2_collator)
# predicted class id for each test sample, in dataloader order
y_pred = epoch_predict(test_dataloader, device)

  0%|          | 0/162 [00:00<?, ?it/s]

In [None]:
# reference labels and predictions as NumPy arrays for the metric cells below
y_test = test['label'].values
y_pred = np.array(y_pred)

In [None]:
# evaluating the predictions
# Stored under its own name: `accuracy` is the per-epoch training-accuracy list that
# the accuracy plot reads, and rebinding it to a single float here makes that plotting
# cell fail (`len()` of a float) when it is run again afterwards.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (test_accuracy * 100.0))

Accuracy: 73.16%


In [None]:
# per-class precision / recall / F1 and support on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.89      0.80       661
           1       0.77      0.67      0.72       470
           2       0.57      0.24      0.34       162

    accuracy                           0.73      1293
   macro avg       0.69      0.60      0.62      1293
weighted avg       0.72      0.73      0.71      1293



In [None]:
# confusion matrix of the test predictions (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)

[[591  51  19]
 [143 316  11]
 [ 78  45  39]]
