## BERT document classification

In [None]:
!pip install --quiet transformers

In [None]:
import torch
from tqdm.notebook import tqdm
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset
import numpy as np
from transformers import AutoModelForSequenceClassification

In [None]:
# CSV export endpoint of the Google Sheet that is later read with pd.read_csv.
_SHEET_ID = '1joT62UHk0vbQdVVRou4B6z-lYcpl8FKmRvnxdZ7PmUE'
sheet_url = f'https://docs.google.com/spreadsheets/d/{_SHEET_ID}/export?format=csv&gid=0'

In [None]:
# Read the extra evaluation titles and expose them under the column names the
# rest of the notebook uses (Title_of_Article / Category). The source columns
# ("Title", "Final Area") are kept here and removed in the next cell.
new_titles_raw = pd.read_excel('/content/drive/MyDrive/test_records_282_titles.xlsx')
sample_new = new_titles_raw.assign(
    Title_of_Article=new_titles_raw["Title"],
    Category=new_titles_raw["Final Area"],
)

In [None]:
# The original spreadsheet columns are redundant once copied to
# Title_of_Article / Category, so drop both in one call.
sample_new = sample_new.drop(columns=['Title', 'Final Area'])

In [None]:
# Preview the renamed evaluation frame (pandas abbreviates long frames when rendering).
sample_new

Unnamed: 0,Title_of_Article,Category
0,Good news for farmers! Centre hikes MSP for Kh...,Government & Politics
1,"MSP hike to boost farmers' income, improve liv...",Government & Politics
2,MSP hike for paddy shows PM Modi's commitment ...,Government & Politics
3,Modi government rubs salt on farmers’ wounds w...,Government & Politics
4,Farmers disappointed over ‘meagre hike’ in MSP...,Government & Politics
...,...,...
277,"AAP vs Centre Over Report On ""Exaggerated Oxyg...",Government & Politics
278,"AAP, BJP Clash Over Purported SC Panel Report ...",Government & Politics
279,Delhi govt ‘exaggerated’ oxygen needs by 4 tim...,Government & Politics
280,Must wait for final report on Delhi Oxygen cla...,Government & Politics


In [None]:
# Load the three labelled sources: two Excel files from Drive and the Google
# Sheet export referenced by `sheet_url`.
train_abs = pd.read_excel("/content/drive/MyDrive/final_data_intrigd.xlsx")
og_headlines = pd.read_csv(sheet_url)
my_data = pd.read_excel("/content/drive/MyDrive/news_json_vox_toi_misc.xlsx")

# Fold the "COVID-19" category into "Economy & Economics" in every source.
for _frame in (train_abs, og_headlines, my_data):
    _frame.Category = _frame.Category.replace("COVID-19", "Economy & Economics")

# dropna() returns a new frame; the result has to be assigned back, otherwise
# the call has no effect.
og_headlines = og_headlines.dropna(how="any")
my_data = my_data.dropna(how="any")

# Shuffle with a fixed seed so that the rows taken for training below (and the
# rows left over for testing) are the same on every run.
og_headlines = og_headlines.sample(frac=1, random_state=42)
my_data = my_data.sample(frac=1, random_state=42)

# The first 1670 shuffled headlines join the training pool.
train_2 = og_headlines.head(1670)
df = pd.concat([train_abs, train_2, my_data])

In [None]:
# Hold out every shuffled headline that was NOT taken for training.
# `train_2` is `og_headlines.head(1670)` (the first 1670 rows), so the test set
# must start at row 1670. `head(-1670)` would instead keep everything except
# the LAST 1670 rows, i.e. it would still contain the training rows.
test = og_headlines.iloc[1670:]
test = test.dropna(how="any")
test

Unnamed: 0,Title_of_Article,Category
2152,Good riddance to all archaic Indian labour laws,Economy & Economics
2869,How start-ups see the future of remote work,Technology & Startups
6741,Is the whatsapp new privacy policy infringing ...,Technology & Startups
3453,The Chinese “podcast” industry isn’t really po...,Technology & Startups
4082,All The Ways Uttar Pradesh Admin Is Underminin...,Government & Politics
...,...,...
452,Balancing The Budget,Economy & Economics
1118,Is Delhi A Revenue Surplus State That Can Deli...,Government & Politics
2732,Changes in Labour Laws Will Turn the Clock Bac...,Economy & Economics
1441,Awaiting lift-off: What keeps India's share of...,Global Trends


In [None]:
# Shuffle the extra evaluation titles; the fixed seed keeps the row order (and
# therefore the displayed preview) identical between runs.
sample_new = sample_new.sample(frac=1, random_state=42)
sample_new

Unnamed: 0,Title_of_Article,Category
79,Adani stocks nosedive on report of FPI account...,Big Business
131,Opinion: Biden’s exit from Afghanistan has bee...,Global Trends
58,Electric 2-wheeler makers hail FAME II subsidy...,Economy & Economics
73,Delhi High Court verdict underlines the politi...,Government & Politics
34,Time for BJP to make calculative moves after W...,Government & Politics
...,...,...
77,Police blurred right to protest & terrorist ac...,Government & Politics
47,Israel's new PM Naftali Bennett promises to un...,Global Trends
179,Ceasefire broken as Israel carries out air str...,Global Trends
96,"EU Carbon Border Levy Will Not Be a Quick Fix,...",Economy & Economics


In [None]:
# Shuffle the combined training pool. The train/validation split below is drawn
# from `df.index.values`, whose order depends on this shuffle, so a fixed seed
# is needed for the split to be reproducible.
df = df.sample(frac=1, random_state=42)
df

Unnamed: 0.1,Unnamed: 0,Category,Title_of_Article
19090,84047.0,Global Trends,Greece -- Give It a Chance
36463,869914.0,Technology & Startups,Mobiles to get expensive post Diwali
77100,2409.0,Technology & Startups,
12798,12798.0,Government & Politics,"Saifullah, the commander of the terrorist orga..."
45740,20950.0,Government & Politics,"His visit sparked protests, leading to the po..."
...,...,...,...
84150,9459.0,Finance & Banking,The suicide cases of borrowers unable to pay l...
13825,37259.0,Global Trends,It's The Arab Spring For Dictators
45177,20387.0,Global Trends,The Muslim community in Sri Lanka has been ang...
14768,45504.0,Global Trends,Russian Ultra-Nationalist Says Vote Trump Or R...


In [None]:
# Remove the leftover "Unnamed: 0" column that came in with the source files.
df = df.drop(columns=["Unnamed: 0"])

In [None]:
# Total number of missing cells across all columns of the training pool.
missing_cells = df.isna().sum().sum()
missing_cells

27629

In [None]:
# Keep only rows in which every column has a value.
df = df.dropna(axis='index', how='any')

In [None]:
# Class balance of the training pool.
category_counts = df['Category'].value_counts()
category_counts

Economy & Economics      28558
Government & Politics    27491
Global Trends            26229
Technology & Startups    19090
Finance & Banking        17572
Big Business             16427
Name: Category, dtype: int64

In [None]:
# Confirm that none of the three frames still contains missing values
# (prints one count per frame: df, test, sample_new).
for frame in (df, test, sample_new):
    print(frame.isnull().sum().sum())

0
0
0


In [None]:
# Category name -> integer class id used as the model's target label.
label_dict = dict([
    ('Big Business', 5),
    ('Economy & Economics', 2),
    ('Finance & Banking', 3),
    ('Global Trends', 4),
    ('Government & Politics', 0),
    ('Technology & Startups', 1),
])
label_dict

{'Big Business': 5,
 'Economy & Economics': 2,
 'Finance & Banking': 3,
 'Global Trends': 4,
 'Government & Politics': 0,
 'Technology & Startups': 1}

In [None]:
# Translate category names to integer class ids.
# `map` is used instead of `replace`: replacing strings by ints relies on
# pandas' deprecated implicit downcasting of object columns. The explicit
# int64 cast fails loudly if a category is missing from `label_dict`.
sample_new = sample_new.assign(
    label=sample_new['Category'].map(label_dict).astype('int64')
)

In [None]:
# Translate category names to integer class ids.
# `assign` builds a new frame, which avoids the SettingWithCopyWarning that
# in-place column assignment on the filtered `df` produced. `map` replaces
# `replace` because replacing strings by ints relies on pandas' deprecated
# implicit downcasting; the int64 cast fails loudly on unknown categories.
df = df.assign(label=df['Category'].map(label_dict).astype('int64'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Translate category names to integer class ids.
# `map` is used instead of `replace`: replacing strings by ints relies on
# pandas' deprecated implicit downcasting of object columns. The explicit
# int64 cast fails loudly if a category is missing from `label_dict`.
test = test.assign(label=test['Category'].map(label_dict).astype('int64'))

In [None]:
from sklearn.model_selection import train_test_split

# `df` was built by concatenating several frames without resetting the index,
# so the same index label can occur on more than one row. The split below hands
# out index labels, and the rows are later tagged with `df.loc[labels, ...]`;
# with repeated labels one label tags several rows and 'val' overwrites
# 'train', which distorts the 85/15 split. Give every row a unique label first.
df = df.reset_index(drop=True)

# Stratified 85/15 split of row labels (X_*) and their class ids (y_*).
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.label.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=df.label.values)

In [None]:
# Row labels (X_*) and integer class ids (y_*) of the two evaluation frames.
X_test = test.index.values
y_test = test.label.values
X_new = sample_new.index.values
y_new = sample_new.label.values

In [None]:
# Start every row as 'not_set' ...
for frame in (df, test, sample_new):
    frame['data_type'] = 'not_set'

# ... then tag rows with the split they belong to, addressed by index label.
for frame, row_labels, tag in (
    (df, X_train, 'train'),
    (df, X_val, 'val'),
    (test, X_test, 'test'),
    (sample_new, X_new, 'new'),
):
    frame.loc[row_labels, 'data_type'] = tag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [None]:
# Rows per (category, class id, split) in the training pool.
df_split_counts = df.groupby(['Category', 'label', 'data_type']).count()
df_split_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Title_of_Article
Category,label,data_type,Unnamed: 3_level_1
Big Business,5,train,12538
Big Business,5,val,3889
Economy & Economics,2,train,21850
Economy & Economics,2,val,6708
Finance & Banking,3,train,13455
Finance & Banking,3,val,4117
Global Trends,4,train,20377
Global Trends,4,val,5852
Government & Politics,0,train,21103
Government & Politics,0,val,6388


In [None]:
# Rows per (category, class id, split) in the held-out test frame.
test_split_counts = test.groupby(['Category', 'label', 'data_type']).count()
test_split_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Title_of_Article
Category,label,data_type,Unnamed: 3_level_1
Big Business,5,test,844
Economy & Economics,2,test,1941
Finance & Banking,3,test,759
Global Trends,4,test,1623
Government & Politics,0,test,1588
Technology & Startups,1,test,1241


In [None]:
# Rows per (category, class id, split) in the extra evaluation frame.
new_split_counts = sample_new.groupby(['Category', 'label', 'data_type']).count()
new_split_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Title_of_Article
Category,label,data_type,Unnamed: 3_level_1
Big Business,5,new,31
Economy & Economics,2,new,55
Finance & Banking,3,new,14
Global Trends,4,new,65
Government & Politics,0,new,101
Technology & Startups,1,new,16


In [None]:
# Tokenizer for the uncased BERT base checkpoint; input text is lower-cased.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, do_lower_case=True)

In [None]:
def _encode_split(frame, split):
    """Tokenize the titles of the rows of `frame` whose `data_type` equals `split`.

    Returns an `(encoding, labels)` pair: `encoding` is the tokenizer output
    (holding `input_ids` and `attention_mask` as PyTorch tensors) and `labels`
    is a tensor built from the matching rows' `label` column.
    """
    rows = frame[frame.data_type == split]
    encoding = tokenizer.batch_encode_plus(
        # convert to list when using roberta
        list(rows.Title_of_Article.values),
        add_special_tokens=True,
        return_attention_mask=True,
        # `pad_to_max_length` is deprecated; `padding='max_length'` is its
        # replacement. Truncation is requested explicitly so every sequence is
        # exactly `max_length` tokens and can be stacked into one tensor.
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )
    return encoding, torch.tensor(rows.label.values)


encoded_data_train, labels_train = _encode_split(df, 'train')
encoded_data_val, labels_val = _encode_split(df, 'val')
encoded_data_test, labels_test = _encode_split(test, 'test')
encoded_data_sample_new, labels_sample_new = _encode_split(sample_new, 'new')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

input_ids_sample_new = encoded_data_sample_new['input_ids']
attention_masks_sample_new = encoded_data_sample_new['attention_mask']



In [None]:
# Bundle (input ids, attention mask, labels) per split so a DataLoader can
# yield them together as one batch.
dataset_train, dataset_val, dataset_test, dataset_new = (
    TensorDataset(ids, masks, labels)
    for ids, masks, labels in (
        (input_ids_train, attention_masks_train, labels_train),
        (input_ids_val, attention_masks_val, labels_val),
        (input_ids_test, attention_masks_test, labels_test),
        (input_ids_sample_new, attention_masks_sample_new, labels_sample_new),
    )
)

In [None]:
# Number of examples in the train / val / test / new datasets, in that order.
tuple(len(ds) for ds in (dataset_train, dataset_val, dataset_test, dataset_new))

(103947, 31420, 7996, 282)

In [None]:
# The training and evaluation cells move every batch to `device`, but no cell
# in the notebook defines it. Define it here (GPU when available, CPU
# otherwise) and place the model on the same device as the batches.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BERT base with a classification head sized to the number of categories.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model = model.to(device)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [None]:
# Release cached GPU memory before the data loaders and optimizer are created.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32


def _sequential_loader(dataset):
    """Build a DataLoader that walks `dataset` in stored order, `batch_size` rows at a time."""
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)


# Training batches are drawn in random order ...
dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

# ... while the three evaluation sets are read sequentially.
dataloader_validation = _sequential_loader(dataset_val)
dataloader_test = _sequential_loader(dataset_test)
dataloader_new = _sequential_loader(dataset_new)

In [None]:
from transformers import get_linear_schedule_with_warmup

# `transformers.AdamW` is deprecated in favour of PyTorch's own implementation.
# torch.optim.AdamW defaults to weight_decay=0.01, whereas the transformers
# class defaulted to 0.0, so the old value is passed explicitly to keep the
# optimisation unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

In [None]:
epochs = 5

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps. No warm-up steps are used.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max class predictions.

    `preds` holds one row of per-class scores per example; the predicted class
    is the column with the highest score. `labels` holds the true class ids.
    """
    predicted_ids = np.argmax(preds, axis=1).reshape(-1)
    true_ids = labels.reshape(-1)
    return f1_score(true_ids, predicted_ids, average='weighted')

def accuracy_per_class(preds, labels, label_map=None):
    """Print, for every class present in `labels`, how many examples were classified correctly.

    Parameters
    ----------
    preds : array of per-class scores, one row per example; the predicted
        class is the arg-max of each row.
    labels : array of true integer class ids.
    label_map : optional dict mapping class name -> class id. Defaults to the
        notebook-level `label_dict`, which keeps existing two-argument calls
        working unchanged.

    Returns
    -------
    None. Output is printed as "Class: <name>" / "Accuracy: <correct>/<total>".
    """
    if label_map is None:
        label_map = label_dict
    id_to_name = {class_id: name for name, class_id in label_map.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()

    for label in np.unique(labels_flat):
        in_class = labels_flat == label
        n_total = int(in_class.sum())
        n_correct = int((preds_flat[in_class] == label).sum())
        print(f'Class: {id_to_name[label]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
import random
import numpy as np

# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
def evaluate(dataloader_val):
    """Run the global `model` over `dataloader_val` without gradient tracking.

    Each batch is expected to provide (input ids, attention mask, labels) in
    that order and is moved to the global `device` before the forward pass.

    Returns a tuple `(mean_loss, logits, labels)`: the loss averaged over the
    number of batches, and the per-batch logits and true labels concatenated
    along the first axis as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        moved = [tensor.to(device) for tensor in batch]

        with torch.no_grad():
            outputs = model(input_ids=moved[0],
                            attention_mask=moved[1],
                            labels=moved[2])

        # With labels supplied, the first output is the loss and the second the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(moved[2].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

In [None]:
# Fine-tune for `epochs` passes over the training data. After every epoch the
# weights are saved and the model is scored on the validation set.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. `batch` is the tuple of three tensors
        # (ids, mask, labels), so dividing by len(batch) divided by 3, not by
        # the batch size, and made the displayed value inconsistent with the
        # epoch average reported below.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    # Checkpoint the weights reached at the end of this epoch.
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=12994.0, style=ProgressStyle(description_wi…


Epoch 1
Training loss: 0.8901096016239695
Validation loss: 0.7231429270599057
F1 Score (Weighted): 0.7377898324214162


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=12994.0, style=ProgressStyle(description_wi…


Epoch 2
Training loss: 0.6133781669105433
Validation loss: 0.725147235942324
F1 Score (Weighted): 0.7546638482558193


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=12994.0, style=ProgressStyle(description_wi…


Epoch 3
Training loss: 0.4574059156277075
Validation loss: 0.872818725504178
F1 Score (Weighted): 0.7597481934910034


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=12994.0, style=ProgressStyle(description_wi…

In [None]:
# Build a fresh classification model to receive previously saved weights.
# NOTE(review): this requests "bert-large-uncased", but the output recorded
# under this cell reports the bert-base-uncased checkpoint and 768-wide layers.
# Confirm which checkpoint matches the weights loaded in the next cell.
reload_options = dict(
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased", **reload_options)

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Load fine-tuned weights from Drive onto the active device.
# NOTE(review): this file name differs from the 'finetuned_BERT_epoch_{epoch}.model'
# files written by the training loop; presumably it was copied/renamed by hand.
checkpoint_path = '/content/drive/MyDrive/finetuned_BERT_small_epoch_5.model'
saved_weights = torch.load(checkpoint_path, map_location=torch.device(device))
model.load_state_dict(saved_weights)

<All keys matched successfully>

In [None]:
# Per-class hit counts on the validation split (the average loss is not needed here).
predictions, true_vals = evaluate(dataloader_validation)[1:]
accuracy_per_class(predictions, true_vals)

Class: Government & Politics
Accuracy: 2594/2849

Class: Technology & Startups
Accuracy: 1350/1768

Class: Economy & Economics
Accuracy: 2488/3112

Class: Finance & Banking
Accuracy: 940/1315

Class: Global Trends
Accuracy: 2214/2796

Class: Big Business
Accuracy: 901/1268



In [None]:
# Per-class hit counts on the held-out test headlines (the average loss is not needed here).
predictions, true_vals = evaluate(dataloader_test)[1:]
accuracy_per_class(predictions, true_vals)

Class: Government & Politics
Accuracy: 872/966

Class: Technology & Startups
Accuracy: 715/795

Class: Economy & Economics
Accuracy: 1070/1192

Class: Finance & Banking
Accuracy: 449/493

Class: Global Trends
Accuracy: 941/1002

Class: Big Business
Accuracy: 485/550



In [None]:
# Per-class hit counts on the extra evaluation titles (the average loss is not needed here).
predictions, true_vals = evaluate(dataloader_new)[1:]
accuracy_per_class(predictions, true_vals)

Class: Government & Politics
Accuracy: 74/101

Class: Technology & Startups
Accuracy: 6/16

Class: Economy & Economics
Accuracy: 21/55

Class: Finance & Banking
Accuracy: 12/14

Class: Global Trends
Accuracy: 60/65

Class: Big Business
Accuracy: 23/31

