# Model Finetuning

This notebook fine-tunes the [arazd/MIReAD](https://huggingface.co/arazd/MIReAD) model for journal classification on your own dataset.

## Setup

In [None]:
!pip install transformers==4.28.0
!pip install -U sentence-transformers
!pip install datasets
!pip install evaluate
!pip install accelerate -U

### Huggingface Login

Uncomment and run the cell below to log in to Hugging Face with your access token. This is needed if you plan to push the fine-tuned model to the Hub.

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

### Imports

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
import torch

# Seed NumPy's global random number generator with a fixed value so that
# NumPy-based random operations are repeatable between runs.
np.random.seed(23)

In [None]:
# Ask PyTorch to release any cached, unused GPU memory before choosing a device.
torch.cuda.empty_cache()

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import locale

# Make `locale.getpreferredencoding` always report UTF-8.
# The real function takes an optional `do_setlocale` argument, and many callers
# invoke it as `locale.getpreferredencoding(False)`; the replacement accepts (and
# ignores) that argument so those calls do not fail with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

## Data Preprocessing

Load your data from a CSV file here. Each row needs a title, an abstract and a journal, stored in columns named `Title`, `Abstract` and `Journal` respectively.

In [None]:
# Location of the input CSV; replace with the path to your own file.
csv_path = 'your_data.csv'

# Read the CSV into a DataFrame and print its column/dtype/non-null summary.
data = pd.read_csv(csv_path)
data.info()

In [None]:
# Display the first rows of the loaded table.
data.head()

In [None]:
# Keep only the three columns used by the rest of the notebook, in this order.
data = data.loc[:, ['Title', 'Abstract', 'Journal']]

In [None]:
# One plain dict per DataFrame row, keyed by the original column names.
datasets = data.to_dict("records")

# Re-key every record to the lowercase field names used downstream.
# Note: from here on `data` is a list of dicts, no longer a DataFrame.
data = [
  {
      'abstract': record['Abstract'],
      'title': record['Title'],
      'label': record['Journal'],
  }
  for record in datasets
]
data[0]

### Sorting the Data by Journal

If your dataset is large, you may find it helpful to run the model on a smaller subset of the journals.

In [None]:
# Group the records by journal.
# At this point `data` is a list of dicts with the keys 'abstract', 'title' and
# 'label' (it is no longer a DataFrame, so it has no `itertuples`). Each journal
# maps to a list of [title, abstract] pairs, in the order the rows appear in `data`.
journal_wise_data = {}
for record in data:
  journal_wise_data.setdefault(record['label'], []).append([record['title'], record['abstract']])

In [None]:
#@title Subset
fraction_to_use = 0.5 #@param {type:"slider", min:0, max:1, step:0.1}
journals = list(journal_wise_data.keys())
# Number of journals to keep: the requested fraction of all journals, but never
# fewer than one so the subset is not empty when the slider is at 0.
num_to_use = max(1, int(len(journals)*fraction_to_use))
# Slice exactly `num_to_use` journals (slicing to `num_to_use+1` selected one
# journal more than the chosen fraction).
subset = journals[:num_to_use]

In [None]:
# Split each selected journal's rows into train / validation / test.
# 30% of a journal's rows (rounded down) are held out from the end of its list;
# the last half of the held-out rows (rounded down) becomes the test set and the
# remainder the validation set. Everything before the held-out rows is training data.
all_train = []
all_val = []
all_test = []
for key in subset:
  rows = journal_wise_data[key]
  split = int(len(rows)*0.3)  # number of held-out rows (validation + test)
  # Use explicit non-negative boundaries. Negative slicing breaks when a count is 0
  # because `-0 == 0`: `rows[:-0]` is empty and `rows[-0:]` is the whole list, which
  # would leave small journals without training rows and copy training rows into test.
  val_start = len(rows) - split
  test_start = len(rows) - split//2
  records = [{'label':key,'title':row[0],'abstract':row[1]} for row in rows]
  all_train.extend(records[:val_start])
  all_val.extend(records[val_start:test_start])
  all_test.extend(records[test_start:])

In [None]:
from datasets import load_dataset, Dataset
# Turn each split (a list of record dicts) into a `Dataset` object.
all_train, all_val, all_test = (
  Dataset.from_list(records) for records in (all_train, all_val, all_test)
)

### Creating label2id and id2label

In [None]:
count = 0  # not read anywhere in this cell
# Two-way mapping between journal names and consecutive integer ids,
# numbered in the order the journals appear in `subset`.
label2id = {}
id2label = {}
for journal in subset:
  if journal in label2id:
    continue  # already numbered
  idx = len(id2label)
  label2id[journal] = idx
  id2label[idx] = journal

### Label Encoding

In [None]:
def encode_labels(data):
  """Replace a record's journal name with its integer id.

  Args:
    data: A mapping with a 'label' entry holding a key of `label2id`.

  Returns:
    The same mapping, modified in place so that 'label' holds the integer id.
  """
  journal_id = label2id[data['label']]
  data['label'] = journal_id
  return data

In [None]:
# Convert the journal names of every split to their integer ids.
all_train, all_val, all_test = (
  dataset.map(encode_labels) for dataset in (all_train, all_val, all_test)
)

## Model

### Initialization

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification

# Load the tokenizer that belongs to the pretrained MIReAD checkpoint.
m_tokenizer = AutoTokenizer.from_pretrained("arazd/MIReAD",
                                            max_length = 512,
                                            )

# Load MIReAD with a sequence-classification head for the journals being trained on.
# `label2id` / `id2label` are built from `subset`, so the number of labels is taken
# from them; using the full `journals` list would disagree with the label maps
# whenever only a fraction of the journals is selected.
m_model = BertForSequenceClassification.from_pretrained("arazd/MIReAD",
                                                        num_labels=len(label2id),
                                                        id2label=id2label,
                                                        label2id=label2id,
                                                        output_attentions=False,
                                                        ignore_mismatched_sizes=True,
                                                        output_hidden_states=False,
                                                        max_length = 512,
                                                        )

def m_preprocessor(batch):
  """Tokenize the 'text' field of a batch with the MIReAD tokenizer.

  Sequences are truncated to at most 512 tokens. No padding is applied here:
  the notebook trains with a `DataCollatorWithPadding`, which pads each training
  batch to its own longest sequence, so padding at tokenization time would only
  lengthen the stored sequences.

  Args:
    batch: A mapping whose 'text' entry holds the text(s) to tokenize.

  Returns:
    Whatever `m_tokenizer` returns for the given texts.
  """
  return m_tokenizer(batch['text'], truncation=True, max_length=512)

# Everything the training cell needs for MIReAD, grouped under one name.
m_bundle = dict(
    name="MIReAD",
    model=m_model,
    tokenizer=m_tokenizer,
    preprocessor=m_preprocessor,
)

### Join Title and Abstract
MIReAD takes only a single text as input, so we join the title to the abstract with the tokenizer's `sep_token`.

In [None]:
def join_abst_mtitle(data):
  """Prefix a record's abstract with its title and the tokenizer's separator token.

  Args:
    data: A mapping with string 'title' and 'abstract' entries.

  Returns:
    The same mapping, modified in place so that 'abstract' holds
    title + `m_tokenizer.sep_token` + abstract.
  """
  title = data['title']
  separator = m_tokenizer.sep_token
  data['abstract'] = title + separator + data['abstract']
  return data

def _title_abstract_to_text(dataset):
  """Merge title into abstract, drop 'title', and rename 'abstract' to 'text'."""
  merged = dataset.map(join_abst_mtitle)
  without_title = merged.remove_columns(['title'])
  return without_title.rename_column('abstract','text')

all_train = _title_abstract_to_text(all_train)
all_val = _title_abstract_to_text(all_val)
all_test = _title_abstract_to_text(all_test)

In [None]:
from transformers import DataCollatorWithPadding

def get_collator(tokenizer):
  """Build a `DataCollatorWithPadding` bound to the given tokenizer.

  Args:
    tokenizer: The tokenizer handed to the collator.

  Returns:
    The newly created `DataCollatorWithPadding` instance.
  """
  return DataCollatorWithPadding(tokenizer=tokenizer)

### Evaluation Metric

In [None]:
import evaluate
# Load the four evaluation metrics used by `compute_metrics`.
accuracy, f1_metric, recall_metric, precision_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "recall", "precision")
)

def compute_metrics(eval_pred):
    """Score a set of evaluation predictions.

    Args:
        eval_pred: A `(logits, labels)` pair. The predicted class of each row is
            the index of its largest logit along the last axis.

    Returns:
        dict: The merged outputs of the accuracy, F1, precision and recall
        metrics, in that order; the last three use macro averaging. The dict is
        also printed before it is returned.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Accuracy is computed without an averaging mode; the per-class metrics
    # are macro-averaged.
    results = dict(accuracy.compute(predictions=preds, references=labels))
    for metric in (f1_metric, precision_metric, recall_metric):
        results.update(metric.compute(predictions=preds, references=labels, average="macro"))
    print(results)
    return results

### Fine-tuning

In [None]:
from transformers import TrainingArguments,Trainer

# Hyperparameters for fine-tuning; one training run is launched per learning rate.
bundle = m_bundle
learning_rates = [3e-5,]
batch_size = 8
num_epochs = 6
weight_decay=0.01

# None of these depend on the learning rate, so prepare them once instead of
# re-tokenizing the datasets on every run.
tokenizer = bundle['tokenizer']
preprocessor = bundle['preprocessor']
tokenized_train = all_train.map(preprocessor,batched=True)
tokenized_val = all_val.map(preprocessor,batched=True)
collator = get_collator(tokenizer)

for learning_rate in learning_rates:
  # NOTE(review): every iteration reuses the same model object, so with more than
  # one learning rate the later runs would start from already fine-tuned weights —
  # reload the model per run if independent runs are intended.
  model = bundle['model']
  # Move the model to the device selected earlier in the notebook; an
  # unconditional `.cuda()` fails on machines without a GPU.
  model.to(device)
  # The learning rate is part of the name so each run writes to its own output directory.
  model_name = bundle['name'] + f'_{learning_rate}'

  # Evaluate and checkpoint once per epoch, and reload the best checkpoint when training ends.
  training_args = TrainingArguments(
    output_dir=model_name,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
  )

  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
  )
  print(f'|--------------------------Now Training: {model_name} with Learning Rate = {learning_rate}------------------------------|')
  trainer.train()
  # trainer.push_to_hub()                          # Uncomment to push model at every epoch
  print(f'|-----------------------------------------------------------------------------------------------------------------------|')
