# Upload training, validation, and test sets to your drive then run this to mount your drive and have access to files from the notebook

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install latest version of Huggingface and import necessary packages

In [None]:
!pip install -q git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [None]:
!pip install pandas openpyxl



In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics import accuracy_score

In [None]:
path = '/content/drive/My Drive/'

In [None]:
test_results = pd.ExcelFile(path + 'TestResults.xlsx')
test_results.sheet_names

['TestResultsQ3_part2',
 'TestResultsQ3_part1',
 'TestResultsQ2',
 'TestResultsQ1']

In [None]:
testq1 = pd.read_excel(test_results, 'TestResultsQ1')
testq2 = pd.read_excel(test_results, 'TestResultsQ2')
testq3_part1 = pd.read_excel(test_results, 'TestResultsQ3_part1')
testq3_part2 = pd.read_excel(test_results, 'TestResultsQ3_part2')


In [None]:
# Load training, test, and validation sets
def load_data(path, dataset):
  file_names = {
        'train': 'training.csv',
        'validation': 'validation.csv',
        'test': 'test.csv'
    }

  df = pd.read_csv(path + file_names[dataset])

  return df


In [None]:
training_set = load_data(path, 'train')
validation_set = load_data(path, 'validation')
test_set = load_data(path, 'test')

In [None]:
def get_accuracy(df1, df2, column_name1, column_name2):
  if len(df1) != len(df2):
        raise ValueError("DataFrames do not have the same length.")

  df1_cleaned = df1[column_name1].apply(lambda x: x.strip().lower() if isinstance(x, str) else x)
  df2_cleaned = df2[column_name2].apply(lambda x: x.strip().lower() if isinstance(x, str) else x)

  # Calculate matches
  matches = (df1_cleaned == df2_cleaned).sum()
  print("Matches:", matches)
  total = len(df1)
  print("Total:", total)

  # Calculate accuracy
  accuracy = matches / total
  return accuracy

# Question #1:Zero shot setting

In [None]:
testq1.head() # test results from zero shot setting

Unnamed: 0,Abstract,Domain
0,XML is a pervasive technology for representing...,CS
1,The integration of renewable energy sources in...,ECE
2,The original free-swinging piston engine with ...,MAE
3,Barriers to access and long-term complications...,Medical
4,This paper is to present a technological solut...,ECE


In [None]:
accuracy = get_accuracy(test_set, testq1, 'Domain', 'Domain')
print(f"The accuracy of the comparison is: {accuracy}")

Matches: 11
Total: 20
The accuracy of the comparison is: 0.55


https://chat.openai.com/share/000a48bb-17cf-433b-bb5a-a3ffe9fb28fd

# Question #2:Few shot setting, with prompt engineering

In [None]:
testq2.head()

Unnamed: 0,Abstract,Domain
0,XML is a pervasive technology for representing...,CS
1,The integration of renewable energy sources in...,ECE
2,The original free-swinging piston engine with ...,MAE
3,Barriers to access and long-term complications...,Medical
4,This paper is to present a technological solut...,CS


In [None]:
accuracy = get_accuracy(test_set, testq2, 'Domain', 'Domain')
print(f"The accuracy of the comparison is: {accuracy}")

Matches: 11
Total: 20
The accuracy of the comparison is: 0.55


https://chat.openai.com/share/567af738-e310-4195-bcef-a3505cb8c7ff

# Question #3: Prepare the data for fine-tuning using OpenAI Playground

In [None]:
# Format the training and validation sets
# to make them usable for Open AI playground fine-tuning
# Format:
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
# Use gpt-3.5-turbo
# See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

def make_finetuning_data_gpt(dataset):
  def create_data_point(row):
        return {
            "messages": [
                {"role": "user", "content": row["Abstract"]},
                {"role": "assistant", "content": row["Domain"]}
            ]
        }

  return dataset.apply(create_data_point, axis=1).tolist()

In [None]:
import json

def to_jsonl(data, filename):
    with open(filename, 'w') as f:
        for item in data:
            json.dump(item, f)
            f.write('\n')

finetuning_data_train = make_finetuning_data_gpt(training_set)
finetuning_data_validation = make_finetuning_data_gpt(validation_set)

to_jsonl(finetuning_data_train, 'training_data.jsonl')
to_jsonl(finetuning_data_validation, 'validation_data.jsonl')

from google.colab import files

# files.download('training_data.jsonl')
# files.download('validation_data.jsonl')

### Question #3: Part1 zero shot setting

In [None]:
testq3_part1.head()

Unnamed: 0,Abstract,Domain
0,XML is a pervasive technology for representing...,CS
1,The integration of renewable energy sources in...,Civil
2,The original free-swinging piston engine with ...,ECE
3,Barriers to access and long-term complications...,Medical
4,This paper is to present a technological solut...,ECE


In [None]:
accuracy = get_accuracy(test_set, testq3_part1, 'Domain', 'Domain')
print(f"The accuracy of the comparison is: {accuracy}")

Matches: 12
Total: 20
The accuracy of the comparison is: 0.6


https://platform.openai.com/playground/p/in9739R6cg7em3dy16xW1OiV?model=ft:gpt-3.5-turbo-1106:personal:1009969697:9GKb3r4T&mode=chat

### Question #3: Part2 Few shot setting, with prompt engineering

In [None]:
testq3_part2.head()

Unnamed: 0,Abstract,Domain
0,XML is a pervasive technology for representing...,CS
1,The integration of renewable energy sources in...,ECE
2,The original free-swinging piston engine with ...,ECE
3,Barriers to access and long-term complications...,Medical
4,This paper is to present a technological solut...,ECE


In [None]:
accuracy = get_accuracy(test_set, testq3_part2, 'Domain', 'Domain')
print(f"The accuracy of the comparison is: {accuracy}")

Matches: 12
Total: 20
The accuracy of the comparison is: 0.6


https://platform.openai.com/playground/p/H6Gf22hpFQeJNCR9SrMgugQY?model=ft:gpt-3.5-turbo-1106:personal:1009969697:9GKb3r4T&mode=chat

# Question #4: Fine-tune a Distilbert model on the training set


*1*) Prepare the data and load the pre-trained model

In [None]:
training_set.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,Medical,Hepatitis C,Feasibility study; hepatitis C risk behaviour...,Aims: This study aimed to develop and test the...
1,CS,Distributed computing,Agent Architecture; Mobile Agent; Agent Cloni...,Mobile agent technology is becoming more popul...
2,ECE,Control engineering,educational software tool; multivariable cont...,This paper presents an educational software to...
3,Psychology,False memories,judgment; metamemory; accuracy; eyewitness me...,"Different researchers have reported positive, ..."
4,Psychology,Leadership,Implementation support; Co-occurring disorder...,Background: Incorporating evidence-based integ...


In [None]:
validation_set.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


In [None]:
test_set.head()

Unnamed: 0,Domain,area,keywords,Abstract
0,CS,Data structures,In-memory XPath processing; NESTOR; Set-based...,XML is a pervasive technology for representing...
1,Civil,Ambient Intelligence,Home energy management; persuasive interface;...,The integration of renewable energy sources in...
2,ECE,Electrical generator,non-standard electrical machine; generation o...,The original free-swinging piston engine with ...
3,Medical,Hepatitis C,complications; patient engagement; patient-ce...,Barriers to access and long-term complications...
4,ECE,Control engineering,force feedback haptic interface; virtual real...,This paper is to present a technological solut...


In [None]:
device = torch.device("cpu")

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
from sklearn.preprocessing import LabelEncoder

def make_finetuning_data_distilbert(tokenizer, df, max_length=512):
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df["Domain"])

    encodings = tokenizer(df['Abstract'].tolist(),
                          truncation=True,
                          padding="max_length",
                          max_length=max_length,
                          return_tensors="pt"
                          )
    return encodings, labels

# Prepare data for training, validation, and testing
training_encodings, training_labels = make_finetuning_data_distilbert(tokenizer, training_set)
validation_encodings, validation_labels = make_finetuning_data_distilbert(tokenizer, validation_set)
test_encodings, test_labels = make_finetuning_data_distilbert(tokenizer, test_set)


2) Prepare the dataloader

In [None]:
MAX_LENGTH = 512
BATCH_SIZE_TRAIN = 16
BATCH_SIZE_VAL = 16

class FTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

dataset_train = FTDataset(training_encodings, training_labels)
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
dataset_val = FTDataset(validation_encodings, validation_labels)
dataloader_val = DataLoader(dataset=dataset_val, batch_size=BATCH_SIZE_VAL)

3) Add (a) trainable layer(s) on top of DistilBert

In [None]:
class FTModel(nn.Module):
    def __init__(self, n_classes):
        super(FTModel, self).__init__()
        self.bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier = nn.Linear(self.bert_model.config.dim, n_classes)

    def forward(self, ids, mask):
        output = self.bert_model(input_ids=ids, attention_mask= mask)
        hidden_state = output.last_hidden_state[:, 0]
        logits = self.classifier(hidden_state)
        return logits


number_of_classes = 7
model = FTModel(n_classes=number_of_classes).to(device)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
LEARNING_RATE = 1e-5
# Use cross-entropy loss
loss_fn = nn.CrossEntropyLoss()

# Initialize Optimizer
optimizer= optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Freeze parameters of the pre-trained Distilbert model
for param in model.bert_model.parameters():
    param.requires_grad = False

4) Write the fine-tuning and evaluation functions

In [None]:
VAL_FREQUENCY = 2

PATH1 = 'path_to_save_model.pth'

def evaluate(model, dataloader_val, loss_fn):
    model.eval()
    total_loss, total_correct, total = 0, 0, 0
    with torch.no_grad():
        for batch in dataloader_val:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = total_correct / total
    return total_loss / len(dataloader_val), accuracy

def finetune(epochs, model, loss_fn, optimizer, dataloader_train, dataloader_val):
    min_vloss = 10000
    patience_index = 0
    patience = 5
    for epoch in range(epochs):
        model.train()
        loop = tqdm(enumerate(dataloader_train), total=len(dataloader_train))
        for _, batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_description(f"Epoch {epoch+1}")
            loop.set_postfix(loss=loss.item())

        if epoch % VAL_FREQUENCY == 0:
            val_loss, val_accuracy = evaluate(model, dataloader_val, loss_fn)
            print(f'Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}')
            if val_loss <= min_vloss:
                min_vloss = val_loss
                torch.save(model.state_dict(), PATH1)
                patience_index = 0
            else:
                patience_index += 1
                if patience_index == patience:
                    print("Early stopping due to no improvement in validation loss.")
                    break
    return model

# Assuming dataloader_train and dataloader_val are defined elsewhere
model = finetune(10, model, loss_fn, optimizer, dataloader_train, dataloader_val)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 1: 100%|██████████| 4/4 [00:13<00:00,  3.45s/it, loss=2.14]


Validation Loss: 1.8380, Accuracy: 0.3000


Epoch 2: 100%|██████████| 4/4 [00:13<00:00,  3.38s/it, loss=2.06]
Epoch 3: 100%|██████████| 4/4 [00:13<00:00,  3.32s/it, loss=2]


Validation Loss: 1.8327, Accuracy: 0.3000


Epoch 4: 100%|██████████| 4/4 [00:13<00:00,  3.34s/it, loss=2.05]
Epoch 5: 100%|██████████| 4/4 [00:13<00:00,  3.33s/it, loss=1.59]


Validation Loss: 1.8274, Accuracy: 0.3000


Epoch 6: 100%|██████████| 4/4 [00:13<00:00,  3.38s/it, loss=1.83]
Epoch 7: 100%|██████████| 4/4 [00:13<00:00,  3.37s/it, loss=1.8]


Validation Loss: 1.8203, Accuracy: 0.3400


Epoch 8: 100%|██████████| 4/4 [00:13<00:00,  3.29s/it, loss=1.69]
Epoch 9: 100%|██████████| 4/4 [00:13<00:00,  3.33s/it, loss=2.25]


Validation Loss: 1.8131, Accuracy: 0.3400


Epoch 10: 100%|██████████| 4/4 [00:13<00:00,  3.28s/it, loss=1.73]


# Report accuracy on test set using sklearn.metrics.accuracy_score

In [None]:
def evaluate_on_test(model, dataloader_test, device):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader_test:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    return accuracy


model.to(device)
test_dataset = FTDataset(test_encodings, test_labels)
dataloader_test = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE_VAL, shuffle=False)

test_accuracy = evaluate_on_test(model, dataloader_test, device)
print(f"Test accuracy: {test_accuracy:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test accuracy: 0.2500


# Question 5: Fine-tune all parameters (requires GPU but used CPU)


In [None]:
MAX_LENGTH = 512
BATCH_SIZE_TRAIN = 16
BATCH_SIZE_VAL = 16

class FTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

dataset_train = FTDataset(training_encodings, training_labels)
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
dataset_val = FTDataset(validation_encodings, validation_labels)
dataloader_val = DataLoader(dataset=dataset_val, batch_size=BATCH_SIZE_VAL)

In [None]:
bert_model_fullft = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [None]:
class FTModelFull(nn.Module):
    def __init__(self, n_classes):
        super(FTModelFull, self).__init__()
        self.bert_model = bert_model_fullft
        self.classifier = nn.Linear(self.bert_model.config.dim, n_classes)

    def forward(self, ids, mask):
        output = self.bert_model(input_ids=ids, attention_mask=mask)
        hidden_state = output.last_hidden_state[:, 0]
        logits = self.classifier(hidden_state)
        return logits

number_of_classes = 7
model_full_ft = FTModelFull(n_classes=number_of_classes).to(device)

In [None]:
from transformers import AdamW

In [None]:
loss_fn = nn.CrossEntropyLoss()
EPSILON = 1e-8
# Choose parameters wisely!
learning_rate = 1e-5
adam_epsilon = EPSILON

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model_full_ft.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.2},
    {'params': [p for n, p in model_full_ft.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)



In [None]:
VAL_FREQUENCY = 2

PATH2 = 'path_to_save_full_model.pth'

def evaluate_full_model(model, dataloader_val, loss_fn):
    model.eval()
    total_loss, total_correct, total = 0, 0, 0
    with torch.no_grad():
        for batch in dataloader_val:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = total_correct / total
    return total_loss / len(dataloader_val), accuracy

def finetune_full_model(epochs, model, loss_fn, optimizer, dataloader_train, dataloader_val):
    min_vloss = 10000
    patience_index = 0
    patience = 5
    for epoch in range(epochs):
        model.train()
        loop = tqdm(enumerate(dataloader_train), total=len(dataloader_train))
        for _, batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_description(f"Epoch {epoch+1}")
            loop.set_postfix(loss=loss.item())

        if epoch % VAL_FREQUENCY == 0:
            val_loss, val_accuracy = evaluate_full_model(model, dataloader_val, loss_fn)
            print(f'Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}')
            if val_loss <= min_vloss:
                min_vloss = val_loss
                torch.save(model.state_dict(), PATH2)
                patience_index = 0
            else:
                patience_index += 1
                if patience_index == patience:
                    print("Early stopping due to no improvement in validation loss.")
                    break
    return model

# Assuming dataloader_train and dataloader_val are defined elsewhere
model_full_ft = finetune_full_model(10, model_full_ft, loss_fn, optimizer, dataloader_train, dataloader_val)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 1: 100%|██████████| 4/4 [00:33<00:00,  8.27s/it, loss=1.92]


Validation Loss: 1.7073, Accuracy: 0.3000


Epoch 2: 100%|██████████| 4/4 [00:28<00:00,  7.02s/it, loss=2.03]
Epoch 3: 100%|██████████| 4/4 [00:30<00:00,  7.71s/it, loss=1.93]


Validation Loss: 1.6024, Accuracy: 0.3800


Epoch 4: 100%|██████████| 4/4 [00:26<00:00,  6.71s/it, loss=1.9]
Epoch 5: 100%|██████████| 4/4 [00:27<00:00,  6.84s/it, loss=1.97]


Validation Loss: 1.5225, Accuracy: 0.4200


Epoch 6: 100%|██████████| 4/4 [00:29<00:00,  7.28s/it, loss=1.61]
Epoch 7: 100%|██████████| 4/4 [00:28<00:00,  7.03s/it, loss=1.03]


Validation Loss: 1.4199, Accuracy: 0.4000


Epoch 8: 100%|██████████| 4/4 [00:26<00:00,  6.61s/it, loss=1.33]
Epoch 9: 100%|██████████| 4/4 [00:26<00:00,  6.55s/it, loss=0.743]


Validation Loss: 1.3501, Accuracy: 0.4200


Epoch 10: 100%|██████████| 4/4 [00:26<00:00,  6.51s/it, loss=0.899]


# Report accuracy on test set using sklearn.metrics.accuracy_score

In [None]:
model_full_ft.to(device)
test_dataset = FTDataset(test_encodings, test_labels)
dataloader_test = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE_VAL, shuffle=False)

test_accuracy = evaluate_on_test(model_full_ft, dataloader_test, device)
print(f"Test accuracy: {test_accuracy:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test accuracy: 0.3500
