<a href="https://colab.research.google.com/github/RohanAkkineni/BERT-for-mental-health/blob/main/BERT_with_3000_rows.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
import json

In [None]:
small = pd.read_csv("/content/Bert Data 3000.csv")

In [None]:
small

Unnamed: 0,Input,Output
0,Hi,Hello there. Tell me how are you feeling today?
1,Hey,Hello there. Tell me how are you feeling today?
2,Is anyone there?,Hello there. Tell me how are you feeling today?
3,Hi there,Hello there. Tell me how are you feeling today?
4,Hello,Hello there. Tell me how are you feeling today?
...,...,...
3046,How do I know if I'm unwell?,"If your beliefs , thoughts , feelings or behav..."
3047,How can I maintain social connections? What if...,"A lot of people are alone right now, but we do..."
3048,What's the difference between anxiety and stress?,Stress and anxiety are often used interchangea...
3049,What's the difference between sadness and depr...,"Sadness is a normal reaction to a loss, disapp..."


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(small['Input'], small['Output'], test_size=0.2, random_state=42)

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
X_train_list = [str(item) for item in X_train]
train_encodings = tokenizer(X_train_list, truncation=True, padding=True, max_length=8)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=8)

train_response_encodings = tokenizer(list(y_train), truncation=True, padding=True, max_length=8)
test_response_encodings = tokenizer(list(y_test), truncation=True, padding=True, max_length=8)

In [None]:
print(len(test_encodings[1]), len(train_encodings[1]), len(test_response_encodings[1]), len(train_response_encodings[1]))

8 8 8 8


In [None]:
import torch
from torch.utils.data import Dataset

In [None]:
# class QADataset(Dataset):
#   def _init__(self, encodings, response_encodings):
#     self.encodings = encodings
#     self.response_encodings = response_encodings

#   def __len__(self):
#     return len(self.encodings['input_ids'])

#   def __getitem__(self, idx):
#     item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#     item['response'] = torch.tensor(self.response_encodings['input_ids'][idx])
#     return item

# MODIFIED CLASS -
class QADataset(Dataset):
    def __init__(self, encodings, response_encodings):
        self.encodings = encodings
        self.response_encodings = response_encodings

    def __getitem__(self, idx):
        # Ensure all token IDs in input sequences are within the vocabulary range
        for key in self.encodings:
            self.encodings[key][idx] = [token_id for token_id in self.encodings[key][idx] if token_id < tokenizer.vocab_size]

        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        # Ensure all label IDs are within the vocabulary range
        labels = self.response_encodings['input_ids'][idx]
        labels = [label_id for label_id in labels if label_id < tokenizer.vocab_size]  # Filter out-of-vocabulary tokens
        item['labels'] = torch.tensor(labels)
        return item

    def __len__(self):
        # Return the length of one of the encoding attributes, e.g., 'input_ids'
        return len(self.encodings.input_ids)

train_dataset = QADataset(train_encodings, train_response_encodings)
test_dataset = QADataset(test_encodings, test_response_encodings)

In [None]:
# prompt: give me the class QADataset code for the above project, for finetuning BERT on the encodings as given in the codes

class QADataset(Dataset):
    def __init__(self, encodings, response_encodings):
        self.encodings = encodings
        self.response_encodings = response_encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.response_encodings['input_ids'][idx][0])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = QADataset(train_encodings, train_response_encodings)
test_dataset = QADataset(test_encodings, test_response_encodings)

In [None]:
from transformers import DistilBertForSequenceClassification

# Load the DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=tokenizer.vocab_size  # Matching vocab size for seq2seq tasks
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [None]:
# data_collator = DataCollator(
#     tokenizer=tokenizer,
#     padding=True,
#     max_length=8,  # Adjust based on your desired sequence length
#     return_tensors="pt"
# )

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0003,0.000142
2,0.0001,6.1e-05
3,0.0001,4.8e-05


TrainOutput(global_step=915, training_loss=0.3535202177289792, metrics={'train_runtime': 1907.679, 'train_samples_per_second': 3.837, 'train_steps_per_second': 0.48, 'total_flos': 23397335758080.0, 'train_loss': 0.3535202177289792, 'epoch': 3.0})

In [113]:
results = trainer.evaluate()

In [117]:
print("The results of the model are")
for i in results:
  print(i,"-", results[i])

The results of the model are
eval_loss - 4.771866952069104e-05
eval_runtime - 27.3523
eval_samples_per_second - 22.338
eval_steps_per_second - 2.815
epoch - 3.0
