# Installing Library

In [1]:
!pip install transformers datasets evaluate torch torchtext sentencepiece pandas tqdm

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m21.3 MB/s[0m e

# Loading Data & Library

In [2]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [3]:
# If you have an NVIDIA GPU attached, use 'cuda'
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')

device

device(type='cuda')

In [5]:
ds = load_dataset("Amod/mental_health_counseling_conversations", split='train')
ds

combined_dataset.json:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

Dataset({
    features: ['Context', 'Response'],
    num_rows: 3512
})

# Prepare Dataseet

In [9]:
updated_data = [{'Context': item['Context'], 'Response': item['Response']} for item in ds]
data = pd.DataFrame(updated_data)
data.head()

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [10]:
data['Context'][0]

"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?"

In [12]:
data['Context'] = data['Context'].apply(lambda x: ', '.join(x.split(', ')))
data['Response'] = data['Response'].apply(lambda x: ', '.join(x.split(', ')))

data['Response'][0]

"If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this is somehow terrible.Bad feelings are part of living. \xa0They are the motivation to remove ourselves from situations and relationships which do us more harm than good.Bad feelings do feel terrible. \xa0 Your feeling of worthlessness may be good in the sense of motivating you to find out that you are much better than your feelings today."

# Loading Tokenizer & model

In [13]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# Prepare Dataset For Model

In [15]:
batch_size = 8
max_length = 128

In [16]:
data.describe()

Unnamed: 0,Context,Response
count,3512,3512.0
unique,995,2480.0
top,I have so many issues to address. I have a his...,
freq,94,4.0


In [17]:
# DataSet Preperation

class LanguageDataset(Dataset):
  def __init__(self, data, tokenizer):
    self.data = data.to_dict(orient='records')
    self.labels = data.columns
    self.tokenizer = tokenizer
    x = self.fittest_max_length(data)
    self.max_length = x


  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    x = self.data[idx][self.labels[0]]
    y = self.data[idx][self.labels[1]]

    text = f'{x} | {y}'
    tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
    return tokens

  def fittest_max_length(self, data):
    max_length = max(len(max(data[self.labels[0]], key=len)), len(max(data[self.labels[1]], key=len)))
    x = 2
    while x < max_length: x = x * 2
    return x


In [18]:
datasample = LanguageDataset(data, tokenizer)
datasample

<__main__.LanguageDataset at 0x79246ff32d10>

In [19]:
train_size = int(0.8 * len(datasample))
test_size = len(datasample) - train_size


train_data, val_data = random_split(datasample, [train_size, test_size])

In [20]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)

# Build Arguments

In [21]:
num_epochs = 8
batch_size = 8
model_name = 'distilgpt2'
gpu = 0

In [22]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

tokenizer.pad_token = tokenizer.eos_token

In [23]:
result = pd.DataFrame(columns=['epochs', 'transformer', 'batch_size', 'gpu', 'training_loss', 'validation_loss', 'epoch_duration_second'])

# Training

In [24]:
import time

In [25]:
for epoch in range(num_epochs):
  start_time = time.time()
  model.train()
  epoch_training_loss = 0
  train_iterator = tqdm(train_loader, desc=f'Training epoch {epoch+1}/{num_epochs} Batch size = {batch_size} Transformer = {model_name}')

  for batch in train_iterator:
    optimizer.zero_grad()
    inputs = batch['input_ids'].squeeze(1).to(device)
    targets = inputs.clone()
    outputs = model(input_ids=inputs, labels=targets)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    train_iterator.set_postfix({'Training loss': loss.item()})
    epoch_training_loss += loss.item()

  avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

  # Validation
  model.eval()
  model_validation_loss = 0.0
  total_loss = 0
  validation_iterators = tqdm(val_loader, desc=f'Validation epoch {epoch+1}/{num_epochs} Batch size = {batch_size} Transformer = {model_name}')
  with torch.no_grad():
    for batch in validation_iterators:
      inputs = batch['input_ids'].squeeze(1).to(device)
      targets = inputs.clone()
      outputs = model(input_ids=inputs, labels=targets)
      loss = outputs.loss
      total_loss += loss
      validation_iterators.set_postfix({'Validation loss': loss.item()})
      model_validation_loss += loss.item()

  avg_epoch_validation_loss = model_validation_loss / len(validation_iterators)

  end_time = time.time()
  epoch_duration_sec = end_time - start_time

  new_row = {
      'transformer': model_name,
      'batch_size': batch_size,
      'gpu': gpu,
      'epoch': epoch + 1,
      'training_loss': avg_epoch_training_loss,
      'validation_loss': avg_epoch_validation_loss,
      'epoch_duration_second': epoch_duration_sec
  }

  result.loc[len(result)] = new_row
  print(f'Epochs: {epoch + 1}, Validation Loss: {total_loss / len(val_loader)}')

Training epoch 1/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:09<00:00,  5.07it/s, Training loss=2.97]
Validation epoch 1/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:06<00:00, 14.14it/s, Validation loss=2.37]


Epochs: 1, Validation Loss: 2.3133842945098877


Training epoch 2/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:10<00:00,  5.02it/s, Training loss=2.39]
Validation epoch 2/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:05<00:00, 14.84it/s, Validation loss=2.09]


Epochs: 2, Validation Loss: 1.8921526670455933


Training epoch 3/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:11<00:00,  4.92it/s, Training loss=1.77]
Validation epoch 3/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:06<00:00, 14.55it/s, Validation loss=1.91]


Epochs: 3, Validation Loss: 1.6445521116256714


Training epoch 4/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:12<00:00,  4.85it/s, Training loss=0.361]
Validation epoch 4/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:05<00:00, 15.09it/s, Validation loss=1.78]


Epochs: 4, Validation Loss: 1.4636412858963013


Training epoch 5/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:12<00:00,  4.86it/s, Training loss=1.56]
Validation epoch 5/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:05<00:00, 15.30it/s, Validation loss=1.77]


Epochs: 5, Validation Loss: 1.3947638273239136


Training epoch 6/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:12<00:00,  4.84it/s, Training loss=0.302]
Validation epoch 6/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:05<00:00, 14.72it/s, Validation loss=1.79]


Epochs: 6, Validation Loss: 1.3436932563781738


Training epoch 7/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:12<00:00,  4.85it/s, Training loss=0.558]
Validation epoch 7/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:05<00:00, 14.74it/s, Validation loss=1.79]


Epochs: 7, Validation Loss: 1.3644880056381226


Training epoch 8/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 352/352 [01:12<00:00,  4.83it/s, Training loss=0.236]
Validation epoch 8/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 88/88 [00:05<00:00, 15.26it/s, Validation loss=1.86]

Epochs: 8, Validation Loss: 1.3790165185928345





# Validation And Predicted Data

In [27]:
input_str = "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here. I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it. How can I change my feeling of being worthless to everyone?"
input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)

output = model.generate(
    input_ids,
    max_length=512,
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here. I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it. How can I change my feeling of being worthless to everyone? | Hello, I am relieved that you are feeling this way. It sounds like you are having some pretty severe depression symptoms; however, they could be the result in medication changes, they could be the result of low self esteem, or because this is not what you are experiencing at work on. In looking for support, try and repairing relationships between you. You are already starting towards your interest and wanting other people who love you as well as you and feelingings. At the touch base in touch base by letting them know what others feel needed and feeling their connection to whom you need help. In touch base in a little bit easier and feel your need help - see them fir

In [30]:
decoded_output.split('|')[-1]

' Hello, I am relieved that you are feeling this way. It sounds like you are having some pretty severe depression symptoms; however, they could be the result in medication changes, they could be the result of low self esteem, or because this is not what you are experiencing at work on. In looking for support, try and repairing relationships between you. You are already starting towards your interest and wanting other people who love you as well as you and feelingings. At the touch base in touch base by letting them know what others feel needed and feeling their connection to whom you need help. In touch base in a little bit easier and feel your need help - see them first step. Tried to give you. Tried to talk to remember that there and feel and give them as well. Tried too that you are there so that you are there!'

In [None]:
torch.save(model, 'SmallMedLM.pt')