# Set Up the Environment

In [None]:
!pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (fr

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd
import torch

# Load the Model and Tokenizer

In [None]:
# Checkpoint to fine-tune
model_name = "gpt2-medium"

# Tokenizer first; the end-of-sequence token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Causal language-model weights for the same checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Prepare Dataset

In [None]:
# Load the CSV (the "client" and "therapist" columns are used further down)
# and wrap it in a Hugging Face Dataset
data = pd.read_csv("cleaned_dataset.csv")
dataset = Dataset.from_pandas(data)

# Hold out 10% for validation. The fixed seed pins the shuffle, so the split —
# and therefore the reported validation loss — is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Preprocessing function
def preprocess_function(examples, max_length=128):
    """Turn a batch of client/therapist pairs into causal-LM training features.

    Args:
        examples: Batch mapping with parallel "client" and "therapist" lists.
        max_length: Fixed token length every example is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every padding position is replaced by -100.
    """
    # An explicit end-of-sequence marker closes each reply so the model has a
    # real "stop here" target once padding no longer contributes to the loss.
    texts = [
        f"Client: {prompt}\nTherapist: {response}{tokenizer.eos_token}"
        for prompt, response in zip(examples["client"], examples["therapist"])
    ]
    tokenized = tokenizer(texts, truncation=True, padding="max_length", max_length=max_length)
    # The pad token is the EOS token in this notebook, so padding cannot be
    # recognised by token id; use the attention mask instead. -100 is the label
    # value the Hugging Face causal-LM loss skips.
    tokenized["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize both splits in batches, dropping the raw text columns afterwards
raw_text_columns = ["client", "therapist"]
train_dataset, eval_dataset = (
    split.map(preprocess_function, batched=True, remove_columns=raw_text_columns)
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

# Configure Training Hyperparameters

In [None]:
# Hyperparameters for the fine-tuning run; evaluation and checkpointing both
# happen once per epoch.
training_args = TrainingArguments(
    output_dir="./gpt2-medium-therapeutic",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # Adjust for memory constraints
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    save_strategy="epoch",
    save_total_limit=2,  # Save only the latest checkpoints
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # notebook still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)



# Fine-Tune the Model with Trainer

In [None]:
# Wire the model, the training arguments and the tokenized splits together.
# `processing_class` is the replacement for the deprecated `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
)

# Start training
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.1577,1.576889
2,0.9086,1.637157
3,0.7745,1.816243
4,0.5393,1.950435
5,0.412,2.047741


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1435, training_loss=0.7532982189896215, metrics={'train_runtime': 545.7449, 'train_samples_per_second': 5.259, 'train_steps_per_second': 2.629, 'total_flos': 666342748323840.0, 'train_loss': 0.7532982189896215, 'epoch': 5.0})

# Save the Fine-Tuned Model and Tokenizer

In [None]:
# Write the fine-tuned weights and the tokenizer files into one local folder
local_dir = "./gpt2-medium-therapeutic"
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)

('./gpt2-medium-therapeutic/tokenizer_config.json',
 './gpt2-medium-therapeutic/special_tokens_map.json',
 './gpt2-medium-therapeutic/vocab.json',
 './gpt2-medium-therapeutic/merges.txt',
 './gpt2-medium-therapeutic/added_tokens.json',
 './gpt2-medium-therapeutic/tokenizer.json')

In [None]:
# Mount Google Drive (Colab only) and write a second copy of the model and
# tokenizer there
from google.colab import drive

drive.mount('/content/drive')
drive_dir = '/content/drive/MyDrive/gpt2-medium-therapeutic'
model.save_pretrained(drive_dir)
tokenizer.save_pretrained(drive_dir)

Mounted at /content/drive


('/content/drive/MyDrive/gpt2-medium-therapeutic/tokenizer_config.json',
 '/content/drive/MyDrive/gpt2-medium-therapeutic/special_tokens_map.json',
 '/content/drive/MyDrive/gpt2-medium-therapeutic/vocab.json',
 '/content/drive/MyDrive/gpt2-medium-therapeutic/merges.txt',
 '/content/drive/MyDrive/gpt2-medium-therapeutic/added_tokens.json',
 '/content/drive/MyDrive/gpt2-medium-therapeutic/tokenizer.json')

In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in evaluation mode so dropout is off while generating
model.eval()

# Function to generate responses
def generate_response(input_text, max_new_tokens=120, do_sample=True):
    """Generate the first sentence of a therapist-style reply to a client message.

    Args:
        input_text: The client's message.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: When True, sample with temperature 1.0 and top-p 0.9; when
            False, decode without sampling (the sampling settings are omitted).

    Returns:
        The generated text up to and including its first ".", or a fixed
        supportive fallback sentence when nothing usable was generated.
    """
    prompt = f"Client: {input_text}\nTherapist:"
    # NOTE(review): truncation caps the prompt at 128 tokens; for very long
    # inputs this presumably cuts off the trailing "Therapist:" cue — confirm.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # temperature/top_p only take effect when sampling is switched on, so they
    # are passed together with do_sample=True and left out otherwise.
    if do_sample:
        sampling_kwargs = {"do_sample": True, "temperature": 1.0, "top_p": 0.9}
    else:
        sampling_kwargs = {"do_sample": False}

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        **sampling_kwargs,
    )

    # Decode only the tokens produced after the prompt. Splitting the full text
    # on "Therapist:" would pick the wrong segment whenever the client's own
    # message contains that string.
    prompt_length = len(inputs["input_ids"][0])
    reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Drop any follow-up dialogue turn the model may have continued with
    for marker in ("Client:", "Therapist:"):
        reply = reply.split(marker)[0]

    # Keep the text up to the first period
    first_sentence = reply.split(".")[0].strip()
    if not first_sentence:
        return "I'm here to listen and support you. You are not alone."
    return first_sentence + "."

# Test the Fine-Tuned Model

In [None]:
# Sample client messages used to spot-check the fine-tuned model
test_inputs = [
    "I'm feeling so overwhelmed at work. What should I do?",
    "I can't stop thinking about my breakup. It's consuming me.",
    "I feel like I’m not good enough for my family.",
    "I’ve lost someone I deeply care about, and I can’t move on."
]

# Print every message followed by the model's reply and a blank line
for input_text in test_inputs:
    response = generate_response(input_text)
    print(f"Input: {input_text}", f"Response: {response}\n", sep="\n")

Input: I'm feeling so overwhelmed at work. What should I do?
Response: There are a few things you can try to cope with this stress, even if it's not your own doing! One thing that may help is taking some time for yourself each day and focusing on something else entirely instead of dealing in numbers or deadlines all the way through school year again next semester!! Another option would be spending more quality sleep than normal because many people find they get tired during their lunch hour when there isn't much going around them either!!! Finally another possibility might involve finding ways where we don' t have too big an impact by simply being ourselves and having fun rather then worrying about how others.

Input: I can't stop thinking about my breakup. It's consuming me.
Response: There are a few things that you could do to try and get yourself out of this situation as quickly, painlessly or comfortably possible without causing too much stress for your partner in the long run.

In

In [None]:
# One additional prompt, run through the same generate-and-print steps
test_inputs = [
    "I've been feeling disconnected from my friends lately. What should I do?"
]

for input_text in test_inputs:
    response = generate_response(input_text)
    print(f"Input: {input_text}", f"Response: {response}\n", sep="\n")

Input: I've been feeling disconnected from my friends lately. What should I do?
Response: There are a few things you can try to reconnect with your social circle, even if it's just for one night or two weeks at most! One thing that may help is simply spending time alone in nature and doing something relaxing like yoga before going out into the world again so as not get overwhelmed by all of those people around us anymore!! Another way would be practicing some new activities on weekends where there isn't much activity for an extended period without getting too stressed about them causing problems.

