In [None]:
import subprocess
import sys

print("Installing packages...\n")

packages = [
    'torch',
    'transformers',
    'pandas',
    'numpy',
    'scikit-learn',
    'tqdm'
]

for package in packages:
    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', package])

print("\nImporting libraries...\n")

import pandas as pd
import numpy as np
import torch
import re
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import GPT2Tokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from google.colab import files
import time

print("Checking GPU...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

print("\nAll libraries loaded successfully")

Installing packages...

Installing torch...
Installing transformers...
Installing pandas...
Installing numpy...
Installing scikit-learn...
Installing tqdm...

Importing libraries...

Checking GPU...
Device: cuda
GPU: Tesla T4
GPU Memory: 15.8 GB

All libraries loaded successfully


In [None]:
print("=" * 80)
print("UPLOADING DATASETS")
print("=" * 80)
print()

print("Step 1: Upload AMOD file\n")

uploaded_amod = files.upload()

print("\nStep 2: Upload Synthetic file\n")

uploaded_synthetic = files.upload()

print("\n" + "=" * 80)
print("LOADING DATASETS")
print("=" * 80)
print()

import os
all_files = os.listdir('.')
csv_files = [f for f in all_files if f.endswith('.csv')]

print(f"Found CSV files: {csv_files}\n")

if len(csv_files) < 2:
    print("ERROR: Need 2 CSV files")
    exit()

print(f"Loading {csv_files[0]}...")
amod_df = pd.read_csv(csv_files[0], encoding='utf-8')
print(f"  Shape: {amod_df.shape}")
print(f"  Columns: {list(amod_df.columns)}\n")

print(f"Loading {csv_files[1]}...")
synthetic_df = pd.read_csv(csv_files[1], encoding='utf-8')
print(f"  Shape: {synthetic_df.shape}")
print(f"  Columns: {list(synthetic_df.columns)}\n")

print("=" * 80)
print("DATASETS LOADED SUCCESSFULLY")
print("=" * 80)
print()

print(f"AMOD: {amod_df.shape[0]} conversations")
print(f"Synthetic: {synthetic_df.shape[0]} conversations")

UPLOADING DATASETS

Step 1: Upload AMOD file



Saving amod_therapy_conversations.csv to amod_therapy_conversations.csv

Step 2: Upload Synthetic file



Saving synthetic_therapy_conversations.csv to synthetic_therapy_conversations.csv

LOADING DATASETS

Found CSV files: ['synthetic_therapy_conversations.csv', 'amod_therapy_conversations.csv']

Loading synthetic_therapy_conversations.csv...
  Shape: (3512, 2)
  Columns: ['Context', 'Response']

Loading amod_therapy_conversations.csv...
  Shape: (3512, 2)
  Columns: ['Context', 'Response']

DATASETS LOADED SUCCESSFULLY

AMOD: 3512 conversations
Synthetic: 3512 conversations


In [None]:
print("=" * 80)
print("COMBINING DATASETS")
print("=" * 80)
print()

combined_df = pd.concat([amod_df, synthetic_df], ignore_index=True)
print(f"Combined: {combined_df.shape[0]} total pairs\n")

df_clean = combined_df.copy()
original_size = len(df_clean)

print("=" * 80)
print("PREPROCESSING PIPELINE")
print("=" * 80)
print()

print("Step 1: Remove nulls...")
before = len(df_clean)
df_clean = df_clean.dropna()
df_clean = df_clean[(df_clean['Context'].str.len() > 0) & (df_clean['Response'].str.len() > 0)]
after = len(df_clean)
print(f"  Removed: {before - after}\n")

print("Step 2: Remove duplicates...")
before = len(df_clean)
df_clean = df_clean.drop_duplicates(subset=['Context'], keep='first')
after = len(df_clean)
print(f"  Removed: {before - after}\n")

print("Step 3: Clean text...")
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\w\s.!?,;:\'\"-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df_clean['Context'] = df_clean['Context'].apply(clean_text)
df_clean['Response'] = df_clean['Response'].apply(clean_text)
print("  Cleaned\n")

print("Step 4: Remove URLs...")
def remove_urls(text):
    text = re.sub(r'http\S+|www\S+', '', text)
    return text.strip()

df_clean['Context'] = df_clean['Context'].apply(remove_urls)
df_clean['Response'] = df_clean['Response'].apply(remove_urls)
print("  Removed\n")

print("Step 5: Filter by length...")
before = len(df_clean)
df_clean = df_clean[
    (df_clean['Context'].str.len() >= 20) &
    (df_clean['Response'].str.len() >= 50) &
    (df_clean['Response'].str.len() <= 3000)
]
after = len(df_clean)
print(f"  Removed: {before - after}\n")

print("Step 6: Remove outliers (95th percentile)...")
context_95 = df_clean['Context'].str.len().quantile(0.95)
response_95 = df_clean['Response'].str.len().quantile(0.95)

before = len(df_clean)
df_clean = df_clean[
    (df_clean['Context'].str.len() <= context_95) &
    (df_clean['Response'].str.len() <= response_95)
]
after = len(df_clean)
print(f"  Removed: {before - after}\n")

print("Step 7: Therapeutic content validation...")
therapeutic_keywords = [
    'feel', 'understand', 'help', 'support', 'try', 'think', 'emotion',
    'important', 'suggest', 'listen', 'care', 'concern', 'work', 'therapy',
    'manage', 'cope', 'well', 'professional', 'counselor', 'appreciate',
    'experience', 'talk', 'share', 'friend', 'family', 'relationship'
]

def has_therapeutic(text):
    text_lower = text.lower()
    count = sum(1 for kw in therapeutic_keywords if kw in text_lower)
    return count >= 1

before = len(df_clean)
df_clean = df_clean[df_clean['Response'].apply(has_therapeutic)]
after = len(df_clean)
print(f"  Removed: {before - after}\n")

df_clean = df_clean.reset_index(drop=True)

print("=" * 80)
print("PREPROCESSING SUMMARY")
print("=" * 80)
print()

print(f"Original: {original_size}")
print(f"Final: {len(df_clean)}")
print(f"Removed: {original_size - len(df_clean)} ({((original_size - len(df_clean)) / original_size * 100):.1f}%)\n")

print(f"Context statistics:")
print(f"  Average: {df_clean['Context'].str.len().mean():.0f} chars")
print(f"  Min: {df_clean['Context'].str.len().min()}")
print(f"  Max: {df_clean['Context'].str.len().max()}\n")

print(f"Response statistics:")
print(f"  Average: {df_clean['Response'].str.len().mean():.0f} chars")
print(f"  Min: {df_clean['Response'].str.len().min()}")
print(f"  Max: {df_clean['Response'].str.len().max()}\n")

print("=" * 80)
print("PREPROCESSING COMPLETE")
print("=" * 80)

COMBINING DATASETS

Combined: 7024 total pairs

PREPROCESSING PIPELINE

Step 1: Remove nulls...
  Removed: 8

Step 2: Remove duplicates...
  Removed: 6021

Step 3: Clean text...
  Cleaned

Step 4: Remove URLs...
  Removed

Step 5: Filter by length...
  Removed: 16

Step 6: Remove outliers (95th percentile)...
  Removed: 92

Step 7: Therapeutic content validation...
  Removed: 23

PREPROCESSING SUMMARY

Original: 7024
Final: 864
Removed: 6160 (87.7%)

Context statistics:
  Average: 296 chars
  Min: 29
  Max: 972

Response statistics:
  Average: 914 chars
  Min: 116
  Max: 2201

PREPROCESSING COMPLETE


In [None]:
print("=" * 80)
print("TRAIN-TEST-VALIDATION SPLIT")
print("=" * 80)
print()

print(f"Total pairs: {len(df_clean)}\n")

df_train_val, df_test = train_test_split(
    df_clean,
    test_size=0.15,
    random_state=42
)

df_train, df_val = train_test_split(
    df_train_val,
    test_size=0.176,
    random_state=42
)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

total = len(df_train) + len(df_val) + len(df_test)
train_pct = len(df_train) / total * 100
val_pct = len(df_val) / total * 100
test_pct = len(df_test) / total * 100

print(f"Train: {len(df_train)} ({train_pct:.1f}%)")
print(f"Val: {len(df_val)} ({val_pct:.1f}%)")
print(f"Test: {len(df_test)} ({test_pct:.1f}%)\n")

os.makedirs('/content/training_data', exist_ok=True)

def create_txt_file(df, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        for idx, row in df.iterrows():
            user = row['Context'].strip()
            response = row['Response'].strip()
            conversation = f"User: {user}\nTherapist: {response}\n<|endoftext|>\n"
            f.write(conversation)

create_txt_file(df_train, '/content/training_data/train.txt')
create_txt_file(df_val, '/content/training_data/val.txt')
create_txt_file(df_test, '/content/training_data/test.txt')

train_size_mb = os.path.getsize('/content/training_data/train.txt') / (1024*1024)
val_size_mb = os.path.getsize('/content/training_data/val.txt') / (1024*1024)
test_size_mb = os.path.getsize('/content/training_data/test.txt') / (1024*1024)

print(f"Train file: {train_size_mb:.2f} MB")
print(f"Val file: {val_size_mb:.2f} MB")
print(f"Test file: {test_size_mb:.2f} MB\n")

print("=" * 80)
print("SPLIT COMPLETE")
print("=" * 80)

TRAIN-TEST-VALIDATION SPLIT

Total pairs: 864

Train: 604 (69.9%)
Val: 130 (15.0%)
Test: 130 (15.0%)

Train file: 0.72 MB
Val file: 0.15 MB
Test file: 0.15 MB

SPLIT COMPLETE


In [None]:
print("=" * 80)
print("LOADING DIALOGPT MODEL")
print("=" * 80)
print()

print("Step 1: Checking GPU\n")

print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\n")

print("Step 2: Loading tokenizer\n")

model_name = 'microsoft/DialoGPT-small'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded")
print(f"  Vocab size: {len(tokenizer)}")
print(f"  EOS token: {tokenizer.eos_token}\n")

print("Step 3: Loading model\n")

model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)
model.gradient_checkpointing_enable()

print(f"Model loaded")
print(f"  Type: {type(model).__name__}")
print(f"  Parameters: {model.num_parameters():,}")
print(f"  Device: {next(model.parameters()).device}\n")

print("=" * 80)
print("MODEL READY")
print("=" * 80)

LOADING DIALOGPT MODEL

Step 1: Checking GPU

Device: cuda
GPU: Tesla T4
Memory: 15.8 GB

Step 2: Loading tokenizer



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Tokenizer loaded
  Vocab size: 50257
  EOS token: <|endoftext|>

Step 3: Loading model



config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded
  Type: GPT2LMHeadModel
  Parameters: 124,439,808
  Device: cuda:0

MODEL READY


In [None]:
print("=" * 80)
print("CONFIGURING FINE-TUNING")
print("=" * 80)
print()

print("Step 1: Loading datasets\n")

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/training_data/train.txt',
    block_size=256
)

val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/training_data/val.txt',
    block_size=256
)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/training_data/test.txt',
    block_size=256
)

print(f"Train blocks: {len(train_dataset)}")
print(f"Val blocks: {len(val_dataset)}")
print(f"Test blocks: {len(test_dataset)}\n")

print("Step 2: Creating data collator\n")

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

print("Data collator created\n")

print("Step 3: Setting training arguments\n")

training_args = TrainingArguments(
    output_dir='/content/dialogpt_finetuned_model',
    num_train_epochs=8,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    save_steps=40,
    eval_steps=40,
    logging_steps=10,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=True,
    save_strategy='epoch',
    eval_strategy='steps',
    report_to='none',
    seed=42,
)

print(f"Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Warmup steps: {training_args.warmup_steps}")
print(f"  Weight decay: {training_args.weight_decay}\n")

print("Step 4: Initializing Trainer\n")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Trainer initialized\n")

print("=" * 80)
print("FINE-TUNING READY")
print("=" * 80)
print()

print(f"Ready to fine-tune DialoGPT on {len(df_train)} therapy conversations")
print(f"Estimated training time: 15-20 minutes")

CONFIGURING FINE-TUNING

Step 1: Loading datasets





Train blocks: 630
Val blocks: 133
Test blocks: 136

Step 2: Creating data collator

Data collator created

Step 3: Setting training arguments

Training configuration:
  Epochs: 8
  Batch size: 4
  Learning rate: 5e-05
  Warmup steps: 100
  Weight decay: 0.01

Step 4: Initializing Trainer

Trainer initialized

FINE-TUNING READY

Ready to fine-tune DialoGPT on 604 therapy conversations
Estimated training time: 15-20 minutes


In [None]:
print("=" * 80)
print("STARTING FINE-TUNING")
print("=" * 80)
print()

start_time = time.time()

print("Training initiated...\n")

train_result = trainer.train()

training_time = (time.time() - start_time) / 60

print(f"\nTraining complete")
print(f"Time: {training_time:.1f} minutes")
print(f"Final training loss: {train_result.training_loss:.4f}\n")

print("=" * 80)
print("EVALUATING VALIDATION SET")
print("=" * 80)
print()

eval_results = trainer.evaluate()

print(f"Validation loss: {eval_results['eval_loss']:.4f}\n")

print("=" * 80)
print("EVALUATING TEST SET")
print("=" * 80)
print()

test_results = trainer.evaluate(eval_dataset=test_dataset)

print(f"Test loss: {test_results['eval_loss']:.4f}\n")

print("=" * 80)
print("SAVING MODEL")
print("=" * 80)
print()

model.save_pretrained('/content/dialogpt_finetuned_model')
tokenizer.save_pretrained('/content/dialogpt_finetuned_model')

print("Model saved to /content/dialogpt_finetuned_model\n")

print("=" * 80)
print("FINE-TUNING SUMMARY")
print("=" * 80)
print()

print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Validation loss: {eval_results['eval_loss']:.4f}")
print(f"Test loss: {test_results['eval_loss']:.4f}")
print(f"Training time: {training_time:.1f} minutes")
print(f"Epochs completed: {training_args.num_train_epochs}\n")

print("=" * 80)
print("FINE-TUNING SUCCESSFUL")
print("=" * 80)

STARTING FINE-TUNING

Training initiated...



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
40,6.4415,5.380874
80,4.4145,3.977599
120,3.8216,3.495173
160,3.586,3.364051
200,3.4949,3.311197
240,3.3038,3.275526
280,3.2422,3.256155
320,3.1549,3.237315
360,3.138,3.226718
400,3.1057,3.215417



Training complete
Time: 7.0 minutes
Final training loss: 3.6337

EVALUATING VALIDATION SET



Validation loss: 3.2060

EVALUATING TEST SET

Test loss: 3.2157

SAVING MODEL

Model saved to /content/dialogpt_finetuned_model

FINE-TUNING SUMMARY

Training loss: 3.6337
Validation loss: 3.2060
Test loss: 3.2157
Training time: 7.0 minutes
Epochs completed: 8

FINE-TUNING SUCCESSFUL


In [None]:
print("=" * 80)
print("TESTING FINE-TUNED MODEL")
print("=" * 80)
print()

model.eval()

test_cases = [
    "I'm feeling really sad because I failed my exam",
    "I have severe anxiety about my job interview",
    "I'm angry with my family",
    "I can't sleep at night",
    "I'm struggling with depression",
    "I'm having trouble with relationships",
    "I feel overwhelmed and can't cope",
    "I'm anxious about my future",
]

print("Sample predictions:\n")

for i, user_input in enumerate(test_cases, 1):
    print(f"Test {i}:")
    print(f"User: {user_input}")

    prompt = f"User: {user_input}\nTherapist:"
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=120,
            num_beams=1,
            temperature=0.7,
            do_sample=True,
            top_p=0.85,
            repetition_penalty=2.0,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(output[0], skip_special_tokens=True)

    if response.startswith(prompt):
        response = response[len(prompt):].strip()

    if "User:" in response:
        response = response.split("User:")[0].strip()

    print(f"Therapist: {response}\n")

print("=" * 80)
print("TESTING COMPLETE")
print("=" * 80)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TESTING FINE-TUNED MODEL

Sample predictions:

Test 1:
User: I'm feeling really sad because I failed my exam
Therapist: Well said! What is the answer?Is it possible that you're just reflecting on what happened to your mother in her late 30s and not realizing she was cheating at some point.If so, then there's a good chance this will become more clear as time goes by...that if one of these things did happen again or didn't, maybe all three times would be considered evidence for why they were cheated upon together; but we can only rule out two possibilities here : either someone who cheats with their family members

Test 2:
User: I have severe anxiety about my job interview
Therapist: Sounds like you are feeling overwhelmed. What is the reason for your lack of confidence in being able to make a decision? Is it because there's no guarantee that this will happen again, or if he won't respond as quickly and with more trust than what would be needed from others? Are these things actually nece

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

print("=" * 80)
print("LOADING T5 MODEL")
print("=" * 80)
print()

print("Step 1: Checking GPU\n")

print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\n")

print("Step 2: Loading tokenizer\n")

model_name = 't5-base'
tokenizer_t5 = AutoTokenizer.from_pretrained(model_name)

print(f"Tokenizer loaded")
print(f"  Vocab size: {len(tokenizer_t5)}\n")

print("Step 3: Loading model\n")

model_t5 = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model_t5.to(device)

print(f"Model loaded")
print(f"  Type: {type(model_t5).__name__}")
print(f"  Parameters: {model_t5.num_parameters():,}")
print(f"  Device: {next(model_t5.parameters()).device}\n")

print("=" * 80)
print("T5 MODEL READY")
print("=" * 80)

LOADING T5 MODEL

Step 1: Checking GPU

Device: cuda
GPU: Tesla T4
Memory: 15.8 GB

Step 2: Loading tokenizer



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Tokenizer loaded
  Vocab size: 32100

Step 3: Loading model



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded
  Type: T5ForConditionalGeneration
  Parameters: 222,903,552
  Device: cuda:0

T5 MODEL READY


In [None]:
print("=" * 80)
print("PREPARING T5 DATASETS")
print("=" * 80)
print()

print("Step 1: Creating directory\n")

os.makedirs('/content/t5_training_data', exist_ok=True)

print("Step 2: Creating T5 formatted files\n")

def create_t5_files(df, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        for idx, row in df.iterrows():
            context = row['Context'].strip()
            response = row['Response'].strip()
            t5_format = f"therapy: {context}\t{response}\n"
            f.write(t5_format)

create_t5_files(df_train, '/content/t5_training_data/train.tsv')
create_t5_files(df_val, '/content/t5_training_data/val.tsv')
create_t5_files(df_test, '/content/t5_training_data/test.tsv')

print(f"Train file created")
print(f"Val file created")
print(f"Test file created\n")

print("Step 3: Loading T5 datasets\n")

from datasets import load_dataset

train_dataset_t5 = load_dataset('csv', data_files='/content/t5_training_data/train.tsv', delimiter='\t', names=['input_text', 'target_text'])['train']
val_dataset_t5 = load_dataset('csv', data_files='/content/t5_training_data/val.tsv', delimiter='\t', names=['input_text', 'target_text'])['train']
test_dataset_t5 = load_dataset('csv', data_files='/content/t5_training_data/test.tsv', delimiter='\t', names=['input_text', 'target_text'])['train']

print(f"Train samples: {len(train_dataset_t5)}")
print(f"Val samples: {len(val_dataset_t5)}")
print(f"Test samples: {len(test_dataset_t5)}\n")

print("Step 4: Preprocessing datasets\n")

def preprocess_function(examples):
    model_inputs = tokenizer_t5(
        examples['input_text'],
        max_length=256,
        truncation=True,
        padding='max_length'
    )

    labels = tokenizer_t5(
        examples['target_text'],
        max_length=256,
        truncation=True,
        padding='max_length'
    )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset_t5 = train_dataset_t5.map(preprocess_function, batched=True)
val_dataset_t5 = val_dataset_t5.map(preprocess_function, batched=True)
test_dataset_t5 = test_dataset_t5.map(preprocess_function, batched=True)

print("Preprocessing complete\n")

print("=" * 80)
print("T5 DATASETS READY")
print("=" * 80)

PREPARING T5 DATASETS

Step 1: Creating directory

Step 2: Creating T5 formatted files

Train file created
Val file created
Test file created

Step 3: Loading T5 datasets



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train samples: 604
Val samples: 130
Test samples: 130

Step 4: Preprocessing datasets



Map:   0%|          | 0/604 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Preprocessing complete

T5 DATASETS READY


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

print("=" * 80)
print("CONFIGURING T5 FINE-TUNING")
print("=" * 80)
print()

print("Step 1: Creating data collator\n")

data_collator_t5 = DataCollatorForSeq2Seq(
    tokenizer_t5,
    model=model_t5,
    padding=True,
    return_tensors='pt'
)

print("Data collator created\n")

print("Step 2: Setting training arguments\n")

training_args_t5 = Seq2SeqTrainingArguments(
    output_dir='/content/t5_finetuned_model',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    save_steps=50,
    eval_steps=50,
    logging_steps=10,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=True,
    save_strategy='epoch',
    eval_strategy='steps',
    report_to='none',
    seed=42,
)

print(f"Training configuration:")
print(f"  Epochs: {training_args_t5.num_train_epochs}")
print(f"  Batch size: {training_args_t5.per_device_train_batch_size}")
print(f"  Learning rate: {training_args_t5.learning_rate}")
print(f"  Warmup steps: {training_args_t5.warmup_steps}\n")

print("Step 3: Initializing Seq2SeqTrainer\n")

trainer_t5 = Seq2SeqTrainer(
    model=model_t5,
    args=training_args_t5,
    train_dataset=train_dataset_t5,
    eval_dataset=val_dataset_t5,
    data_collator=data_collator_t5,
    tokenizer=tokenizer_t5,
)

print("Trainer initialized\n")

print("=" * 80)
print("T5 FINE-TUNING READY")
print("=" * 80)
print()

print("Ready to fine-tune T5 on 604 therapy conversations")
print("Estimated training time: 12-15 minutes")

CONFIGURING T5 FINE-TUNING

Step 1: Creating data collator

Data collator created

Step 2: Setting training arguments

Training configuration:
  Epochs: 10
  Batch size: 4
  Learning rate: 5e-05
  Warmup steps: 100

Step 3: Initializing Seq2SeqTrainer

Trainer initialized

T5 FINE-TUNING READY

Ready to fine-tune T5 on 604 therapy conversations
Estimated training time: 12-15 minutes


  trainer_t5 = Seq2SeqTrainer(


In [None]:
print("=" * 80)
print("STARTING T5 FINE-TUNING")
print("=" * 80)
print()

start_time = time.time()

print("Training initiated...\n")

train_result_t5 = trainer_t5.train()

training_time = (time.time() - start_time) / 60

print(f"\nTraining complete")
print(f"Time: {training_time:.1f} minutes")
print(f"Final training loss: {train_result_t5.training_loss:.4f}\n")

print("=" * 80)
print("EVALUATING VALIDATION SET")
print("=" * 80)
print()

eval_results_t5 = trainer_t5.evaluate()

print(f"Validation loss: {eval_results_t5['eval_loss']:.4f}\n")

print("=" * 80)
print("EVALUATING TEST SET")
print("=" * 80)
print()

test_results_t5 = trainer_t5.evaluate(eval_dataset=test_dataset_t5)

print(f"Test loss: {test_results_t5['eval_loss']:.4f}\n")

print("=" * 80)
print("SAVING MODEL")
print("=" * 80)
print()

model_t5.save_pretrained('/content/t5_finetuned_model')
tokenizer_t5.save_pretrained('/content/t5_finetuned_model')

print("Model saved to /content/t5_finetuned_model\n")

print("=" * 80)
print("T5 FINE-TUNING SUMMARY")
print("=" * 80)
print()

print(f"Training loss: {train_result_t5.training_loss:.4f}")
print(f"Validation loss: {eval_results_t5['eval_loss']:.4f}")
print(f"Test loss: {test_results_t5['eval_loss']:.4f}")
print(f"Training time: {training_time:.1f} minutes")
print(f"Epochs completed: {training_args_t5.num_train_epochs}\n")

print("=" * 80)
print("T5 FINE-TUNING SUCCESSFUL")
print("=" * 80)

STARTING T5 FINE-TUNING

Training initiated...



Step,Training Loss,Validation Loss
50,6.7491,4.809001
100,2.6847,2.545749
150,2.3716,2.401835
200,2.3246,2.354744
250,2.2034,2.327321
300,2.4273,2.311219
350,2.456,2.298727
400,2.1969,2.28845
450,2.2527,2.280182
500,2.297,2.272887



Training complete
Time: 23.2 minutes
Final training loss: 2.8825

EVALUATING VALIDATION SET



Validation loss: 2.2593

EVALUATING TEST SET

Test loss: 2.2188

SAVING MODEL

Model saved to /content/t5_finetuned_model

T5 FINE-TUNING SUMMARY

Training loss: 2.8825
Validation loss: 2.2593
Test loss: 2.2188
Training time: 23.2 minutes
Epochs completed: 10

T5 FINE-TUNING SUCCESSFUL


In [None]:
print("=" * 80)
print("TESTING T5 FINE-TUNED MODEL")
print("=" * 80)
print()

model_t5.eval()

test_cases = [
    "I'm feeling really sad because I failed my exam",
    "I have severe anxiety about my job interview",
    "I'm angry with my family",
    "I can't sleep at night",
    "I'm struggling with depression",
    "I'm having trouble with relationships",
    "I feel overwhelmed and can't cope",
    "I'm anxious about my future",
]

print("Sample predictions:\n")

for i, user_input in enumerate(test_cases, 1):
    print(f"Test {i}:")
    print(f"User: {user_input}")

    input_text = f"therapy: {user_input}"
    input_ids = tokenizer_t5.encode(input_text, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model_t5.generate(
            input_ids,
            max_length=150,
            num_beams=4,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=2.0,
        )

    response = tokenizer_t5.decode(output[0], skip_special_tokens=True)

    print(f"Therapist: {response}\n")

print("=" * 80)
print("TESTING COMPLETE")
print("=" * 80)

TESTING T5 FINE-TUNED MODEL

Sample predictions:

Test 1:
User: I'm feeling really sad because I failed my exam
Therapist: I'm sorry to hear that you failed your exam.It sounds like you are feeling really sad because you didn't pass your exam.Also, if you haven't passed your exam yet, then it may be time for you to do something about it.If you haven't already, then maybe you need to start thinking about what you want to do next.

Test 2:
User: I have severe anxiety about my job interview
Therapist: I'm sorry to hear that you are experiencing anxiety about your job interview.It sounds like you are having a hard time making it through the interview process.Also, I would encourage you to talk with a therapist who can help you understand what is going on in your life.

Test 3:
User: I'm angry with my family
Therapist: I'm sorry to hear that you are angry with your family.It sounds like you have a lot going on in your life right now.Also, it sounds like you are having a hard time understand

In [None]:
from google.colab import drive

print("=" * 80)
print("MOUNTING GOOGLE DRIVE")
print("=" * 80)
print()

print("Mounting Google Drive...\n")

drive.mount('/content/gdrive')

print("Drive mounted\n")

print("=" * 80)
print("SAVING MODEL TO GOOGLE DRIVE")
print("=" * 80)
print()

import shutil

drive_model_path = '/content/gdrive/My Drive/t5_finetuned_model'

print(f"Copying model to Google Drive...\n")

if os.path.exists(drive_model_path):
    shutil.rmtree(drive_model_path)

shutil.copytree('/content/t5_finetuned_model', drive_model_path)

print(f"Model saved to: {drive_model_path}\n")

print("=" * 80)
print("UPLOAD COMPLETE")
print("=" * 80)
print()

print("Model files:")
model_files = os.listdir(drive_model_path)
for f in model_files:
    print(f"  - {f}")

print("\nDownload from Google Drive:")
print("  https://drive.google.com")
print("  Folder: t5_finetuned_model")

MOUNTING GOOGLE DRIVE

Mounting Google Drive...

Mounted at /content/gdrive
Drive mounted

SAVING MODEL TO GOOGLE DRIVE

Copying model to Google Drive...

Model saved to: /content/gdrive/My Drive/t5_finetuned_model

UPLOAD COMPLETE

Model files:
  - tokenizer_config.json
  - checkpoint-608
  - config.json
  - checkpoint-228
  - checkpoint-684
  - checkpoint-152
  - model.safetensors
  - checkpoint-380
  - tokenizer.json
  - checkpoint-304
  - checkpoint-532
  - spiece.model
  - special_tokens_map.json
  - checkpoint-456
  - generation_config.json
  - checkpoint-760
  - checkpoint-76

Download from Google Drive:
  https://drive.google.com
  Folder: t5_finetuned_model


In [None]:
from google.colab import drive

print("=" * 80)
print("MOUNTING GOOGLE DRIVE")
print("=" * 80)
print()

drive.mount('/content/gdrive')

print("Drive mounted\n")

print("=" * 80)
print("SAVING MODEL TO GOOGLE DRIVE")
print("=" * 80)
print()

import shutil

drive_model_path = '/content/gdrive/My Drive/mental_health_chatbot_models/t5_finetuned_model'

os.makedirs('/content/gdrive/My Drive/mental_health_chatbot_models', exist_ok=True)

print(f"Copying model to Google Drive...\n")

if os.path.exists(drive_model_path):
    shutil.rmtree(drive_model_path)

shutil.copytree('/content/t5_finetuned_model', drive_model_path)

print(f"Model saved successfully\n")

print("Location: mental_health_chatbot_models/t5_finetuned_model\n")

print("Files in model directory:")
for f in os.listdir(drive_model_path):
    file_size = os.path.getsize(os.path.join(drive_model_path, f)) / (1024*1024)
    print(f"  - {f} ({file_size:.1f} MB)")

MOUNTING GOOGLE DRIVE

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Drive mounted

SAVING MODEL TO GOOGLE DRIVE

Copying model to Google Drive...



KeyboardInterrupt: 

In [None]:
from google.colab import drive

print("=" * 80)
print("CLEANING MODEL (REMOVING CHECKPOINTS)")
print("=" * 80)
print()

print("Step 1: Checking model directory\n")

model_dir = '/content/t5_finetuned_model'
items = os.listdir(model_dir)

print(f"Contents before cleanup:")
for item in items:
    item_path = os.path.join(model_dir, item)
    if os.path.isdir(item_path):
        print(f"  - {item}/ (CHECKPOINT)")
    else:
        size = os.path.getsize(item_path) / (1024*1024)
        print(f"  - {item} ({size:.1f} MB)")

print("\nStep 2: Removing checkpoint directories\n")

for item in items:
    item_path = os.path.join(model_dir, item)
    if os.path.isdir(item_path) and item.startswith('checkpoint'):
        import shutil
        shutil.rmtree(item_path)
        print(f"  Removed: {item}")

print("\nStep 3: Final model size\n")

final_items = os.listdir(model_dir)
total_size = sum(os.path.getsize(os.path.join(model_dir, f)) for f in final_items if os.path.isfile(os.path.join(model_dir, f))) / (1024*1024)

print(f"Model files remaining:")
for item in final_items:
    item_path = os.path.join(model_dir, item)
    if os.path.isfile(item_path):
        size = os.path.getsize(item_path) / (1024*1024)
        print(f"  - {item} ({size:.1f} MB)")

print(f"\nTotal size: {total_size:.1f} MB\n")

print("Step 4: Uploading to Google Drive\n")

drive.mount('/content/gdrive')

drive_model_path = '/content/gdrive/My Drive/mental_health_chatbot_models/t5_finetuned_model'

os.makedirs('/content/gdrive/My Drive/mental_health_chatbot_models', exist_ok=True)

if os.path.exists(drive_model_path):
    shutil.rmtree(drive_model_path)

shutil.copytree(model_dir, drive_model_path)

print(f"Model uploaded to Google Drive\n")

print("=" * 80)
print("COMPLETE")
print("=" * 80)
print()

print(f"Cleaned model size: {total_size:.1f} MB")
print(f"Location: mental_health_chatbot_models/t5_finetuned_model")

CLEANING MODEL (REMOVING CHECKPOINTS)

Step 1: Checking model directory

Contents before cleanup:
  - tokenizer_config.json (0.0 MB)
  - checkpoint-608/ (CHECKPOINT)
  - config.json (0.0 MB)
  - checkpoint-228/ (CHECKPOINT)
  - checkpoint-684/ (CHECKPOINT)
  - checkpoint-152/ (CHECKPOINT)
  - model.safetensors (850.3 MB)
  - checkpoint-380/ (CHECKPOINT)
  - tokenizer.json (2.3 MB)
  - checkpoint-304/ (CHECKPOINT)
  - checkpoint-532/ (CHECKPOINT)
  - spiece.model (0.8 MB)
  - special_tokens_map.json (0.0 MB)
  - checkpoint-456/ (CHECKPOINT)
  - generation_config.json (0.0 MB)
  - checkpoint-760/ (CHECKPOINT)
  - checkpoint-76/ (CHECKPOINT)

Step 2: Removing checkpoint directories

  Removed: checkpoint-608
  Removed: checkpoint-228
  Removed: checkpoint-684
  Removed: checkpoint-152
  Removed: checkpoint-380
  Removed: checkpoint-304
  Removed: checkpoint-532
  Removed: checkpoint-456
  Removed: checkpoint-760
  Removed: checkpoint-76

Step 3: Final model size

Model files remaining:
  