In [None]:
import kagglehub
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Download and Load Data

- Downloads dataset archive using `kagglehub`  
- Locates CSV file within the archive  
- Loads data into pandas DataFrame  
- Prints DataFrame shape and initial rows

In [None]:
# Download the dataset; kagglehub returns the local directory it was stored in.
dataset_path = kagglehub.dataset_download("neelghoshal/reddit-mental-health-data")
print(f"Dataset downloaded to: {dataset_path}")

# Take the first CSV encountered while walking the download directory
# (same traversal order as a nested os.walk loop with early exit).
csv_file = next(
    (
        os.path.join(root, file_name)
        for root, _dirs, file_names in os.walk(dataset_path)
        for file_name in file_names
        if file_name.endswith('.csv')
    ),
    None,
)

# Fail fast: every later cell needs `df`, so continuing without a CSV would
# only surface as a confusing NameError further down the notebook.
if csv_file is None:
    raise FileNotFoundError(f"No CSV file found in the dataset at {dataset_path}")

print(f"Loading CSV file: {csv_file}")
df = pd.read_csv(csv_file)
print("Dataset loaded successfully!")
print("First 5 records:")
print(df.head())
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/neelghoshal/reddit-mental-health-data?dataset_version_number=1...


100%|██████████| 1.83M/1.83M [00:00<00:00, 103MB/s]

Extracting files...
Dataset downloaded to: /root/.cache/kagglehub/datasets/neelghoshal/reddit-mental-health-data/versions/1
Loading CSV file: /root/.cache/kagglehub/datasets/neelghoshal/reddit-mental-health-data/versions/1/data_to_be_cleansed.csv
Dataset loaded successfully!
First 5 records:
   Unnamed: 0                                               text  \
0           0  Welcome to /r/depression's check-in post - a p...   
1           1  We understand that most people who reply immed...   
2           2  Anyone else just miss physical touch? I crave ...   
3           3  I’m just so ashamed. Everyone and everything f...   
4           4  I really need a friend. I don't even have a si...   

                                               title  target  
0  Regular check-in post, with information about ...       1  
1  Our most-broken and least-understood rules is ...       1  
2  I haven’t been touched, or even hugged, in so ...       1  
3                    Being Depressed is Embarr




# Preprocess Text and Encode Labels

**Text Standardization:**
- Removes URLs and artifacts from text data

**Label Processing:**
- Maps numeric classes to string labels (e.g., `0 → 'Stress'`)
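- Uses `LabelEncoder` to convert labels to integer IDs (IDs are assigned alphabetically by label name, so `label_id` differs from the original `target`; e.g. `Depression` is `target` 1 but `label_id` 2)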

**Data Filtering:**
- Removes empty or very short text entries

In [None]:
def clean_text(text):
    """Clean and preprocess a single text value.

    Steps, in order: drop URLs, drop ``[...]`` and ``(...)`` spans, drop every
    character that is not a word character, whitespace or basic punctuation,
    then collapse whitespace runs to a single space and strip the ends.

    Args:
        text: Raw value from the dataset; may be a string, NaN/None or any
            object convertible with ``str``.

    Returns:
        The cleaned string, or ``""`` when the input is missing or empty.
    """
    if pd.isna(text) or text == "":
        return ""
    # Convert to string
    text = str(text)
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove Reddit-specific formatting (bracketed and parenthesised spans)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s\.\!\?\,\;\:\-\(\)]', '', text)
    # Collapse whitespace last: the removals above can leave two spaces side
    # by side (e.g. "a & b" -> "a  b"), so normalising earlier misses them.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Prepare labels: position in this list is the numeric `target` value.
target_to_label = dict(enumerate([
    'Stress',
    'Depression',
    'Bipolar disorder',
    'Personality disorder',
    'Anxiety',
]))

df['label'] = df['target'].map(target_to_label)
# Targets missing from the mapping become NaN after .map(); discard those rows.
df.dropna(subset=['label'], inplace=True)
print(f"Using 'target' column for labels. Number of samples after dropping unmapped targets: {len(df)}")

# Encode the string labels as integer ids for the classifier.
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

print("Label distribution:")
label_counts = df['label'].value_counts()
for label in label_counts.index:
    print(f"  {label}: {label_counts[label]}")

Using 'target' column for labels. Number of samples after dropping unmapped targets: 5957
Label distribution:
  Depression: 1202
  Personality disorder: 1201
  Anxiety: 1188
  Bipolar disorder: 1185
  Stress: 1181


# Split Data into Train, Validation, and Test Sets

**Data Preparation:**
- Selects final columns: `text`, `label`, and `label_id`

**Stratified Splitting:**
- Creates three subsets:
  - Training set
  - Validation set  
  - Test set
- Maintains consistent class distribution across all splits

In [None]:
# Candidate names for the column holding the post body, in priority order.
text_columns = ['text', 'body', 'content', 'post', 'comment', 'title']
text_column = next((col for col in text_columns if col in df.columns), None)
if text_column is None:
    # Without this guard the next step would fail with an opaque KeyError on df[None].
    raise ValueError(
        f"None of the expected text columns {text_columns} found in dataset columns {list(df.columns)}"
    )

# Clean text data
print("Cleaning text data...")
df['cleaned_text'] = df[text_column].apply(clean_text)
# Remove rows whose cleaned text is 10 characters or shorter
initial_count = len(df)
df = df[df['cleaned_text'].str.len() > 10]
print(f"Removed {initial_count - len(df)} rows with empty/short text after cleaning")

# Create fine-tuning dataset
print("Creating fine-tuning dataset...")
# Keep only the model inputs/targets and expose the cleaned text as `text`
fine_tuning_data = df[['cleaned_text', 'label', 'label_id']].copy()
fine_tuning_data = fine_tuning_data.rename(columns={'cleaned_text': 'text'})

# A stratified split needs at least two classes; stop with a clear message
# instead of leaving train_df/val_df/test_df undefined for the prints below.
if fine_tuning_data['label'].nunique() < 2:
    raise ValueError("Need at least two distinct labels to create stratified train/validation/test splits")

# 70% train, then the remaining 30% is halved into validation and test.
train_df, temp_df = train_test_split(fine_tuning_data, test_size=0.3, random_state=42, stratify=fine_tuning_data['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])


print(f"Dataset splits:")
print(f"  Train: {len(train_df)} samples")
print(f"  Validation: {len(val_df)} samples")
print(f"  Test: {len(test_df)} samples")

Cleaning text data...
Removed 429 rows with empty/short text after cleaning
Creating fine-tuning dataset...
Dataset splits:
  Train: 3869 samples
  Validation: 829 samples
  Test: 830 samples


<hr>

# Create Hugging Face DatasetDict

**Conversion:**
- Transforms pandas DataFrames into Hugging Face `Dataset` objects  
  - Training split  
  - Validation split  
  - Test split  

**Consolidation:**
- Combines all splits into a `DatasetDict`  
- Prepares data in the standard format for Hugging Face `Trainer` API

In [None]:
# The split frames keep their original (filtered, shuffled) pandas index.
# preserve_index=False stops from_pandas from materialising that index as an
# extra `__index_level_0__` column in every split.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Create label mapping: integer id -> class name, in LabelEncoder order
label_mapping = {i: label for i, label in enumerate(label_encoder.classes_)}
# Summary of the prepared data; `num_classes` is read when the model is built.
dataset_info = {
    "num_classes": len(label_encoder.classes_),
    "class_names": list(label_encoder.classes_),
    "train_samples": len(train_df),
    "validation_samples": len(val_df),
    "test_samples": len(test_df),
    "total_samples": len(train_df) + len(val_df) + len(test_df)
}
print(f"Dataset info: {dataset_info}")
print(f"Label mapping: {label_mapping}")

Dataset info: {'num_classes': 5, 'class_names': ['Anxiety', 'Bipolar disorder', 'Depression', 'Personality disorder', 'Stress'], 'train_samples': 3869, 'validation_samples': 829, 'test_samples': 830, 'total_samples': 5528}
Label mapping: {0: 'Anxiety', 1: 'Bipolar disorder', 2: 'Depression', 3: 'Personality disorder', 4: 'Stress'}


# Load Pre-trained Model and Tokenizer

**Model Setup:**
- Architecture: `distilbert-base-uncased`
- Customization:
  - Replaces head with `AutoModelForSequenceClassification`
  - Configures output layer for target class count

**Tokenization:**
- Loads matching tokenizer


In [None]:
# Load tokenizer and model
model_name = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"Model: {model_name}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding if the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Store the class names in the model config so saved checkpoints and
# pipelines report real label names instead of generic LABEL_<n> placeholders.
id2label = {int(idx): str(name) for idx, name in label_mapping.items()}
label2id = {name: idx for idx, name in id2label.items()}

# Load model with a fresh classification head sized for our classes.
# `problem_type` is left unset: with num_labels > 1 and integer labels,
# Transformers treats the task as single-label classification on its own.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=dataset_info["num_classes"],
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
# Move model to device
model.to(device)
print(f"Model loaded with {dataset_info['num_classes']} output classes")

Using device: cuda
Model: distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 5 output classes


# Tokenize the Datasets

**Tokenization Function:**
- Converts raw text → numerical `input_ids`
- Handles:
  - Padding to uniform length
  - Truncation (max_length=512)

**Batch Processing:**
- Applies to entire `DatasetDict`
- Output format: PyTorch tensors

**Configuration:**
- Fixed sequence length: 512 tokens
- Automatic padding/truncation

In [None]:
# Tokenize datasets
def tokenize_function(examples):
    """Encode a batch of texts and attach the integer class ids as ``labels``.

    Every sequence is truncated or padded to exactly 512 tokens.
    """
    encoded = tokenizer(examples["text"], max_length=512, truncation=True, padding="max_length")
    # The classifier needs the integer ids (`label_id`), not the string `label`.
    encoded["labels"] = examples["label_id"]
    return encoded

print("Preparing dataset for training...")
# Tokenize every split. Drop all raw input columns (the string `text`/`label`
# columns and `label_id`, which is re-exposed as `labels`) so the result holds
# only model inputs, rather than relying on Trainer to prune leftovers.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)
# Return PyTorch tensors when rows are accessed
tokenized_dataset.set_format("torch")
print("Dataset tokenization completed!")

Preparing dataset for training...


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Dataset tokenization completed!


# Compute Evaluation Metrics

**Functionality:**
- Calculates classification metrics for model evaluation
- Processes raw model predictions and true labels

**Metrics Computed:**
- Accuracy
- Weighted F1-score  
- Weighted Precision  
- Weighted Recall  

**Key Operations:**  
1. Extracts class predictions via argmax  
2. Computes multi-class metrics with weighting  


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores with classes on axis 1 and ``labels`` the true
            class ids.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    # zero_division=0 yields the same scores as the default but without the
    # UndefinedMetricWarning raised when a class is never predicted, which
    # is common in the first epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_ids, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predicted_ids)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Train the Model

**Training Configuration:**
- Output directory: `results/fine_tuned_model`
- Learning rate: 2e-5
- Batch size: 8 (train/eval)
- Epochs: 3
- Weight decay: 0.01
- FP16 mixed-precision training (enabled only when CUDA is available)

**Training Process:**
- Evaluation after each epoch
- Model checkpointing per epoch
- Best model retention
- Gradient accumulation (steps: 4)
- Warmup steps: 500

**Monitoring:**
- Logging every 100 steps
- Disabled external reporting

In [None]:
# Train the model
print("Starting model training...")

# Baseline hyperparameters, named so the step arithmetic below stays in sync.
train_batch_size = 8
accumulation_steps = 4
epochs = 3
requested_warmup_steps = 500

# Estimate the number of optimizer updates in the whole run. With 3869
# training rows this is about 363, fewer than the 500 requested warmup steps,
# so an uncapped warmup would end the run before the learning rate ever
# reached its configured peak. Cap warmup at roughly 10% of the run.
device_count = max(1, torch.cuda.device_count())
batches_per_epoch = -(-len(tokenized_dataset["train"]) // (train_batch_size * device_count))  # ceil division
updates_per_epoch = max(batches_per_epoch // accumulation_steps, 1)
total_updates = updates_per_epoch * epochs
warmup_steps = min(requested_warmup_steps, max(1, total_updates // 10))
print(f"Estimated optimizer steps: {total_updates}; warmup steps used: {warmup_steps}")

training_args = TrainingArguments(
    output_dir="results/fine_tuned_model",
    learning_rate=2e-5,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir="results/fine_tuned_model/logs",
    logging_steps=100,
    save_total_limit=2,
    dataloader_num_workers=2,
    warmup_steps=warmup_steps,
    gradient_accumulation_steps=accumulation_steps,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to=[],  # no external experiment trackers
    run_name=None,
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
print("Training started...")
trainer.train()
print("Training completed!")

Starting model training...
Training started...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.6038,1.563548,0.366707,0.305483,0.526777,0.366707
2,1.4945,1.114819,0.639324,0.643294,0.670544,0.639324
3,1.0984,0.780678,0.733414,0.73251,0.747913,0.733414


Training completed!



- Completed 3 epochs in ~3 minutes
- Steady improvement across all metrics
- 100% increase in accuracy (36.67% → 73.34%)
- Validation loss decreased by 50% (1.5635 → 0.7807)
- Healthy convergence pattern (no overfitting)


<br>
<hr>

# Evaluate Model on Test Set

In [None]:
# Score the trained model on the held-out test split and list every metric.
print("Evaluating model on test set...")
results = trainer.evaluate(tokenized_dataset["test"])
print("Test Results:")
for metric_name in results:
    print(f"  {metric_name}: {results[metric_name]:.4f}")

Evaluating model on test set...


Test Results:
  eval_loss: 0.8708
  eval_accuracy: 0.6867
  eval_f1: 0.6863
  eval_precision: 0.6979
  eval_recall: 0.6867
  eval_runtime: 3.4634
  eval_samples_per_second: 239.6500
  eval_steps_per_second: 30.0280
  epoch: 3.0000


In [None]:
# Make sample predictions
def predict_sample(text):
    """Classify a single text with the fine-tuned model.

    Args:
        text: The raw input string.

    Returns:
        Dict with the input ``text``, the ``predicted_label`` name, the
        ``predicted_class`` id, the ``confidence`` (softmax probability of the
        predicted class) and ``all_probabilities`` (one value per class).
    """
    # Make sure dropout is disabled; do not rely on an earlier evaluation
    # cell having left the model in eval mode.
    model.eval()
    # A single sequence needs no padding; truncation still caps it at 512 tokens.
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    # Send the inputs to wherever the model's weights currently live.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=-1)[0]
    predicted_class = int(torch.argmax(probabilities).item())
    confidence = probabilities[predicted_class].item()
    # label_mapping is keyed by integer class id
    predicted_label = label_mapping[predicted_class]
    return {
        "text": text,
        "predicted_label": predicted_label,
        "predicted_class": predicted_class,
        "confidence": confidence,
        "all_probabilities": probabilities.cpu().numpy().tolist()
    }

print("\n Sample Predictions:")
# Hand-written examples used as a quick sanity check of the model.
sample_texts = [
    "I've been feeling really down lately and can't seem to get out of bed.",
    "My anxiety is through the roof today, I can't stop worrying about everything.",
    "I'm having trouble sleeping and my thoughts are racing constantly."
]

divider = "-" * 40
for sample in sample_texts:
    outcome = predict_sample(sample)
    preview = sample[:50]  # show only the first 50 characters
    print(f"Text: {preview}...")
    print(f"Prediction: {outcome['predicted_label']} (confidence: {outcome['confidence']:.3f})")
    print(divider)


 Sample Predictions:
Text: I've been feeling really down lately and can't see...
Prediction: Depression (confidence: 0.450)
----------------------------------------
Text: My anxiety is through the roof today, I can't stop...
Prediction: Anxiety (confidence: 0.816)
----------------------------------------
Text: I'm having trouble sleeping and my thoughts are ra...
Prediction: Anxiety (confidence: 0.384)
----------------------------------------


## Key Hyperparameters

- **learning_rate**: Step size for gradient descent. Small = stable, slow; large = fast, risky. *(2e-5)*
- **per_device_train_batch_size**: Batch size per device. Larger = stable gradients, more memory. *(8; effective batch size 32 with gradient accumulation of 4)*
- **num_train_epochs**: Training cycles over data. More = better fit, risk of overfitting. *(3)*
- **weight_decay**: L2 regularization to prevent overfitting. *(0.01)*
- **warmup_steps**: Gradual LR increase to stabilize early training. *(500)*
- **gradient_accumulation_steps**: Combines gradients over steps to mimic larger batch size. *(4)*

---

## Additional Tuning Options

- **max_length**: Max token length. Short = faster, less info; long = more context, higher cost. *(512)*
- **dropout rate**: Controls regularization; reduces overfitting.
- **LR scheduler**: Strategy for adjusting learning rate (e.g., linear, cosine).


In [None]:
# Candidate values for each tunable hyperparameter.
# Not listed here but also tunable:
#   - dropout (set in the model config; default is often 0.1 for DistilBERT),
#     e.g. [0.1, 0.2, 0.3]
#   - lr_scheduler_type (default is 'linear'), e.g. ["linear", "cosine"]
hyperparameter_ranges = dict(
    learning_rate=[1e-5, 2e-5, 3e-5, 5e-5],
    per_device_train_batch_size=[8, 16, 32],
    num_train_epochs=[3, 5, 10],
    weight_decay=[0.0, 0.01, 0.1],
    warmup_steps=[0, 100, 500],
    gradient_accumulation_steps=[1, 2, 4, 8],
    max_length=[256, 512],
)

# Hyperparameter Tuning with Optuna

### Optimization Strategy
- **Search Space**:
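  - Learning rate: 1e-5 to 5e-5 (log scale)
  - Batch sizes: [8, 16, 32]
  - Epochs: [3, 5, 10]
  - Weight decay: [0.0, 0.01, 0.1]
  - Warmup steps: [0, 100, 500]
  - Gradient accumulation steps: [1, 2, 4, 8]
  - Sequence lengths: [256, 512]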
  
- **Optimization Target**: Maximize validation accuracy

### Key Features
- **Automatic Exploration**: Tests 5 different configurations
- **Fresh Start**: Reinitializes model weights for each trial
- **Efficient Setup**:
  - Disables checkpointing during search
  - Uses GPU acceleration when available




In [None]:
import optuna

# Define the objective function for Optuna

# Tokenized copies of `dataset`, keyed by max_length. Only two lengths are in
# the search space, so trials that repeat a length reuse the earlier result.
_tokenized_by_max_length = {}


def _tokenize_for_max_length(max_length):
    """Return `dataset` tokenized to `max_length`, tokenizing once per length."""
    if max_length not in _tokenized_by_max_length:
        def tokenize_and_format(examples):
            tokens = tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=max_length
            )
            tokens["labels"] = examples["label_id"]
            return tokens

        tokenized = dataset.map(
            tokenize_and_format,
            batched=True,
            remove_columns=['label', 'label_id', 'text']
        )
        tokenized.set_format("torch")
        _tokenized_by_max_length[max_length] = tokenized
    return _tokenized_by_max_length[max_length]


def objective(trial):
    """Train one model with trial-suggested hyperparameters and score it.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Validation accuracy (``eval_accuracy``) measured after training ends.
    """
    import gc

    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [3, 5, 10])
    weight_decay = trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.1])
    # NOTE(review): for large batch/accumulation settings the 500-step warmup
    # option can exceed the total number of optimizer steps in a short run.
    warmup_steps = trial.suggest_categorical("warmup_steps", [0, 100, 500])
    gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4, 8])
    max_length = trial.suggest_categorical("max_length", [256, 512])

    # Dataset tokenized with the suggested max_length (cached across trials)
    tokenized_dataset_trial = _tokenize_for_max_length(max_length)

    # Checkpointing is disabled during the search; only the final score matters.
    training_args_trial = TrainingArguments(
        output_dir=f"results/optuna_trial_{trial.number}",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        eval_strategy="epoch",
        save_strategy="no",
        load_best_model_at_end=False,
        push_to_hub=False,
        logging_dir=f"results/optuna_trial_{trial.number}/logs",
        logging_steps=100,
        save_total_limit=0,
        dataloader_num_workers=2,
        warmup_steps=warmup_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=torch.cuda.is_available(),
        report_to=[],
    )

    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Re-initialize the model for each trial so no trial inherits trained weights
    model_trial = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=dataset_info["num_classes"],
        ignore_mismatched_sizes=True,
    ).to(device)

    trial_trainer = Trainer(
        model=model_trial,
        args=training_args_trial,
        train_dataset=tokenized_dataset_trial["train"],
        eval_dataset=tokenized_dataset_trial["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    try:
        trial_trainer.train()
        # Evaluate on the validation set
        eval_results = trial_trainer.evaluate()
    finally:
        # Drop this trial's model before the next one is created so GPU
        # memory does not build up over the study.
        del trial_trainer, model_trial
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Return the metric to optimize
    return eval_results["eval_accuracy"]

# Create an Optuna study that maximizes the objective's return value.
# The sampler is seeded so the sequence of suggested hyperparameters is the
# same on every run of the notebook.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run the optimization
print("Starting Optuna hyperparameter tuning...")
study.optimize(objective, n_trials=5)

print("Optuna hyperparameter tuning completed!")
print("Best hyperparameters:", study.best_params)
print("Best accuracy:", study.best_value)

[I 2025-08-02 16:29:54,884] A new study created in memory with name: no-name-47c9fae6-98dc-4cc4-be5c-1dd8e02fb882


Starting Optuna hyperparameter tuning...


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.5006,0.892206,0.703257,0.702218,0.708946,0.703257
2,0.8334,0.701937,0.756333,0.75728,0.761491,0.756333
3,0.609,0.67704,0.78649,0.786917,0.797345,0.78649
4,0.4131,0.704916,0.784077,0.783225,0.787509,0.784077
5,0.2167,0.744367,0.788902,0.787895,0.791117,0.788902
6,0.1405,0.771374,0.792521,0.792287,0.798167,0.792521
7,0.1069,0.801404,0.802171,0.802899,0.805447,0.802171
8,0.0731,0.858886,0.794934,0.795367,0.798297,0.794934
9,0.0811,0.874896,0.803378,0.804244,0.807707,0.803378
10,0.0403,0.87457,0.804584,0.804323,0.805925,0.804584


[I 2025-08-02 16:39:03,829] Trial 0 finished with value: 0.804583835946924 and parameters: {'learning_rate': 2.9337206417264026e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 10, 'weight_decay': 0.0, 'warmup_steps': 100, 'gradient_accumulation_steps': 4, 'max_length': 512}. Best is trial 0 with value: 0.804583835946924.


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.188732,0.570567,0.566712,0.582951,0.570567
2,No log,0.838577,0.717732,0.71783,0.737943,0.717732
3,No log,0.728124,0.756333,0.757965,0.7732,0.756333
4,0.977500,0.691867,0.778046,0.777377,0.783124,0.778046
5,0.977500,0.638792,0.788902,0.789746,0.794408,0.788902
6,0.977500,0.660394,0.792521,0.792705,0.798364,0.792521
7,0.366300,0.650908,0.797346,0.797714,0.799574,0.797346
8,0.366300,0.65418,0.79614,0.79648,0.798266,0.79614
9,0.366300,0.663254,0.797346,0.797468,0.799691,0.797346
10,0.161500,0.662546,0.79614,0.796337,0.798467,0.79614


[I 2025-08-02 16:43:17,033] Trial 1 finished with value: 0.7961399276236429 and parameters: {'learning_rate': 4.167868320377578e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 10, 'weight_decay': 0.1, 'warmup_steps': 0, 'gradient_accumulation_steps': 8, 'max_length': 256}. Best is trial 0 with value: 0.804583835946924.


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2895,0.913641,0.696019,0.693764,0.700201,0.696019
2,0.7785,0.760067,0.732207,0.731962,0.743747,0.732207
3,0.533,0.685517,0.772014,0.771789,0.775808,0.772014
4,0.4231,0.685362,0.764777,0.76477,0.768071,0.764777
5,0.3727,0.670896,0.778046,0.778454,0.780518,0.778046


[I 2025-08-02 16:47:56,618] Trial 2 finished with value: 0.7780458383594693 and parameters: {'learning_rate': 1.5168115145221602e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'weight_decay': 0.0, 'warmup_steps': 100, 'gradient_accumulation_steps': 1, 'max_length': 512}. Best is trial 0 with value: 0.804583835946924.


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.563968,0.340169,0.28715,0.423721,0.340169
2,1.544700,1.168276,0.599517,0.603091,0.634048,0.599517
3,1.544700,0.944488,0.702051,0.702434,0.702917,0.702051


[I 2025-08-02 16:49:27,686] Trial 3 finished with value: 0.7020506634499397 and parameters: {'learning_rate': 1.3192294743994892e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.1, 'warmup_steps': 100, 'gradient_accumulation_steps': 8, 'max_length': 256}. Best is trial 0 with value: 0.804583835946924.


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.575163,0.347407,0.27712,0.455302,0.347407
2,No log,1.206629,0.548854,0.546936,0.600321,0.548854
3,No log,0.783899,0.738239,0.737006,0.746027,0.738239
4,1.247100,0.711277,0.761158,0.761962,0.768478,0.761158
5,1.247100,0.662597,0.780458,0.779407,0.782893,0.780458
6,1.247100,0.671772,0.791315,0.793372,0.801926,0.791315
7,0.392300,0.648526,0.802171,0.801408,0.802684,0.802171
8,0.392300,0.674702,0.80579,0.807169,0.81435,0.80579
9,0.392300,0.66621,0.810615,0.810715,0.811776,0.810615
10,0.127400,0.671981,0.811821,0.811802,0.814081,0.811821


[I 2025-08-02 16:53:30,239] Trial 4 finished with value: 0.8118214716525934 and parameters: {'learning_rate': 4.498868200475396e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 10, 'weight_decay': 0.1, 'warmup_steps': 100, 'gradient_accumulation_steps': 4, 'max_length': 256}. Best is trial 4 with value: 0.8118214716525934.


Optuna hyperparameter tuning completed!
Best hyperparameters: {'learning_rate': 4.498868200475396e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 10, 'weight_decay': 0.1, 'warmup_steps': 100, 'gradient_accumulation_steps': 4, 'max_length': 256}
Best accuracy: 0.8118214716525934



**Best Configuration Achieved 81.2% Validation Accuracy**  
*(Significant improvement from the baseline's 73.3% validation accuracy; neither figure is a test-set score)*
<br>
<br>
<hr>


# Final Model Training



In [None]:
print("Training final model with best hyperparameters...")

# Get best hyperparameters from Optuna study
best_params = study.best_params

# Define TrainingArguments with best hyperparameters
training_args_best = TrainingArguments(
    output_dir="results/final_model",  # Output directory for the final model
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    weight_decay=best_params["weight_decay"],
    eval_strategy="epoch",
    save_strategy="epoch",  # Save checkpoints every epoch
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
    # The search maximised validation accuracy, so pick the best checkpoint by
    # accuracy too. Left unset, the Trainer would select on validation loss,
    # which can favour a different epoch.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    push_to_hub=False,
    logging_dir="results/final_model/logs",
    logging_steps=100,
    save_total_limit=2,  # Limit how many checkpoints are kept on disk
    dataloader_num_workers=2,
    warmup_steps=best_params["warmup_steps"],
    gradient_accumulation_steps=best_params["gradient_accumulation_steps"],
    fp16=torch.cuda.is_available(),
    report_to=[],
    run_name="final_model_training",
)

# Put the class names into the final model's config so saved checkpoints
# report real label names instead of generic LABEL_<n> placeholders.
final_id2label = {int(idx): str(name) for idx, name in label_mapping.items()}
final_label2id = {name: idx for idx, name in final_id2label.items()}

# Re-initialize from the pretrained checkpoint so the final run does not
# start from weights trained in an earlier cell.
model_best = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=dataset_info["num_classes"],
    id2label=final_id2label,
    label2id=final_label2id,
    ignore_mismatched_sizes=True,
).to(device)

# Tokenize again for the final run, using the sequence length Optuna selected.
best_max_length = best_params["max_length"]
print(f"Retokenizing full dataset with best max_length: {best_max_length}")


def tokenize_and_format_final(examples):
    """Encode a batch to `best_max_length` tokens and attach integer labels."""
    encoded = tokenizer(
        examples["text"],
        max_length=best_max_length,
        truncation=True,
        padding="max_length",
    )
    encoded["labels"] = examples["label_id"]
    return encoded


# Drop the raw columns so only model inputs remain, then expose torch tensors.
tokenized_dataset_final = dataset.map(
    tokenize_and_format_final,
    batched=True,
    remove_columns=['label', 'label_id', 'text'],
)
tokenized_dataset_final.set_format("torch")


# Build the trainer for the final run: fresh model, tuned arguments, and the
# re-tokenized train/validation splits.
trainer_best = Trainer(
    model=model_best,
    args=training_args_best,
    train_dataset=tokenized_dataset_final["train"],
    eval_dataset=tokenized_dataset_final["validation"],
    # `tokenizer=` is deprecated on Trainer.__init__ in recent transformers
    # releases and emits a FutureWarning; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the final fine-tuning.
print("Final training started...")
trainer_best.train()
print("Final training completed!")

Training final model with best hyperparameters...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retokenizing full dataset with best max_length: 256


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Map:   0%|          | 0/829 [00:00<?, ? examples/s]

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

  trainer_best = Trainer(


Final training started...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.569817,0.302774,0.231293,0.290585,0.302774
2,No log,1.219848,0.558504,0.549816,0.584102,0.558504
3,No log,0.817149,0.722557,0.720344,0.729574,0.722557
4,1.265200,0.717299,0.756333,0.756937,0.76322,0.756333
5,1.265200,0.658588,0.784077,0.782876,0.790168,0.784077
6,1.265200,0.639122,0.806996,0.809222,0.815893,0.806996
7,0.400100,0.629611,0.809409,0.809271,0.809301,0.809409
8,0.400100,0.651358,0.816647,0.817569,0.821635,0.816647
9,0.400100,0.665128,0.819059,0.819181,0.820344,0.819059
10,0.119900,0.670402,0.813028,0.81382,0.817003,0.813028


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Final training completed!


In [None]:
print("Evaluating final model on test set...")

# Score the held-out test split with the trainer from the final training run.
test_split = tokenized_dataset_final["test"]
results_best = trainer_best.evaluate(eval_dataset=test_split)

print("\nFinal Model Test Results:")
for metric_name in results_best:
    print(f"  {metric_name}: {results_best[metric_name]:.4f}")

Evaluating final model on test set...



Final Model Test Results:
  eval_loss: 0.8044
  eval_accuracy: 0.7675
  eval_f1: 0.7676
  eval_precision: 0.7704
  eval_recall: 0.7675
  eval_runtime: 2.5956
  eval_samples_per_second: 319.7700
  eval_steps_per_second: 40.0680
  epoch: 10.0000


In [None]:
print("\nSample Predictions with Final Model:")
# Hand-written English examples; the trailing comment on each is the category
# the author had in mind, not a ground-truth label.
sample_texts = [
    "I've been feeling really down lately and can't seem to get out of bed. The world feels heavy and meaningless, and I've lost interest in everything I used to enjoy. It's a constant struggle to get through the day.", # Depression
    "My anxiety is through the roof today, I can't stop worrying about everything. My heart is racing, my palms are sweaty, and I have this constant knot in my stomach. I'm scared to even leave the house.", # Anxiety
    "I'm having trouble sleeping and my thoughts are racing constantly. One minute I'm incredibly energetic and feel like I can conquer the world, the next I'm in the depths of despair and can barely function. These mood swings are exhausting.", # Bipolar disorder
    "I have mood swings that are out of control, one minute I'm fine, the next I'm irritated. I struggle to maintain stable relationships because I'm so afraid of abandonment and often act impulsively, pushing people away.", # Personality disorder (assuming this example fits)
    "I feel like people are always judging me and I can't form healthy relationships. Even in simple social situations, I feel intense scrutiny and self-doubt. It's hard to trust others, and I often isolate myself to avoid potential criticism or rejection.", # Social Anxiety/Personality Disorder (adjusting to fit categories)
    "The pressure at work has been immense lately. Deadlines are piling up, and I feel completely overwhelmed. I'm constantly worried about not being good enough and it's starting to affect my sleep and overall well-being.", # Stress
    "I had a traumatic experience a while back, and the memories keep flooding back. I have nightmares and flashbacks, and I'm constantly on edge. Loud noises or unexpected events trigger intense fear and panic.", # Could potentially align with Stress or Anxiety depending on specifics, adding for variety
    "I feel so overwhelmed by everything in my life right now. School, work, relationships... it's all too much. I can't seem to catch a break and I'm constantly stressed about falling behind or disappointing someone." # Stress/Anxiety
]

# Put the model on the target device and switch it to inference mode.
# Without eval(), dropout stays active whenever this cell runs straight after
# training, which makes the predictions and confidences non-deterministic.
model_best.to(device)
model_best.eval()

for text in sample_texts:
    # One sequence per forward pass, so no padding is needed; truncate to the
    # same maximum length that was used for training.
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=best_max_length,
        return_tensors="pt",
    )
    # Move every input tensor to the model's device.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model_best(**inputs)
        predictions = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()
    # Map the predicted class index to its label name.
    predicted_label = label_mapping[predicted_class]
    print(f"Text: {text[:70]}...")  # Show only the first 70 characters
    print(f"Prediction: {predicted_label} (confidence: {confidence:.3f})")
    print("-" * 40)

def metric_display_name(key):
    """Return a metric key without its leading "eval_" prefix.

    Only a leading prefix is removed; an "eval_" appearing later in the key is
    kept. Keys that do not start with "eval_" are returned unchanged.
    """
    prefix = "eval_"
    return key[len(prefix):] if key.startswith(prefix) else key


# Report Final Results
print("\n--- Comprehensive Final Model Performance Report ---")
print("This report summarizes the hyperparameter tuning process and the performance of the final fine-tuned model.")

# The tuning section is printed only when an Optuna study exists in the
# notebook namespace.
if globals().get("study") is not None:
    print("\nHyperparameter Tuning Summary (using Optuna):")
    print(f"  Number of trials completed: {len(study.trials)}")
    print(f"  Best validation metric (accuracy): {study.best_value:.4f}")
    print("  Best hyperparameters found:")
    for hp, value in study.best_params.items():
        print(f"    - {hp}: {value}")
else:
    print("\nHyperparameter tuning was not performed in this run.")


print("\nFinal Model Test Set Metrics:")
print("The following metrics are based on the evaluation of the final model on the held-out test set.")
# Same guard for the test-set results: report them only if they were computed.
if globals().get("results_best") is not None:
    for key, value in results_best.items():
        if key.startswith("eval_"):
            print(f"  {metric_display_name(key)}: {value:.4f}")
else:
    print("Final model evaluation results are not available.")



Sample Predictions with Final Model:
Text: I've been feeling really down lately and can't seem to get out of bed....
Prediction: Depression (confidence: 0.954)
----------------------------------------
Text: My anxiety is through the roof today, I can't stop worrying about ever...
Prediction: Anxiety (confidence: 0.982)
----------------------------------------
Text: I'm having trouble sleeping and my thoughts are racing constantly. One...
Prediction: Bipolar disorder (confidence: 0.610)
----------------------------------------
Text: I have mood swings that are out of control, one minute I'm fine, the n...
Prediction: Depression (confidence: 0.703)
----------------------------------------
Text: I feel like people are always judging me and I can't form healthy rela...
Prediction: Personality disorder (confidence: 0.974)
----------------------------------------
Text: The pressure at work has been immense lately. Deadlines are piling up,...
Prediction: Anxiety (confidence: 0.650)
---------