In [3]:
from google.colab import files
import pandas as pd

# Upload your training and test files
train_file = files.upload()
test_file = files.upload()

# # Load datasets
train_df = pd.read_csv("train-00000-of-00001.csv")
test_df = pd.read_csv("test-00000-of-00001.csv")
train_df.head()

Unnamed: 0,id,patient_question,distorted_part,dominant_distortion
0,2488,"I have PTSD, have been in a few abusive relati...","I am now 6 months pregnant, lately (past 3 or ...",7
1,453,From the U.S.: I am 13 years old. At the age o...,"thers, to my friends especially, I appear brig...",8
2,1959,"Hi..in the past, Iâ€™d say about 2 years, on an...",I feel like it is not worth the trouble of de...,0
3,800,"From a 15 year old in the U.S.: Hi, Iâ€™ve had t...","Hi, Iâ€™ve had this problem for 4 months itâ€™s be...",9
4,4550,"In the past few years, I havenâ€™t been getting ...","In the past few years, I havenâ€™t been getting ...",1


In [4]:
print(train_df.info())
print(train_df.isnull().sum())

# Fill missing text with empty string
train_df = train_df.fillna("")
test_df = test_df.fillna("")

# Check unique distortion labels
print(train_df["dominant_distortion"].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2024 entries, 0 to 2023
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   2024 non-null   int64 
 1    patient_question    2024 non-null   object
 2   distorted_part       2024 non-null   object
 3   dominant_distortion  2024 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 63.4+ KB
None
id                     0
 patient_question      0
distorted_part         0
dominant_distortion    0
dtype: int64
dominant_distortion
2     771
8     187
7     183
3     148
10    129
5     124
4     111
1     106
9      98
6      87
0      80
Name: count, dtype: int64


In [5]:
print("Train columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())


Train columns: ['id', ' patient_question', 'distorted_part', 'dominant_distortion']
Test columns: ['id', ' patient_question', 'distorted_part', 'dominant_distortion']


In [6]:
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

In [7]:
print("Train columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())

Train columns: ['id', 'patient_question', 'distorted_part', 'dominant_distortion']
Test columns: ['id', 'patient_question', 'distorted_part', 'dominant_distortion']


In [8]:
train_df["input_text"] = train_df["patient_question"] + " " + train_df["distorted_part"]
test_df["input_text"] = test_df["patient_question"] + " " + test_df["distorted_part"]

In [9]:
# from sklearn.preprocessing import LabelEncoder

# encoder = LabelEncoder()
# train_df["label"] = encoder.fit_transform(train_df["dominant_distortion"])

# # Save the encoder classes to decode later
# label_classes = list(encoder.classes_)
# print("Classes:", label_classes)


from sklearn.preprocessing import LabelEncoder

# Drop missing labels
train_df = train_df.dropna(subset=["dominant_distortion"])
test_df = test_df.dropna(subset=["dominant_distortion"])

# Encode labels
encoder = LabelEncoder()
train_df["label"] = encoder.fit_transform(train_df["dominant_distortion"])
test_df["label"] = encoder.transform(test_df["dominant_distortion"])

# Check label distribution
print("Label distribution:\n", train_df["dominant_distortion"].value_counts())

# Save encoder classes
label_classes = list(encoder.classes_)
print("Classes:", label_classes)

Label distribution:
 dominant_distortion
2     771
8     187
7     183
3     148
10    129
5     124
4     111
1     106
9      98
6      87
0      80
Name: count, dtype: int64
Classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)]


In [10]:
# from sklearn.model_selection import train_test_split

# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     train_df["input_text"].tolist(),
#     train_df["label"].tolist(),
#     test_size=0.3,
#     random_state=42,
#     stratify=train_df["label"]
# )


from sklearn.model_selection import train_test_split

# Shuffle to ensure random order
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Train-validation split (15% val)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["input_text"].tolist(),
    train_df["label"].tolist(),
    test_size=0.15,
    random_state=42,
    stratify=train_df["label"]
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 1720
Validation samples: 304


In [11]:
# !pip install transformers datasets evaluate -q
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
# from datasets import Dataset

# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# # Tokenize
# train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
# val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)
# test_encodings = tokenizer(test_df["input_text"].tolist(), truncation=True, padding=True, max_length=256)

# train_dataset = Dataset.from_dict({**train_encodings, "labels": train_labels})
# val_dataset = Dataset.from_dict({**val_encodings, "labels": val_labels})




!pip install transformers datasets evaluate -q
from transformers import AutoTokenizer
from datasets import Dataset
import re

# ðŸ”¹ Use public BERT model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# ðŸ”¹ Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_texts = [clean_text(t) for t in train_texts]
val_texts = [clean_text(t) for t in val_texts]
test_texts = [clean_text(t) for t in test_df["input_text"].tolist()]

# ðŸ”¹ Create datasets
train_dataset = Dataset.from_dict({"text": train_texts, "labels": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "labels": val_labels})
test_dataset = Dataset.from_dict({"text": test_texts})

# ðŸ”¹ Tokenization
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=384)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1720 [00:00<?, ? examples/s]

Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Map:   0%|          | 0/506 [00:00<?, ? examples/s]

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_classes)
)

model.config.hidden_dropout_prob = 0.3
model.gradient_checkpointing_enable()
print("âœ… Model loaded successfully with", len(label_classes), "labels.")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


âœ… Model loaded successfully with 11 labels.


In [13]:
# import evaluate
# import numpy as np

# accuracy_metric = evaluate.load("accuracy")
# precision_metric = evaluate.load("precision")
# recall_metric = evaluate.load("recall")
# f1_metric = evaluate.load("f1")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = np.argmax(logits, axis=-1)
#     return {
#         "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
#         "precision": precision_metric.compute(predictions=preds, references=labels, average="weighted")["precision"],
#         "recall": recall_metric.compute(predictions=preds, references=labels, average="weighted")["recall"],
#         "f1": f1_metric.compute(predictions=preds, references=labels, average="weighted")["f1"]
#     }




import evaluate
import numpy as np

# Load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "precision_macro": precision_metric.compute(predictions=preds, references=labels, average="macro")["precision"],
        "precision_weighted": precision_metric.compute(predictions=preds, references=labels, average="weighted")["precision"],
        "recall_macro": recall_metric.compute(predictions=preds, references=labels, average="macro")["recall"],
        "recall_weighted": recall_metric.compute(predictions=preds, references=labels, average="weighted")["recall"],
        "f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": f1_metric.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=13,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1",
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# trainer.train()




from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",        # evaluate every epoch
    save_strategy="epoch",              # save best per epoch
    learning_rate=3e-5,                 # slightly higher for small datasets
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=12,                # 10â€“12 is a good sweet spot
    weight_decay=0.01,
    warmup_ratio=0.1,                   # stabilizes early training
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",  # match compute_metrics key
    fp16=True,                          # use GPU mixed precision if available
    logging_dir="./logs",
    logging_strategy="epoch",
    report_to="none",                   # no external logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # stops early if no improvement
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Precision Weighted,Recall Macro,Recall Weighted,F1 Macro,F1 Weighted
1,2.0256,1.542835,0.453947,0.100734,0.381465,0.174653,0.453947,0.10999,0.389632
2,1.4851,1.494173,0.453947,0.102881,0.390662,0.1779,0.453947,0.110933,0.393437
3,1.4502,1.480594,0.473684,0.119464,0.393362,0.188548,0.473684,0.13663,0.416251
4,1.4091,1.483719,0.460526,0.118542,0.392176,0.180487,0.460526,0.133245,0.410305
5,1.358,1.457186,0.473684,0.148438,0.429876,0.198803,0.473684,0.153892,0.432895
6,1.2514,1.453926,0.490132,0.209266,0.467336,0.225456,0.490132,0.202063,0.466488
7,1.1244,1.45317,0.509868,0.206787,0.462347,0.249966,0.509868,0.212354,0.471025
8,0.9984,1.502247,0.503289,0.216186,0.476527,0.25193,0.503289,0.22534,0.482356
9,0.8793,1.524015,0.486842,0.206778,0.460197,0.236149,0.486842,0.213886,0.467891
10,0.7557,1.543934,0.490132,0.249886,0.491387,0.252012,0.490132,0.241287,0.484626


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

TrainOutput(global_step=1296, training_loss=1.1700918232953106, metrics={'train_runtime': 361.0003, 'train_samples_per_second': 57.174, 'train_steps_per_second': 3.59, 'total_flos': 2050924456304640.0, 'train_loss': 1.1700918232953106, 'epoch': 12.0})

In [15]:
metrics = trainer.evaluate()
print(f"âœ… Final Model Evaluation:")
print(f"Accuracy:  {metrics['eval_accuracy']:.4f}")
print(f"Precision: {metrics['eval_precision_weighted']:.4f}")
print(f"Recall:    {metrics['eval_recall_weighted']:.4f}")
print(f"F1:        {metrics['eval_f1_weighted']:.4f}")

âœ… Final Model Evaluation:
Accuracy:  0.5000
Precision: 0.4960
Recall:    0.5000
F1:        0.4949


In [16]:
metrics = trainer.evaluate()
print(metrics)
print("Accuracy:", metrics["eval_accuracy"])

{'eval_loss': 1.5605837106704712, 'eval_accuracy': 0.5, 'eval_precision_macro': 0.257586032818231, 'eval_precision_weighted': 0.49598520617850095, 'eval_recall_macro': 0.2625741645842435, 'eval_recall_weighted': 0.5, 'eval_f1_macro': 0.2553168298967709, 'eval_f1_weighted': 0.4949010234807399, 'eval_runtime': 0.9374, 'eval_samples_per_second': 324.31, 'eval_steps_per_second': 20.269, 'epoch': 12.0}
Accuracy: 0.5


In [20]:
# test_pred = trainer.predict(test_dataset=Dataset.from_dict(test_encodings))
# pred_labels = np.argmax(test_pred.predictions, axis=-1)
# test_df["predicted_distortion"] = [label_classes[i] for i in pred_labels]

# test_df[["patient_question", "distorted_part", "predicted_distortion"]].head()

test_pred = trainer.predict(test_dataset)
pred_labels = np.argmax(test_pred.predictions, axis=-1)

# Assign predicted labels
test_df["predicted_distortion"] = [label_classes[i] for i in pred_labels]

# View results
test_df[["patient_question", "distorted_part", "predicted_distortion"]].head()

Unnamed: 0,patient_question,distorted_part,predicted_distortion
0,Hi I am a 21-year-old male student. Recently I...,But now I am really scared that I might get sc...,1
1,When I was about fourteen I thought I was a na...,\N,2
2,"Hi, Iâ€™m a 15 year old. I was abused and sexual...",Iâ€™m scared to talk about this with my therapis...,4
3,"Iâ€™m a seventeen year old student in school, an...","And im afraid because i know how bad i feel, a...",5
4,Could my boyfriend (age 24) be Bipolar? He doe...,I feel like he has split personalities. Many t...,4


In [21]:
from transformers import pipeline

# Use the trained model and tokenizer already in memory
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# ---- Try any sentence ----
sentence = "My parents refuse too believe anything is wrong with me and every day scream at me and break things as well as insult me about how useless I am and how I am always ruining their lives! My friends all say that Iâ€™m amazing and such a good friend but I have a hard time believing them when my OWN parents seem to hate me"

# Predict
result = classifier(sentence)[0]

# Extract label index (e.g., 'LABEL_4' â†’ 4)
pred_id = int(result['label'].split('_')[-1])
predicted_label = label_classes[pred_id]
confidence = round(result['score'] * 100, 2)

print(f"Predicted Distortion: {predicted_label}")
print(f"Confidence: {confidence}%")

Device set to use cuda:0


Predicted Distortion: 5
Confidence: 38.87%


In [22]:
model.save_pretrained("cognitive_distortion_model")
tokenizer.save_pretrained("cognitive_distortion_tokenizer")
test_df.to_csv("test_predictions.csv", index=False)
from google.colab import files
files.download("test_predictions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>