<h1><center>Dementia Disease Prediction - 2 </center></h1>

## Reading and understanding the Data

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import f1_score

nltk.download("stopwords")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wathsalya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load dataset
file_path = "dementia_dataset_4.csv"
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,Transcript_CTD,Transcript_PFT,Transcript_SFT,Class_label
0,,"Pat: People, partner, plate, platter, pants, p...","Pat: (3 seconds) Giraffe, kangaroo, lion, tige...",1
1,Pat: (4 seconds) There’s a lad stood on the st...,"Pat: (1 second) Pipe, plane, people (5 seconds...","Pat: (1 second) Dogs, cats, birds (1 second) m...",1
2,"Pat: (3 seconds) Um, the picture is of a kitch...","Pat: Um (1 second) purple, pale, placid (1 sec...","Pat: Cow, bull, ewe, ram, chicken, goose, um (...",1
3,"Pat: A mother presumably, or a fe, an adult fe...",Pat: Plank (1 second) pool (1 second) swimming...,Pat: Um (1 second) impala (1 second) er cheeta...,1
4,Pat: ‘50s style er scene of domestic um confus...,"Pat: It’s er pillock, er post box, er Pyracant...","Pat: Dog, cat, giraffe, wallaby, kangaroo, tor...",1


In [4]:
df.tail()

Unnamed: 0,Transcript_CTD,Transcript_PFT,Transcript_SFT,Class_label
3138,Pat: The sink is overflowing; the woman doing ...,"Pat: Um, precise, prescient, er procrastinatio...","Pat: Um, well (buzzer sounds) er cat, dog, rab...",0
3139,Pat: I see a scene of absolute chaos in this p...,"Pat: (Buzzer sounds) Picture, plate, palm, pho...","Pat: Cat, lion, tiger (buzzer sounds) oh. Cat,...",0
3140,Pat: Little boy falling off a chair whilst pas...,"Pat: Countries beginning with P: Paraguay, Por...","Pat: Horse, dog, cat, pig, hen (1 second) walr...",0
3141,"Pat: (3 seconds) Er, little boy stood on a sto...","Pat: Phew, phew, phew (7 seconds) Phidi, Phila...","Pat: Pig, cat, dog (buzzer sounds) pig, cat, d...",1
3142,"Pat: (3 seconds) OK, well there’s a boy stood ...","Pat: (3 seconds) Er (2 seconds) paternity, pet...","Pat: Ooh. Armadillo, antelope, bear, buffalo, ...",0


In [5]:
df.shape

(3143, 4)

In [6]:
df["label"] = df["Class_label"]

print(df['label'].value_counts(normalize=True))

label
1    0.503341
0    0.496659
Name: proportion, dtype: float64


In [7]:
# Enhanced text cleaning function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"\s+", " ", text)  # Preserve punctuation
    return text.strip()

In [8]:
# Apply cleaning to text column
df["Transcript_CTD"] = df["Transcript_CTD"].fillna("").apply(clean_text)
df["Transcript_PFT"] = df["Transcript_PFT"].fillna("").apply(clean_text)
df["Transcript_SFT"] = df["Transcript_SFT"].fillna("").apply(clean_text)


In [9]:
# Combine the three text columns
df["combined_text"] = (
    df["Transcript_CTD"] + " " + df["Transcript_PFT"] + " " + df["Transcript_SFT"]   
)

In [10]:
# Prepare train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["combined_text"], df["label"], test_size=0.2, random_state=42
)


In [11]:
# Ensure they are strings
train_texts = train_texts.astype(str)
test_texts = test_texts.astype(str)

In [12]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(
    list(train_texts), truncation=True, padding=True, max_length=256
)
test_encodings = tokenizer(
    list(test_texts), truncation=True, padding=True, max_length=256
)

In [13]:
# Dataset Wrapper
class DatasetWrapper(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [14]:
train_dataset = DatasetWrapper(train_encodings, list(train_labels))
test_dataset = DatasetWrapper(test_encodings, list(test_labels))

In [15]:
# Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Updated training arguments to optimize for the entire dataset
training_args = TrainingArguments(
    num_train_epochs=4,  # Minimum for meaningful learning
    warmup_ratio=0.1,  # 10% of total steps instead of fixed 200
    per_device_train_batch_size=8,  # Increase if GPU memory allows
    gradient_accumulation_steps=2,
)

In [17]:
# Metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_macro': f1_score(labels, preds, average='macro')
    }


In [18]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


In [19]:
# Train
trainer.train()

# Evaluate
results = trainer.evaluate()
print(f"Accuracy: {results['eval_accuracy']}")
print(f"Loss: {results['eval_loss']}")


Step,Training Loss
500,0.3672


Accuracy: 0.9062003179650239
Loss: 0.38061755895614624
