In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("data\\train_dm_transcription.csv" , header = "infer")
# dropping rows with null values 
df = df.dropna()
df

Unnamed: 0,file,label,path,transcription
0,daningram_15,dementia,data/dementia/Dan Ingram/daningram_15.wav,"Long before the blackout. All right, yeah. Bla..."
1,terryjones_5,dementia,data/dementia/Terry Jones/terryjones_5.wav,"Well, yeah, but then it was the government tha..."
2,maureenforrester_5,dementia,data/dementia/Maureen Forrester/maureenforrest...,"You know, if you've ever gone to a place, a ho..."
3,aileenhernandez_0,dementia,data/dementia/Aileen Hernandez/aileenhernandez...,This is not going to sound like very ladylike....
4,aileenhernandez_5_1,dementia,data/dementia/Aileen Hernandez/aileenhernandez...,"I arrive at my first political science class, ..."
...,...,...,...,...
150,Angela Lansbury_1,nodementia,data/nodementia/Angela Lansbury/Angela Lansbur...,"Of course, I'd always been associated with mov..."
151,Angela Lansbury_3,nodementia,data/nodementia/Angela Lansbury/Angela Lansbur...,I only really come alive as an interesting per...
152,Angela Lansbury_2,nodementia,data/nodementia/Angela Lansbury/Angela Lansbur...,Opened the door to a solution in my mind. Our ...
225,BobNewhart_2,nodementia,data/nodementia/Bob Newhart/BobNewhart_2.wav,That's a tough. I don't know. When Bob wasn't ...


In [4]:
df = df[['label' , 'transcription']]
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_column('transcription', 'text')
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.rename_column('transcription', 'text')
val_dataset = val_dataset.rename_column('transcription', 'text')


dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': val_dataset
})

dataset_dict


DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 99
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 25
    })
})

In [5]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def tokenize(example):
    text = example.get('text', "")
    label = example.get('label')  
    
    if text is None:
        text = ""  
    encoded_dict = tokenizer.encode_plus(
        text,                         
        add_special_tokens=True,      
        max_length=128,                
        padding='max_length',         
        truncation=True,              
        return_tensors='pt'          
    )
    
    input_ids = encoded_dict['input_ids'].squeeze(0)
    attention_mask = encoded_dict['attention_mask'].squeeze(0)
    label_int = 1

    if label=='dementia':
        label_int = 0
    if label=='nodementia':
        label_int = 1 
     
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'label': label_int
    }






In [6]:
dataset_dict = dataset_dict.map(tokenize, batched=False)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [7]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn 
Bert_layer = BertModel.from_pretrained('bert-base-uncased')

class BertModel(nn.Module):  
    def __init__(self, bert_layer, num_classes):
        super(BertModel, self).__init__()  
        self.bert = bert_layer
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)  

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.pooler_output  
        x = self.fc(x)
        x = nn.Softmax(dim=1)(x)  
        return x

In [8]:
classifier = BertModel(Bert_layer , num_classes=4)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.00001)

In [9]:
inps = []
atts = []
lab = []
i = 0
for data in dataset_dict['train']:
    if i > 10000 : 
        break 
    i+=1
    inp = data['input_ids']
    inp = torch.tensor(inp).unsqueeze(0) 
    inp = inp.squeeze(1)
    att = data['attention_mask']
    att = torch.tensor(att).unsqueeze(0) 
    att = att.squeeze(1)
    inps.append(inp)
    atts.append(att)
    lab.append(data['label'])

inps = torch.stack(inps)
atts = torch.stack(atts)

In [10]:
inps = inps.squeeze(1)
print(inps.shape)
atts = atts.squeeze(1)
atts.shape

torch.Size([99, 128])


torch.Size([99, 128])

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Subset
from tqdm import tqdm  
labels = torch.tensor(lab)
dataset = TensorDataset(inps, atts, labels)
data_loader = DataLoader(dataset, batch_size=10, shuffle=True)
for epoch in range(10):
    classifier.train()
    total_loss = 0
    correct_predictions = 0
    for batch in tqdm(data_loader, desc=f"Epoch {epoch+1}", unit="batch"):
        input_ids, attention_mask, labels = batch  
        optimizer.zero_grad()
        outputs = classifier(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        preds = torch.argmax(outputs, dim=1)
        correct_predictions+=(preds == labels).sum().item()
    avg_loss = total_loss/len(dataset)
    accuracy = correct_predictions/len(dataset)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1: 100%|██████████| 10/10 [02:09<00:00, 12.94s/batch]


Epoch 1 - Loss: 0.1296, Accuracy: 0.6364


Epoch 2: 100%|██████████| 10/10 [03:35<00:00, 21.53s/batch]


Epoch 2 - Loss: 0.1133, Accuracy: 0.7475


Epoch 3: 100%|██████████| 10/10 [02:08<00:00, 12.88s/batch]


Epoch 3 - Loss: 0.1074, Accuracy: 0.7475


Epoch 4: 100%|██████████| 10/10 [03:07<00:00, 18.79s/batch]


Epoch 4 - Loss: 0.1041, Accuracy: 0.7475


Epoch 5: 100%|██████████| 10/10 [01:55<00:00, 11.53s/batch]


Epoch 5 - Loss: 0.1027, Accuracy: 0.7475


Epoch 6: 100%|██████████| 10/10 [01:56<00:00, 11.68s/batch]


Epoch 6 - Loss: 0.1020, Accuracy: 0.7475


Epoch 7: 100%|██████████| 10/10 [02:07<00:00, 12.75s/batch]


Epoch 7 - Loss: 0.1018, Accuracy: 0.7475


Epoch 8: 100%|██████████| 10/10 [01:55<00:00, 11.51s/batch]


Epoch 8 - Loss: 0.1013, Accuracy: 0.7475


Epoch 9: 100%|██████████| 10/10 [03:38<00:00, 21.86s/batch]


Epoch 9 - Loss: 0.1013, Accuracy: 0.7475


Epoch 10: 100%|██████████| 10/10 [01:57<00:00, 11.80s/batch]

Epoch 10 - Loss: 0.1013, Accuracy: 0.7475





In [13]:
# testing the model 
classifier.eval()
total_loss = 0
correct_predictions = 0
with torch.no_grad():
    for batch in tqdm(data_loader, desc="Testing", unit="batch"):
        input_ids, attention_mask, labels = batch  
        outputs = classifier(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct_predictions+=(preds == labels).sum().item()
    avg_loss = total_loss/len(dataset)
    accuracy = correct_predictions/len(dataset)
    print(f"Testing - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
# saving the model
torch.save(classifier.state_dict(), "bert_model.pth")

Testing: 100%|██████████| 10/10 [00:40<00:00,  4.09s/batch]


Testing - Loss: 0.1013, Accuracy: 0.7475
