**DistilBERT**

In [None]:
import pandas as pd
# Read the labelled abstracts and print the relative frequency of each class.
df = pd.read_csv("Datasets/final_parkinsons_dataset.csv")
label_share = df["Label"].value_counts(normalize=True)
print(label_share)

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then keep the English list as a set so
# clean_text can do constant-time membership checks.
nltk.download("stopwords")
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalise one abstract before tokenization.

    Lower-cases ``text``, deletes every character that is not ``a``-``z``,
    whitespace or an apostrophe, and drops the tokens found in the
    module-level ``stop_words`` set. The surviving tokens are returned
    joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s']", "", lowered)
    kept_words = filter(lambda token: token not in stop_words, letters_only.split())
    return " ".join(kept_words)

# Coerce every abstract to str first so clean_text can call .lower() on it,
# then store the cleaned text next to the original and preview both columns.
abstracts_as_text = df["Abstract"].astype(str)
df["Cleaned_Abstract"] = abstracts_as_text.apply(clean_text)
preview = df[["Abstract", "Cleaned_Abstract"]].head()
print(preview)

In [None]:
from transformers import DistilBertTokenizer

# Tokenize all cleaned abstracts in a single call: sequences are truncated to
# at most 512 tokens, padded to a common length and returned as PyTorch tensors.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

cleaned_abstracts = df["Cleaned_Abstract"].tolist()
tokenizer_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
tokens = tokenizer(cleaned_abstracts, **tokenizer_options)

print(tokens["input_ids"].shape)
print("Attention Mask shape:", tokens["attention_mask"].shape)

In [None]:
from sklearn.model_selection import train_test_split
import torch

# Stratified 80/20 split with a fixed seed. train_test_split returns one
# (train, test) pair per array, in the order the arrays are passed:
# token ids, labels, attention masks.
labels = torch.tensor(df["Label"].values)

split_parts = train_test_split(
    tokens["input_ids"], labels, tokens["attention_mask"],
    test_size=0.2, random_state=42, stratify=labels
)
(train_inputs, test_inputs,
 train_labels, test_labels,
 train_masks, test_masks) = split_parts
print(f"Train Inputs: {train_inputs.shape}, Train Masks: {train_masks.shape}, Train Labels: {train_labels.shape}")
print(f"Test Inputs: {test_inputs.shape}, Test Masks: {test_masks.shape}, Test Labels: {test_labels.shape}")

In [None]:
import torch
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW

# Prefer GPU 6 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:6" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 6 if torch.cuda.device_count() > 6 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")

torch.cuda.empty_cache()

# Pretrained DistilBERT encoder with a two-class classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    torch_dtype=torch.float32
)

model.to(device)

# The optimizer is built after the model is on its device so it tracks the
# moved parameters.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

print(f"Model Loaded Successfully on {device}!")

In [None]:
from torch.utils.data import Dataset
class ParkinsonsDataset(Dataset):
    """Map-style dataset over pre-tokenized abstracts.

    Holds three parallel containers (token ids, attention masks, labels) and
    serves the ``idx``-th entry of each as one example.
    """

    def __init__(self, inputs, masks, labels):
        # Stored exactly as given; nothing is copied or validated here.
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        """Number of examples, taken from the token-id container."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict keyed the way the batch loops read it."""
        return dict(
            input_ids=self.inputs[idx],
            attention_mask=self.masks[idx],
            labels=self.labels[idx],
        )


In [None]:
# Wrap each split's tensors so a DataLoader can index them example by example.
train_dataset, test_dataset = (
    ParkinsonsDataset(train_inputs, train_masks, train_labels),
    ParkinsonsDataset(test_inputs, test_masks, test_labels),
)

In [None]:
from torch.utils.data import DataLoader
# Mini-batches of 8. Only the training split is shuffled; the test loader
# keeps a fixed order because later cells pair its outputs with `all_labels`.
BATCH_SIZE = 8
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("DataLoaders created successfully!")

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import get_scheduler

loss_fn = nn.CrossEntropyLoss()

# The linear schedule decays the learning rate to zero at `num_training_steps`
# and the scheduler is stepped once per batch, so the horizon must cover every
# optimizer step of the run (batches per epoch x epochs). A shorter horizon
# would leave the later epochs training with a learning rate of 0.
epochs = 25
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.train()

for epoch in range(epochs):
    # Running totals for this epoch's progress line.
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        # Defensive guard: skip the whole batch if any of its tensors holds a NaN.
        if torch.isnan(batch_inputs).any() or torch.isnan(batch_masks).any() or torch.isnan(batch_labels).any():
            print("NaN detected in batch input! Skipping...")
            continue

        optimizer.zero_grad()

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits

        # Bound extreme logits before they reach the loss.
        logits = torch.clamp(logits, min=-1e6, max=1e6)

        loss = loss_fn(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        lr_scheduler.step()

        # Training accuracy comes from the same forward pass used for the loss.
        preds = torch.argmax(logits, dim=1)
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct / total

    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
print("Training Completed!")

In [None]:
# Score the held-out split in evaluation mode, without gradient tracking.
model.eval()

correct, total, total_loss = 0, 0, 0

with torch.no_grad():
    for batch in test_dataloader:
        batch_labels = batch["labels"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        total_loss += loss_fn(logits, batch_labels).item()

        hits = logits.argmax(dim=1).eq(batch_labels)
        correct += hits.sum().item()
        total += batch_labels.size(0)

# Loss is the mean of the per-batch losses; accuracy is counted per example.
avg_test_loss = total_loss / len(test_dataloader)
test_accuracy = correct / total

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory.
output_dir = "distilbert_parkinsons_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Reload the fine-tuned checkpoint and its tokenizer from the local directory.
checkpoint_dir = "distilbert_parkinsons_model"
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_dir)


In [None]:
import torch
from sklearn.metrics import classification_report

# Prefer GPU 6 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:6" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 6 if torch.cuda.device_count() > 6 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Predictions and ground truth are collected in loader order; the confusion
# matrix and ROC cells reuse both lists.
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

print(classification_report(all_labels, all_preds, digits=4))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix built from the lists collected by the
# classification-report cell (true labels first, predictions second).
cm = confusion_matrix(all_labels, all_preds)
class_names = ["Non PD", "PD"]
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Second pass over the test loader to collect the softmax probability of
# class 1; the unshuffled loader keeps these aligned with `all_labels`.
probs = []
with torch.no_grad():
    for batch in test_dataloader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        positive_probs = torch.softmax(logits, dim=1)[:, 1]
        probs.extend(positive_probs.cpu().numpy())


fpr, tpr, _ = roc_curve(all_labels, probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, color="blue", label=f"AUC = {roc_auc:.4f}")
# Diagonal reference line for a chance-level classifier.
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUC-ROC Curve")
ax.legend()
plt.show()


**BioBERT**

In [None]:
import pandas as pd

# Read the labelled abstracts and print the relative frequency of each class.
df = pd.read_csv("Datasets/final_parkinsons_dataset.csv")

label_share = df["Label"].value_counts(normalize=True)
print(label_share)


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then keep the English list as a set so
# clean_text can do constant-time membership checks.
nltk.download("stopwords")
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalise one abstract before tokenization.

    Lower-cases ``text``, deletes every character that is not ``a``-``z``,
    whitespace or an apostrophe, and drops the tokens found in the
    module-level ``stop_words`` set. The surviving tokens are returned
    joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s']", "", lowered)
    kept_words = filter(lambda token: token not in stop_words, letters_only.split())
    return " ".join(kept_words)

# Coerce every abstract to str first so clean_text can call .lower() on it,
# then store the cleaned text next to the original and preview both columns.
abstracts_as_text = df["Abstract"].astype(str)
df["Cleaned_Abstract"] = abstracts_as_text.apply(clean_text)

preview = df[["Abstract", "Cleaned_Abstract"]].head()
print(preview)


In [None]:
from transformers import AutoTokenizer

# Tokenize all cleaned abstracts in a single call: sequences are truncated to
# at most 512 tokens, padded to a common length and returned as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

cleaned_abstracts = df["Cleaned_Abstract"].tolist()
tokenizer_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
tokens = tokenizer(cleaned_abstracts, **tokenizer_options)

print(tokens["input_ids"].shape)
print("Attention Mask shape:", tokens["attention_mask"].shape)


In [None]:
from sklearn.model_selection import train_test_split
import torch

# Stratified 80/20 split with a fixed seed. train_test_split returns one
# (train, test) pair per array, in the order the arrays are passed:
# token ids, labels, attention masks.
labels = torch.tensor(df["Label"].values)

split_parts = train_test_split(
    tokens["input_ids"], labels, tokens["attention_mask"],
    test_size=0.2, random_state=42, stratify=labels
)
(train_inputs, test_inputs,
 train_labels, test_labels,
 train_masks, test_masks) = split_parts
print(f"Train Inputs: {train_inputs.shape}, Train Masks: {train_masks.shape}, Train Labels: {train_labels.shape}")
print(f"Test Inputs: {test_inputs.shape}, Test Masks: {test_masks.shape}, Test Labels: {test_labels.shape}")



In [None]:
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW

# Pretrained BioBERT encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)

# Prefer GPU 6 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:6" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 6 if torch.cuda.device_count() > 6 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
model.to(device)

# The optimizer is built after the model is on its device so it tracks the
# moved parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

print("BioBERT Model Loaded Successfully!")


In [None]:
from torch.utils.data import Dataset
class ParkinsonsDataset(Dataset):
    """Map-style dataset over pre-tokenized abstracts.

    Holds three parallel containers (token ids, attention masks, labels) and
    serves the ``idx``-th entry of each as one example.
    """

    def __init__(self, inputs, masks, labels):
        # Stored exactly as given; nothing is copied or validated here.
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        """Number of examples, taken from the token-id container."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict keyed the way the batch loops read it."""
        return dict(
            input_ids=self.inputs[idx],
            attention_mask=self.masks[idx],
            labels=self.labels[idx],
        )


In [None]:
# Wrap each split's tensors so a DataLoader can index them example by example.
train_dataset, test_dataset = (
    ParkinsonsDataset(train_inputs, train_masks, train_labels),
    ParkinsonsDataset(test_inputs, test_masks, test_labels),
)

In [None]:
from torch.utils.data import DataLoader
# Mini-batches of 8. Only the training split is shuffled; the test loader
# keeps a fixed order because later cells pair its outputs with `all_labels`.
BATCH_SIZE = 8
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("DataLoaders created successfully!")

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import get_scheduler

loss_fn = nn.CrossEntropyLoss()

# The linear schedule decays the learning rate to zero at `num_training_steps`
# and the scheduler is stepped once per batch, so the horizon must cover every
# optimizer step of the run (batches per epoch x epochs). A shorter horizon
# would leave the later epochs training with a learning rate of 0.
epochs = 25
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.train()

for epoch in range(epochs):
    # Running totals for this epoch's progress line.
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits

        loss = loss_fn(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Training accuracy comes from the same forward pass used for the loss.
        preds = torch.argmax(logits, dim=1)
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct / total

    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

print("Training Completed!")


In [None]:
# Score the held-out split in evaluation mode, without gradient tracking.
model.eval()
correct, total, total_loss = 0, 0, 0

with torch.no_grad():
    for batch in test_dataloader:
        batch_labels = batch["labels"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        total_loss += loss_fn(logits, batch_labels).item()

        hits = logits.argmax(dim=1).eq(batch_labels)
        correct += hits.sum().item()
        total += batch_labels.size(0)

# Loss is the mean of the per-batch losses; accuracy is counted per example.
avg_test_loss = total_loss / len(test_dataloader)
test_accuracy = correct / total

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory.
output_dir = "biobert_parkinsons_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned checkpoint written to "biobert_parkinsons_model" by the
# save cell. Loading the hub base model here would replace the trained
# weights with an untrained classification head, so every metric computed
# below would describe a model that was never fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained("biobert_parkinsons_model", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("biobert_parkinsons_model")

print("BioBERT Loaded Successfully!")


In [None]:
import torch
from sklearn.metrics import classification_report

# Prefer GPU 6 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:6" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 6 if torch.cuda.device_count() > 6 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Predictions and ground truth are collected in loader order; the confusion
# matrix and ROC cells reuse both lists.
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

print(classification_report(all_labels, all_preds, digits=4))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix built from the lists collected by the
# classification-report cell (true labels first, predictions second).
cm = confusion_matrix(all_labels, all_preds)
class_names = ["Non PD", "PD"]
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Second pass over the test loader to collect the softmax probability of
# class 1; the unshuffled loader keeps these aligned with `all_labels`.
probs = []
with torch.no_grad():
    for batch in test_dataloader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        positive_probs = torch.softmax(logits, dim=1)[:, 1]
        probs.extend(positive_probs.cpu().numpy())

fpr, tpr, _ = roc_curve(all_labels, probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, color="blue", label=f"AUC = {roc_auc:.4f}")
# Diagonal reference line for a chance-level classifier.
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUC-ROC Curve")
ax.legend()
plt.show()


**ALBERT**

In [None]:
import pandas as pd

# Read the labelled abstracts and print the relative frequency of each class.
df = pd.read_csv("Datasets/final_parkinsons_dataset.csv")

label_share = df["Label"].value_counts(normalize=True)
print(label_share)


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then keep the English list as a set so
# clean_text can do constant-time membership checks.
nltk.download("stopwords")
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalise one abstract before tokenization.

    Lower-cases ``text``, deletes every character that is not ``a``-``z``,
    whitespace or an apostrophe, and drops the tokens found in the
    module-level ``stop_words`` set. The surviving tokens are returned
    joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s']", "", lowered)
    kept_words = filter(lambda token: token not in stop_words, letters_only.split())
    return " ".join(kept_words)

# Coerce every abstract to str first so clean_text can call .lower() on it,
# then store the cleaned text next to the original and preview both columns.
abstracts_as_text = df["Abstract"].astype(str)
df["Cleaned_Abstract"] = abstracts_as_text.apply(clean_text)

preview = df[["Abstract", "Cleaned_Abstract"]].head()
print(preview)


In [None]:
from transformers import AutoTokenizer

# Tokenize all cleaned abstracts in a single call: sequences are truncated to
# at most 512 tokens, padded to a common length and returned as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

cleaned_abstracts = df["Cleaned_Abstract"].tolist()
tokenizer_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
tokens = tokenizer(cleaned_abstracts, **tokenizer_options)

print(tokens["input_ids"].shape)
print("Attention Mask shape:", tokens["attention_mask"].shape)


In [None]:
from sklearn.model_selection import train_test_split
import torch

# Stratified 80/20 split with a fixed seed. train_test_split returns one
# (train, test) pair per array, in the order the arrays are passed:
# token ids, labels, attention masks.
labels = torch.tensor(df["Label"].values)

split_parts = train_test_split(
    tokens["input_ids"], labels, tokens["attention_mask"],
    test_size=0.2, random_state=42, stratify=labels
)
(train_inputs, test_inputs,
 train_labels, test_labels,
 train_masks, test_masks) = split_parts
print(f"Train Inputs: {train_inputs.shape}, Train Masks: {train_masks.shape}, Train Labels: {train_labels.shape}")
print(f"Test Inputs: {test_inputs.shape}, Test Masks: {test_masks.shape}, Test Labels: {test_labels.shape}")



In [None]:
import torch
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW


# Pretrained ALBERT encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)

# Prefer GPU 7 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:7" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 7 if torch.cuda.device_count() > 7 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
model.to(device)

# The optimizer is built after the model is on its device so it tracks the
# moved parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

print("ALBERT Model Loaded Successfully!")


In [None]:
from torch.utils.data import Dataset
class ParkinsonsDataset(Dataset):
    """Map-style dataset over pre-tokenized abstracts.

    Holds three parallel containers (token ids, attention masks, labels) and
    serves the ``idx``-th entry of each as one example.
    """

    def __init__(self, inputs, masks, labels):
        # Stored exactly as given; nothing is copied or validated here.
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        """Number of examples, taken from the token-id container."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict keyed the way the batch loops read it."""
        return dict(
            input_ids=self.inputs[idx],
            attention_mask=self.masks[idx],
            labels=self.labels[idx],
        )


In [None]:
# Wrap each split's tensors so a DataLoader can index them example by example.
train_dataset, test_dataset = (
    ParkinsonsDataset(train_inputs, train_masks, train_labels),
    ParkinsonsDataset(test_inputs, test_masks, test_labels),
)

In [None]:
from torch.utils.data import DataLoader
# Mini-batches of 8. Only the training split is shuffled; the test loader
# keeps a fixed order because later cells pair its outputs with `all_labels`.
BATCH_SIZE = 8
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("DataLoaders created successfully!")

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import get_scheduler

loss_fn = nn.CrossEntropyLoss()

# The linear schedule decays the learning rate to zero at `num_training_steps`
# and the scheduler is stepped once per batch, so the horizon must cover every
# optimizer step of the run (batches per epoch x epochs). A shorter horizon
# would leave the later epochs training with a learning rate of 0.
epochs = 25
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.train()

for epoch in range(epochs):
    # Running totals for this epoch's progress line.
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits

        loss = loss_fn(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Training accuracy comes from the same forward pass used for the loss.
        preds = torch.argmax(logits, dim=1)
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct / total

    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

print("Training Completed!")


In [None]:
# Score the held-out split in evaluation mode, without gradient tracking.
model.eval()

correct, total, total_loss = 0, 0, 0

with torch.no_grad():
    for batch in test_dataloader:
        batch_labels = batch["labels"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        total_loss += loss_fn(logits, batch_labels).item()

        hits = logits.argmax(dim=1).eq(batch_labels)
        correct += hits.sum().item()
        total += batch_labels.size(0)

# Loss is the mean of the per-batch losses; accuracy is counted per example.
avg_test_loss = total_loss / len(test_dataloader)
test_accuracy = correct / total

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory.
output_dir = "albert_parkinsons_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the fine-tuned checkpoint and its tokenizer from the local directory.
checkpoint_dir = "albert_parkinsons_model"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

print("ALBERT Model and Tokenizer Loaded Successfully!")


In [None]:
import torch
from sklearn.metrics import classification_report

# Prefer GPU 7 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:7" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 7 if torch.cuda.device_count() > 7 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Predictions and ground truth are collected in loader order; the confusion
# matrix and ROC cells reuse both lists.
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

print(classification_report(all_labels, all_preds, digits=4))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix built from the lists collected by the
# classification-report cell (true labels first, predictions second).
cm = confusion_matrix(all_labels, all_preds)
class_names = ["Non PD", "PD"]
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Second pass over the test loader to collect the softmax probability of
# class 1; the unshuffled loader keeps these aligned with `all_labels`.
probs = []
with torch.no_grad():
    for batch in test_dataloader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        positive_probs = torch.softmax(logits, dim=1)[:, 1]
        probs.extend(positive_probs.cpu().numpy())

fpr, tpr, _ = roc_curve(all_labels, probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, color="blue", label=f"AUC = {roc_auc:.4f}")
# Diagonal reference line for a chance-level classifier.
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUC-ROC Curve")
ax.legend()
plt.show()


**TinyBERT**

In [None]:
import pandas as pd

# Read the labelled abstracts and print the relative frequency of each class.
df = pd.read_csv("Datasets/final_parkinsons_dataset.csv")

label_share = df["Label"].value_counts(normalize=True)
print(label_share)


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then keep the English list as a set so
# clean_text can do constant-time membership checks.
nltk.download("stopwords")
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalise one abstract before tokenization.

    Lower-cases ``text``, deletes every character that is not ``a``-``z``,
    whitespace or an apostrophe, and drops the tokens found in the
    module-level ``stop_words`` set. The surviving tokens are returned
    joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s']", "", lowered)
    kept_words = filter(lambda token: token not in stop_words, letters_only.split())
    return " ".join(kept_words)

# Coerce every abstract to str first so clean_text can call .lower() on it,
# then store the cleaned text next to the original and preview both columns.
abstracts_as_text = df["Abstract"].astype(str)
df["Cleaned_Abstract"] = abstracts_as_text.apply(clean_text)

preview = df[["Abstract", "Cleaned_Abstract"]].head()
print(preview)


In [None]:
from transformers import AutoTokenizer

# Tokenize all cleaned abstracts in a single call: sequences are truncated to
# at most 512 tokens, padded to a common length and returned as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_6L_768D")

cleaned_abstracts = df["Cleaned_Abstract"].tolist()
tokenizer_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
tokens = tokenizer(cleaned_abstracts, **tokenizer_options)

print(tokens["input_ids"].shape)
print("Attention Mask shape:", tokens["attention_mask"].shape)


In [None]:
from sklearn.model_selection import train_test_split
import torch

# Stratified 80/20 split with a fixed seed. train_test_split returns one
# (train, test) pair per array, in the order the arrays are passed:
# token ids, labels, attention masks.
labels = torch.tensor(df["Label"].values)

split_parts = train_test_split(
    tokens["input_ids"], labels, tokens["attention_mask"],
    test_size=0.2, random_state=42, stratify=labels
)
(train_inputs, test_inputs,
 train_labels, test_labels,
 train_masks, test_masks) = split_parts
print(f"Train Inputs: {train_inputs.shape}, Train Masks: {train_masks.shape}, Train Labels: {train_labels.shape}")
print(f"Test Inputs: {test_inputs.shape}, Test Masks: {test_masks.shape}, Test Labels: {test_labels.shape}")



In [None]:
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW


# Pretrained TinyBERT encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_6L_768D", num_labels=2)

# Prefer GPU 6 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:6" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 6 if torch.cuda.device_count() > 6 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
model.to(device)

# The optimizer is built after the model is on its device so it tracks the
# moved parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

print("TinyBERT Model Loaded Successfully!")


In [None]:
from torch.utils.data import Dataset
class ParkinsonsDataset(Dataset):
    """Map-style dataset over pre-tokenized abstracts.

    Holds three parallel containers (token ids, attention masks, labels) and
    serves the ``idx``-th entry of each as one example.
    """

    def __init__(self, inputs, masks, labels):
        # Stored exactly as given; nothing is copied or validated here.
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        """Number of examples, taken from the token-id container."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict keyed the way the batch loops read it."""
        return dict(
            input_ids=self.inputs[idx],
            attention_mask=self.masks[idx],
            labels=self.labels[idx],
        )


In [None]:
# Wrap each split's tensors so a DataLoader can index them example by example.
train_dataset, test_dataset = (
    ParkinsonsDataset(train_inputs, train_masks, train_labels),
    ParkinsonsDataset(test_inputs, test_masks, test_labels),
)

In [None]:
from torch.utils.data import DataLoader
# Mini-batches of 8. Only the training split is shuffled; the test loader
# keeps a fixed order because later cells pair its outputs with `all_labels`.
BATCH_SIZE = 8
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("DataLoaders created successfully!")

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import get_scheduler

loss_fn = nn.CrossEntropyLoss()

# The linear schedule decays the learning rate to zero at `num_training_steps`
# and the scheduler is stepped once per batch, so the horizon must cover every
# optimizer step of the run (batches per epoch x epochs). A shorter horizon
# would leave the later epochs training with a learning rate of 0.
epochs = 25
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.train()

for epoch in range(epochs):
    # Running totals for this epoch's progress line.
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits

        loss = loss_fn(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Training accuracy comes from the same forward pass used for the loss.
        preds = torch.argmax(logits, dim=1)
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct / total

    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

print("Training Completed!")


In [None]:
# Score the held-out split in evaluation mode, without gradient tracking.
model.eval()

correct, total, total_loss = 0, 0, 0

with torch.no_grad():
    for batch in test_dataloader:
        batch_labels = batch["labels"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        total_loss += loss_fn(logits, batch_labels).item()

        hits = logits.argmax(dim=1).eq(batch_labels)
        correct += hits.sum().item()
        total += batch_labels.size(0)

# Loss is the mean of the per-batch losses; accuracy is counted per example.
avg_test_loss = total_loss / len(test_dataloader)
test_accuracy = correct / total

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory.
output_dir = "tinybert_parkinsons_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned checkpoint written to "tinybert_parkinsons_model" by the
# save cell. Loading the hub base model here would replace the trained
# weights with an untrained classification head, so every metric computed
# below would describe a model that was never fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained("tinybert_parkinsons_model")
tokenizer = AutoTokenizer.from_pretrained("tinybert_parkinsons_model")

print("TinyBERT Model and Tokenizer Loaded Successfully!")


In [None]:
import torch
from sklearn.metrics import classification_report

# Prefer GPU 6 as originally configured, but fall back to the first GPU when
# this machine exposes fewer devices: a hard-coded "cuda:6" fails there with
# an invalid device ordinal. Without CUDA the model runs on the CPU.
gpu_index = 6 if torch.cuda.device_count() > 6 else 0
device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Predictions and ground truth are collected in loader order; the confusion
# matrix and ROC cells reuse both lists.
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch_inputs = batch["input_ids"].to(device)
        batch_masks = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

print(classification_report(all_labels, all_preds, digits=4))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix built from the lists collected by the
# classification-report cell (true labels first, predictions second).
cm = confusion_matrix(all_labels, all_preds)
class_names = ["Non PD", "PD"]
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Second pass over the test loader to collect the softmax probability of
# class 1; the unshuffled loader keeps these aligned with `all_labels`.
probs = []
with torch.no_grad():
    for batch in test_dataloader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        positive_probs = torch.softmax(logits, dim=1)[:, 1]
        probs.extend(positive_probs.cpu().numpy())

fpr, tpr, _ = roc_curve(all_labels, probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, color="blue", label=f"AUC = {roc_auc:.4f}")
# Diagonal reference line for a chance-level classifier.
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUC-ROC Curve")
ax.legend()
plt.show()
