In [None]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


In [1]:
# Loading dataset
df = pd.read_csv("biomarkers.csv")
df.columns = df.columns.str.strip()

# Replace missing values represented by '-' with NaN
df.replace('-', np.nan, inplace=True)

# Encoding categorical features
label_encoders = {}
categorical_cols = ['Gender', 'Positive  history  of  Parkinson  disease  in  family',
                    'Antidepressant  therapy', 'Antiparkinsonian  medication',
                    'Antipsychotic  medication', 'Benzodiazepine  medication']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le



NameError: name 'pd' is not defined

In [2]:
# Convert all other columns to numeric
for col in df.columns:
    if col not in categorical_cols and col != 'Participant  code':
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Handling missing values
imputer = SimpleImputer(strategy='median')
df_imputed = pd.DataFrame(imputer.fit_transform(df.drop('Participant  code', axis=1)),
                          columns=df.drop('Participant  code', axis=1).columns)

# Define features and target
target_col = 'Overview  of  motor  examination:  Hoehn  &  Yahr  scale  (-)'
X = df_imputed.drop(columns=[target_col])
y = df_imputed[target_col].astype(int)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Generate polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Logistic Regression model
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500,
                             class_weight='balanced', random_state=42)
log_reg.fit(X_train_poly, y_train)

# Predictions and evaluation for Logistic Regression
y_pred_log = log_reg.predict(X_test_poly)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))



NameError: name 'df' is not defined

In [3]:
# DNN Model
dnn_model = Sequential([
    Dense(128, input_shape=(X_train_scaled.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(len(np.unique(y)), activation='softmax')
])

dnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Subtract 1 from y_train and y_test to shift labels to the range [0, num_classes - 1]
y_train = y_train - 1
y_test = y_test - 1

# Train DNN model
history = dnn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# Predictions and evaluation for DNN
y_pred_dnn = np.argmax(dnn_model.predict(X_test_scaled), axis=1)



NameError: name 'Sequential' is not defined

In [4]:
# Add 1 back to y_pred_dnn and y_test to restore original labels for evaluation
y_pred_dnn = y_pred_dnn + 1
y_test = y_test + 1

print("DNN Test Accuracy:", accuracy_score(y_test, y_pred_dnn))
print(classification_report(y_test, y_pred_dnn))

# Plot DNN training history
plt.figure(figsize=(10, 5))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('DNN Model Training History')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

# Compare actual vs predicted (DNN)
plt.figure(figsize=(8, 6))
sns.regplot(x=y_test, y=y_pred_dnn, scatter=True, line_kws={"color": "blue", "linestyle": "--"})
plt.xlabel('Actual Stage')
plt.ylabel('Predicted Stage (DNN)')
plt.title('Actual vs Predicted Disease Progression (DNN)')
plt.grid(True)
plt.show()

NameError: name 'y_pred_dnn' is not defined