# In [None]:

# Parkinson's Phenotype Classification (TD vs. PIGD)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv("DATA.csv")

# Columns the rest of the analysis depends on: the five predictors plus the
# two motor sub-scores used to derive the phenotype label.
required_cols = [
    "sex",
    "screening_age",
    "a_persistent_anxiety_total",
    "gds_total",
    "minibestest_total_score",
    "tremor_score",
    "pigd_score",
]

# Drop null-heavy (> 50% missing) or unnamed columns.
# Required columns are exempt: dropping one here would only surface later as a
# KeyError; their missing values are handled row-wise just below instead.
null_heavy_or_unnamed = [
    col
    for col in df.columns
    if col not in required_cols
    and ("Unnamed" in col or df[col].isnull().mean() > 0.5)
]
df = df.drop(columns=null_heavy_or_unnamed)

# Drop rows with critical nulls.
# tremor_score / pigd_score are included because a missing score yields a NaN
# ratio, and `NaN > 1.0` is False -- np.where would then silently label the
# row "PIGD" instead of treating it as unknown.
df = df.dropna(subset=required_cols)

# Create phenotype ratio: tremor score relative to (PIGD score + 1).
# NOTE(review): the +1 offset and the 1.0 cut-off are this script's own
# choices -- confirm they match the intended clinical TD/PIGD definition.
df["phenotype_ratio"] = df["tremor_score"] / (df["pigd_score"] + 1)
# ratio > 1.0 -> "TD"; everything else -> "PIGD"
df["phenotype_label"] = np.where(df["phenotype_ratio"] > 1.0, "TD", "PIGD")
# Binary target: TD = 0, PIGD = 1 (PIGD is the positive class downstream)
df["phenotype_label_encoded"] = df["phenotype_label"].map({"TD": 0, "PIGD": 1})

# Define predictors and target
features = ["screening_age", "sex", "a_persistent_anxiety_total", "gds_total", "minibestest_total_score"]
# One-hot encode `sex`; drop_first removes the first level so the indicator
# columns are not redundant with each other.
X = pd.get_dummies(df[features], columns=["sex"], drop_first=True)
y = df["phenotype_label_encoded"]

# Split data (80% train / 20% test, fixed seed for reproducibility).
# Stratify on the label so both splits keep the same TD/PIGD proportions;
# an unstratified split of an imbalanced sample can leave the test set with
# very few (or no) cases of one class and make the metrics unrepresentative.
# Stratification needs at least two rows per class, so fall back to a plain
# random split when that does not hold.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Second column of predict_proba: probability of the second class in
# model.classes_, used as the score for ROC AUC.
y_prob = model.predict_proba(X_test)[:, 1]
# Hard class predictions for the classification report.
y_pred = model.predict(X_test)

# Evaluation
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)
auc = roc_auc_score(y_test, y_prob)
print("ROC AUC Score:", auc)

# Feature importance
# Raw logistic-regression coefficients are log-odds changes per *unit* of each
# feature, so they cannot be compared across features measured on different
# scales. Multiplying each coefficient by its feature's training-set standard
# deviation expresses every effect per one-SD change, which makes the bars
# comparable. Positive values push predictions toward the class encoded as 1.
feature_sd = X_train.astype(float).std()
importance = pd.Series(model.coef_[0], index=X.columns) * feature_sd
importance.sort_values().plot(kind='barh')
plt.title("Feature Importance")
plt.xlabel("Coefficient Value (per SD of feature)")
plt.tight_layout()
plt.show()
