In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            RocCurveDisplay, PrecisionRecallDisplay)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

# Load data
df = pd.read_csv('Autism-Prediction-using-Machine-Learning---DataSet.csv')

# Data Cleaning
# Replace '?' with NaN and handle missing values
df.replace('?', np.nan, inplace=True)

# Handle negative values in 'result' column
df['result'] = df['result'].apply(lambda x: x if x > 0 else np.nan)

# Feature Engineering
# Create total score from A1-A10
df['total_score'] = df[[f'A{i}_Score' for i in range(1, 11)]].sum(axis=1)

# Create age groups
df['age_group'] = pd.cut(df['age'], bins=[0, 12, 18, 30, 50, 100],
                        labels=['child', 'teen', 'young', 'adult', 'senior'])

# Preprocessing
# Define target variable (assuming 'austim' is the target)
y = df['austim'].map({'yes': 1, 'no': 0})

# Select features
features = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
            'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
            'age', 'gender', 'ethnicity', 'jaundice', 'contry_of_res',
            'used_app_before', 'result', 'relation', 'total_score', 'age_group']

X = df[features]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing pipeline
numeric_features = ['age', 'result', 'total_score']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['gender', 'ethnicity', 'jaundice', 'contry_of_res',
                       'used_app_before', 'relation', 'age_group']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Feature selection and dimensionality reduction
feature_selector = SelectKBest(score_func=f_classif, k=15)
pca = PCA(n_components=0.95)

# Define models
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
lgbm_model = LGBMClassifier(random_state=42)

# Hyperparameter grids
param_grid_xgb = {
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0]
}

param_grid_lgbm = {
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 200],
    'classifier__num_leaves': [31, 63],
    'classifier__feature_fraction': [0.8, 1.0]
}

# Create voting classifier
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgbm', lgbm_model)
    ],
    voting='soft'
)

# Full pipeline with SMOTE
pipeline = make_imb_pipeline(
    preprocessor,
    SMOTE(random_state=42),
    feature_selector,
    pca,
    voting_clf
)

# GridSearchCV setup
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=[
        {
            'feature_selector__k': [15, 20],
            'pca__n_components': [None, 0.95],
            'votingclf__xgb__learning_rate': [0.01, 0.1],
            'votingclf__xgb__n_estimators': [100, 200]
        }
    ],
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2
)

# Train model
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Evaluate
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
RocCurveDisplay.from_estimator(best_model, X_test, y_test)
plt.title('ROC Curve')
plt.show()

# Feature Importance (for tree-based models)
try:
    importances = best_model.named_steps['votingclf'].estimators_[0].feature_importances_
    features_names = preprocessor.get_feature_names_out()
    selected_features = features_names[feature_selector.get_support()]
    importance_df = pd.DataFrame({'Feature': selected_features, 'Importance': importances})
    importance_df = importance_df.sort_values('Importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(15))
    plt.title('Top 15 Feature Importances')
    plt.show()
except AttributeError:
    print("Feature importance not available for selected model configuration")

# Save model
import joblib
joblib.dump(best_model, 'autism_detection_model.pkl')


KeyError: "['jaundice'] not in index"