# In [None]:
# Diabetes Dataset Analysis with Advanced Techniques
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1_l2
import warnings
warnings.filterwarnings('ignore')

# Load the dataset.
# First attempt: let pandas take the column names from the file's header row.
file_path = 'diabetes_dataset.csv'
try:
    df = pd.read_csv(file_path)
except ValueError:
    # Only parse/decoding failures are retried: pandas' ParserError and
    # EmptyDataError, as well as UnicodeDecodeError, all derive from
    # ValueError.  The previous bare ``except`` also trapped KeyboardInterrupt
    # and re-ran the read for errors (e.g. a missing file) that a second
    # attempt cannot fix.
    df = pd.read_csv(file_path, header=None)
    column_names = ['Age', 'Sex', 'Ethnicity', 'BMI', 'Waist_Circumference',
                    'Fasting_Blood_Glucose', 'HbA1c', 'Blood_Pressure_Systolic',
                    'Blood_Pressure_Diastolic', 'Cholesterol_Total', 'Cholesterol_HDL',
                    'Cholesterol_LDL', 'GGT', 'Serum_Urate', 'Physical_Activity_Level',
                    'Dietary_Intake_Calories', 'Alcohol_Consumption', 'Smoking_Status',
                    'Family_History_of_Diabetes', 'Previous_Gestational_Diabetes']
    # NOTE(review): drops the first row when its first cell equals 0 --
    # presumably a stray index/header row; confirm against the data file.
    if df.iloc[0, 0] == 0:
        df = df.iloc[1:].reset_index(drop=True)
    df.columns = column_names

# Enhanced Data Preprocessing
def advanced_preprocess_data(df):
    """Return an engineered, encoded and imputed copy of ``df``.

    Steps, in the order they run:

    1. Add interaction/ratio features and squared terms.
    2. Replace each categorical column with integer codes and add a
       ``<col>_Freq`` column holding the relative frequency of each code.
    3. Fill remaining missing values: the mode for non-numeric columns, the
       median plus small Gaussian noise (std / 4) for numeric columns.
    4. Add a ``<col>_Log`` column for every numeric column whose skew
       exceeds 1.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame containing at least 'Age', 'BMI',
            'Waist_Circumference', 'Fasting_Blood_Glucose', 'HbA1c',
            'Cholesterol_Total', 'Cholesterol_HDL',
            'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic', 'Sex',
            'Ethnicity', 'Physical_Activity_Level', 'Alcohol_Consumption'
            and 'Smoking_Status'.

    Returns:
        A new DataFrame with the original columns (categoricals replaced by
        their integer codes) plus the derived feature columns.

    Raises:
        KeyError: if one of the required columns is missing.

    Note:
        Numeric imputation draws from NumPy's global random state, so the
        filled values differ between runs unless the caller seeds it.
    """
    data = df.copy()

    categorical_cols = ['Sex', 'Ethnicity', 'Physical_Activity_Level',
                        'Alcohol_Consumption', 'Smoking_Status']
    squared_cols = ['BMI', 'Fasting_Blood_Glucose', 'HbA1c', 'Waist_Circumference']

    # Interaction / ratio features.
    data['BMI_Age'] = data['BMI'] * data['Age']
    data['BMI_Waist_Ratio'] = data['BMI'] / data['Waist_Circumference']
    data['Glucose_HbA1c_Ratio'] = data['Fasting_Blood_Glucose'] / data['HbA1c']
    data['Cholesterol_Ratio'] = data['Cholesterol_Total'] / data['Cholesterol_HDL']
    data['BP_Product'] = data['Blood_Pressure_Systolic'] * data['Blood_Pressure_Diastolic']

    # Second-order terms for selected numeric columns.
    for col in squared_cols:
        data[f'{col}_Squared'] = data[col] ** 2

    # Integer-encode categoricals, then add the share of rows per code.
    for col in categorical_cols:
        data[col] = LabelEncoder().fit_transform(data[col])
        data[f'{col}_Freq'] = data[col].map(data[col].value_counts(normalize=True))

    # Impute whatever is still missing.
    # NOTE(review): this runs after feature construction and encoding, so a
    # derived column built from a missing input is imputed independently of
    # that input -- consider imputing the raw columns first.
    for col in data.columns:
        missing = data[col].isna()
        n_missing = int(missing.sum())
        if n_missing == 0:
            continue

        if pd.api.types.is_numeric_dtype(data[col]):
            median_val = data[col].median()
            std_val = data[col].std()
            if not np.isfinite(std_val):
                # Fewer than two observed values: fall back to the plain median.
                std_val = 0.0
            noise = np.random.normal(0, std_val / 4, size=n_missing)
            # Series.fillna rejects an ndarray value, so write the per-row
            # draws through a boolean mask instead.
            data.loc[missing, col] = median_val + noise
        else:
            # Assign the result back: an in-place fillna on ``data[col]`` is
            # chained assignment and does not update ``data`` under pandas
            # Copy-on-Write.
            data[col] = data[col].fillna(data[col].mode()[0])

    # Log-transform right-skewed numeric columns.  The column list is taken
    # once, before any ``_Log`` column is added.
    for col in data.select_dtypes(include=[np.number]).columns:
        if data[col].skew() > 1:
            # Shift so the minimum maps to 1 before log1p, i.e. log(x - min + 2).
            data[f'{col}_Log'] = np.log1p(data[col] - data[col].min() + 1)

    return data

# Run the feature-engineering pipeline on the loaded frame. The function works
# on a copy, so `df` keeps the raw values.
processed_df = advanced_preprocess_data(df)

# Create enhanced target variable
def create_advanced_target(df):
    """Build a binary high-risk label (1 = high risk, 0 = otherwise).

    A row is flagged when any of the following holds:

    * ``HbA1c >= 6.5``
    * ``Fasting_Blood_Glucose >= 126``
    * ``HbA1c >= 6.0`` and ``Fasting_Blood_Glucose >= 110`` and
      ``BMI >= 30`` and ``Age >= 45``
    * ``Family_History_of_Diabetes == 1`` and ``HbA1c >= 6.0``

    Args:
        df: DataFrame with the columns 'HbA1c', 'Fasting_Blood_Glucose',
            'BMI', 'Age' and 'Family_History_of_Diabetes'.

    Returns:
        An integer Series of 0/1 values sharing ``df``'s index.
    """
    hba1c = df['HbA1c']
    glucose = df['Fasting_Blood_Glucose']

    # Single-marker rules.
    hba1c_over_cutoff = hba1c >= 6.5
    glucose_over_cutoff = glucose >= 126

    # Multi-factor rule: borderline markers together with BMI and age.
    combined_factors = (
        (hba1c >= 6.0) & (glucose >= 110) & (df['BMI'] >= 30) & (df['Age'] >= 45)
    )

    # Family history combined with a borderline HbA1c.
    family_and_borderline = (df['Family_History_of_Diabetes'] == 1) & (hba1c >= 6.0)

    flagged = (
        hba1c_over_cutoff
        | glucose_over_cutoff
        | combined_factors
        | family_and_borderline
    )
    return flagged.astype(int)

# Attach the rule-based label computed from the engineered frame.
processed_df['Diabetes_Risk'] = create_advanced_target(processed_df)

# Prepare data for modeling.
# The label is a deterministic function of HbA1c and Fasting_Blood_Glucose, so
# dropping only the two raw columns is not enough: every column derived from
# them (ratio, squared and log variants) would let the models reconstruct the
# label directly. Drop every column whose name contains either source name.
target_source_cols = ('HbA1c', 'Fasting_Blood_Glucose')
leaky_cols = [
    col for col in processed_df.columns
    if col == 'Diabetes_Risk' or any(src in str(col) for src in target_source_cols)
]
X = processed_df.drop(columns=leaky_cols)
y = processed_df['Diabetes_Risk']

# Stratified train/test split on the unscaled features, so that the scaler
# below only ever sees training rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# RobustScaler fitted on the training rows only; fitting on the full data set
# (as before) leaks test-set statistics into training.
scaler = RobustScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full scaled matrix, kept under its original name for any later code that
# refers to it; it uses the train-fitted scaler.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

# Rebalance the training set only; the test set keeps its natural class mix.
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

# Base learners for the stacking ensemble. Each model's hyperparameters are
# gathered in a dict so a configuration can be read (or logged) as one unit.
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)

gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_samples_split': 5,
    'subsample': 0.8,
    'random_state': 42,
}
gb_model = GradientBoostingClassifier(**gb_params)

# probability=True is set so the SVM can emit class probabilities.
svm_params = {
    'kernel': 'rbf',
    'C': 10,
    'gamma': 'scale',
    'probability': True,
    'class_weight': 'balanced',
    'random_state': 42,
}
svm_model = SVC(**svm_params)

# Advanced Neural Network
def build_advanced_dl_model(input_dim):
    """Build and compile the feed-forward binary classifier.

    The network stacks four hidden stages of Dense (ReLU, L1/L2-regularised)
    -> BatchNormalization -> Dropout with 256, 128, 64 and 32 units and
    dropout rates 0.4, 0.4, 0.3 and 0.2, followed by a single sigmoid unit.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled Sequential model using Adam (learning rate 0.001),
        binary cross-entropy loss, and accuracy plus AUC metrics.
    """
    # (units, dropout rate) for each hidden stage, in order.
    hidden_stages = [(256, 0.4), (128, 0.4), (64, 0.3), (32, 0.2)]

    net = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_stages):
        # Only the first layer declares the input shape.
        shape_kwargs = {'input_shape': (input_dim,)} if position == 0 else {}
        net.add(Dense(
            units,
            activation='relu',
            kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4),
            **shape_kwargs,
        ))
        net.add(BatchNormalization())
        net.add(Dropout(drop_rate))

    # Output: probability of the positive class.
    net.add(Dense(1, activation='sigmoid'))

    net.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC()],
    )
    return net

# Stacking ensemble: the three base learners feed a logistic-regression
# meta-learner, with 5-fold cross-validation passed as ``cv``.
estimators = list(zip(('rf', 'gb', 'svm'), (rf_model, gb_model, svm_model)))
meta_learner = LogisticRegression(max_iter=1000)

stack = StackingClassifier(estimators=estimators, final_estimator=meta_learner, cv=5)

# Fit on the rebalanced training data.
print("Training Stacking Ensemble...")
stack.fit(X_train_balanced, y_train_balanced)

# Train Deep Learning model
print("\nTraining Deep Learning Model...")
dl_model = build_advanced_dl_model(X_train_balanced.shape[1])

# Stop once validation loss has not improved for 15 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

# Multiply the learning rate by 0.2 after 5 stagnant epochs, down to 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=0.00001
)

# Keras' ``validation_split`` takes the *last* fraction of the rows, before
# any shuffling. The resampled training set is not randomly ordered
# (oversamplers typically place synthetic rows at the end), so that slice is
# not representative. Carve out an explicit shuffled, stratified hold-out of
# the same size instead.
X_dl_train, X_dl_val, y_dl_train, y_dl_val = train_test_split(
    np.asarray(X_train_balanced, dtype='float32'),
    np.asarray(y_train_balanced, dtype='float32'),
    test_size=0.2,
    random_state=42,
    stratify=y_train_balanced,
)

history = dl_model.fit(
    X_dl_train, y_dl_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_dl_val, y_dl_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Make predictions: hard labels (kept for inspection) and positive-class
# probabilities from both models.
stack_pred = stack.predict(X_test)
stack_proba = stack.predict_proba(X_test)[:, 1]

dl_raw = dl_model.predict(X_test)          # one sigmoid output per sample
dl_pred = (dl_raw > 0.5).astype(int)
dl_proba = dl_raw.reshape(-1)

# Create final ensemble prediction by averaging probabilities (soft voting).
# Averaging the hard labels and calling np.round sent every disagreement
# (mean 0.5) to class 0, because np.round rounds halves to the nearest even
# value.
final_proba = (stack_proba + dl_proba) / 2
final_pred = (final_proba >= 0.5).astype(int)

# Evaluate final model. ROC-AUC is a ranking metric, so it is computed from
# the averaged probabilities rather than from the thresholded labels.
final_accuracy = accuracy_score(y_test, final_pred)
final_auc = roc_auc_score(y_test, final_proba)

print("\nFinal Model Performance:")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"AUC-ROC: {final_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, final_pred))

# Confusion matrix of the final ensemble on the test set, drawn as an
# annotated heatmap (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, final_pred)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix - Final Ensemble Model')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Save the best models
# joblib was used below without ever being imported, which raised NameError
# only after the whole training run had finished. It is installed as a
# dependency of scikit-learn, so no new package is required.
import joblib

print("\nSaving models...")
joblib.dump(stack, 'stack_ensemble_model.pkl')   # scikit-learn stacking ensemble
dl_model.save('deep_learning_model.h5')          # Keras network

print("\nAnalysis complete!")

# ModuleNotFoundError: No module named 'pandas'  (notebook cell output captured in the export)