In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from imblearn.over_sampling import SMOTE

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Drop the ID column
df = df.drop('ID', axis=1)

# Modify the target variable
df['Diabetes_percentage'] = df['Diabetes_binary'] * 100

# Feature Engineering
def feature_engineering(df):
    # Reverse binary columns
    reverse_cols = ['DiffWalk', 'HighBP', 'HighChol', 'HeartDiseaseorAttack', 'Stroke', 'Smoker', 'HvyAlcoholConsump']
    for col in reverse_cols:
        df[f'No{col}'] = 1 - df[col]
        df = df.drop(col, axis=1)

    # Clustering
    def cluster_column(df, col, bins, labels):
        return pd.cut(df[col], bins=bins, labels=labels, include_lowest=True, ordered=False)

    df['GenHlth'] = cluster_column(df, 'GenHlth', [0, 1, 2, 3, 4, 5], [0, 0.25, 0.5, 0.75, 1])
    df['PhysHlth'] = cluster_column(df, 'PhysHlth', [-1, 6, 12, 18, 24, 30], [1, 0.75, 0.5, 0.25, 0])
    df['MentHlth'] = cluster_column(df, 'MentHlth', [-1, 6, 12, 18, 24, 30], [1, 0.75, 0.5, 0.25, 0])
    df['BMI'] = cluster_column(df, 'BMI', [0, 18.5, 24.9, 29.9, 39.9, 100], [1, 1, 0.5, 0.25, 0])
    df['Income'] = cluster_column(df, 'Income', [0, 2, 4, 5, 7, 8], [0, 0.25, 0.5, 0.75, 1])
    df['Education'] = cluster_column(df, 'Education', [0, 2, 3, 4, 5, 6], [0, 0.25, 0.75, 1, 1])
    df['Age'] = cluster_column(df, 'Age', [0, 3, 6, 9, 12, 13], [1, 0.75, 0.5, 0.25, 0])

    # Convert categorical columns to numeric
    cat_columns = df.select_dtypes(include=['category']).columns
    for col in cat_columns:
        df[col] = df[col].cat.codes

    # Feature Engineered Columns
    df['PhysicalCondition'] = (df['GenHlth'] + df['NoDiffWalk'] + df['PhysHlth'] + df['PhysActivity']) / 4
    df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol'] + df['NoHeartDiseaseorAttack'] + df['NoStroke']) / 4
    df['Lifestyle'] = (df['NoSmoker'] + df['NoHvyAlcoholConsump'] + df['Veggies'] + df['Fruits']) / 4
    df['NotObese'] = (df['BMI'] < 2).astype(int)  # BMI < 30 corresponds to codes 0 and 1

    return df

# Prepare data for modeling
def prepare_data(df, feature_engineered=True):
    if feature_engineered:
        df = feature_engineering(df)
    
    X = df.drop(['Diabetes_binary', 'Diabetes_percentage'], axis=1)
    y = df['Diabetes_percentage']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Apply SMOTE to handle class imbalance
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    
    return X_train_resampled, X_test_scaled, y_train_resampled, y_test, X.columns

# Train and evaluate model
def train_and_evaluate(X_train, X_test, y_train, y_test, model_name, feature_names):
    if model_name == 'RandomForest':
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif model_name == 'LinearRegression':
        model = LinearRegression()
    
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return {
        'Model': model_name,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }, model, y_pred, feature_names

# Run models and compare results
def run_models(df):
    results = []
    models = {}
    
    for feature_engineered in [False, True]:
        X_train, X_test, y_train, y_test, feature_names = prepare_data(df, feature_engineered)
        
        for model_name in ['RandomForest', 'LinearRegression']:
            result, model, _, feature_names = train_and_evaluate(X_train, X_test, y_train, y_test, model_name, feature_names)
            result['Feature Engineered'] = feature_engineered
            results.append(result)
            models[(model_name, feature_engineered)] = (model, feature_names)
    
    return pd.DataFrame(results), models

# Run the analysis
results, models = run_models(df)
print(results)

# Function to demonstrate predictions as percentages
def predict_diabetes_probability(model, feature_names, X, feature_engineered=True):
    original_columns = X.columns.tolist()
    
    if feature_engineered:
        X = feature_engineering(X)
    
    # Ensure all columns from the training data are present
    for col in feature_names:
        if col not in X.columns:
            X[col] = 0  # Add missing columns with default value 0
    
    # Select only the columns used during training
    X = X[feature_names]
    
    X = X.drop(['Diabetes_binary', 'Diabetes_percentage'], axis=1, errors='ignore')
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    probabilities = model.predict(X_scaled)
    return probabilities, original_columns

# Example usage
sample_data = df.sample(5)  # Take 5 random samples from the dataset
for (model_name, feature_eng), (model, feature_names) in models.items():
    print(f"\nPredictions using {model_name} {'with' if feature_eng else 'without'} feature engineering:")
    probabilities, original_columns = predict_diabetes_probability(model, feature_names, sample_data, feature_eng)
    for i, prob in enumerate(probabilities):
        print(f"Sample {i+1}: {prob:.2f}% chance of diabetes")
    
    # Print feature importances for Random Forest
    if model_name == 'RandomForest':
        importances = model.feature_importances_
        feature_imp = pd.DataFrame({'feature': feature_names, 'importance': importances})
        feature_imp = feature_imp.sort_values('importance', ascending=False).head(10)
        print("\nTop 10 Feature Importances:")
        print(feature_imp)
    
    # Print coefficients for Linear Regression
    if model_name == 'LinearRegression':
        coefficients = model.coef_
        coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
        coef_df = coef_df.sort_values('coefficient', ascending=False).head(10)
        print("\nTop 10 Coefficients:")
        print(coef_df)

              Model          MSE       RMSE        MAE        R2  \
0      RandomForest  1101.985776  33.196171  21.086063  0.073111   
1  LinearRegression  1818.355418  42.642179  35.704248 -0.529433   
2      RandomForest  1256.837138  35.451899  22.302058 -0.057136   
3  LinearRegression  1820.241202  42.664285  35.701179 -0.531019   

   Feature Engineered  
0               False  
1               False  
2                True  
3                True  

Predictions using RandomForest without feature engineering:
Sample 1: 100.00% chance of diabetes
Sample 2: 100.00% chance of diabetes
Sample 3: 69.00% chance of diabetes
Sample 4: 100.00% chance of diabetes
Sample 5: 100.00% chance of diabetes

Top 10 Feature Importances:
      feature  importance
13    GenHlth    0.282460
3         BMI    0.212421
18        Age    0.161880
0      HighBP    0.084572
20     Income    0.049987
15   PhysHlth    0.035681
19  Education    0.031167
14   MentHlth    0.026364
1    HighChol    0.016077
4    