In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ===============================
# Import Required Libraries
# ===============================

import pandas as pd          # For data handling (tables, rows, columns)
import numpy as np           # For numerical operations
import matplotlib.pyplot as plt  # For plotting graphs
import seaborn as sns        # For advanced visualizations

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Apply seaborn's "whitegrid" style to all plots drawn after this cell.
PLOT_STYLE = "whitegrid"
sns.set(style=PLOT_STYLE)


In [None]:
# ===============================
# Load the Dataset
# ===============================

# Read the competition's train and test CSV files into DataFrames.
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')

separator = '-' * 80

# Row/column counts of both frames.
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(separator)

# Number of missing values in each training column.
missing_per_column = train.isna().sum()
print(missing_per_column)
print(separator)

# Summary statistics as produced by DataFrame.describe().
summary_stats = train.describe()
print(summary_stats)

In [None]:
# ===============================
# Target Variable Distribution
# ===============================

# Row count per target class (diabetic vs non-diabetic patients);
# left as the cell's last expression so the notebook displays it.
target_counts = train['diagnosed_diabetes'].value_counts()
target_counts

In [None]:
# Bar chart of how many training rows fall in each target class.
ax = sns.countplot(data=train, x='diagnosed_diabetes')
ax.set_title("Diabetes Diagnosis Distribution")
plt.show()


In [None]:
# ===============================
# Age Distribution Analysis
# ===============================

age_note = '''
Age spread of patients

Helps see which age group is dominant
'''
print(age_note)

# Histogram of the `age` column (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(train['age'], bins=30, kde=True, ax=ax)
ax.set_title("Age Distribution of Patients")
plt.show()


In [None]:
# ===============================
# BMI vs Diabetes
# ===============================

# The arrow is written as a \u escape so the printed text does not depend on
# how the notebook file is encoded (the literal character had been garbled).
print('''
BMI comparison between diabetic and non-diabetic patients

Shows higher BMI \u2192 higher diabetes risk
''')

# Box plot of `bmi`, one box per value of the target column.
plt.figure(figsize=(8,5))
sns.boxplot(x='diagnosed_diabetes', y='bmi', data=train)
plt.title("BMI vs Diabetes Status")
plt.show()


In [None]:
# ===============================
# Physical Activity vs Diabetes
# ===============================

print('Less physical activity is linked to diabetes')

# Box plot of weekly activity minutes, one box per value of the target column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=train,
    x='diagnosed_diabetes',
    y='physical_activity_minutes_per_week',
    ax=ax,
)
ax.set_title("Physical Activity vs Diabetes")
plt.show()


In [None]:
# ===============================
# Correlation Analysis
# ===============================

# Sub-frame holding only the int64/float64 columns of `train`
# (despite its name, `num_cols` is a DataFrame, not a list of column names).
num_cols = train.select_dtypes(include=['int64', 'float64'])

# Pairwise correlation matrix of those columns; `corr` is reused by a later cell.
corr = num_cols.corr()

# Heatmap of the matrix without per-cell value annotations.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# ===============================
# Top Features Correlated with Diabetes
# ===============================

correlation_note = '''Features most positively and negatively related to diabetes

Useful for feature selection '''
print(correlation_note)

# Column of the correlation matrix for the target, highest value first.
# The target's own entry is part of this column, so it appears in the listing too.
corr_target = corr.loc[:, 'diagnosed_diabetes'].sort_values(ascending=False)
corr_target


In [None]:
# ===============================
# Categorical Feature Distribution
# ===============================

# Columns to chart; `categorical_cols` is reassigned by a later cell.
categorical_cols = [
    'gender',
    'ethnicity',
    'education_level',
    'income_level',
    'smoking_status',
    'employment_status',
]

# One count plot per listed column, each on its own 6x4 figure.
for feature in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=train, x=feature, ax=ax)
    plt.xticks(rotation=45)  # rotate the x tick labels by 45 degrees
    ax.set_title(f"Distribution of {feature}")
    plt.show()


In [None]:
# ===============================
# Family History vs Diabetes
# ===============================

print('Patients with family history show higher diabetes risk')

# Count of rows per family-history value, with bars split by the target column.
ax = sns.countplot(
    data=train,
    x='family_history_diabetes',
    hue='diagnosed_diabetes',
)
ax.set_title("Family History of Diabetes vs Diagnosis")
plt.show()


In [None]:
# ===============================
# Separate Features (X) and Target (y)
# ===============================

TARGET_COL = 'diagnosed_diabetes'
ID_COL = 'id'  # assumed name of the row-identifier column — TODO confirm against train.csv

# Target variable
y = train[TARGET_COL]

# Input features: everything except the target and the row identifier.
# An identifier left in X would be picked up as a numeric column, scaled and
# given to the models as if it were a real predictor. `errors='ignore'` keeps
# this cell working if the frame has no such column.
# Frames passed to the fitted pipeline later may still contain the identifier:
# the ColumnTransformer selects its columns by name and, with the default
# remainder='drop', ignores the rest.
X = train.drop(columns=[TARGET_COL]).drop(columns=[ID_COL], errors='ignore')


In [None]:
# ===============================
# Identify Column Types
# ===============================

NUMERIC_DTYPES = ['int64', 'float64']
TEXT_DTYPES = ['object']

# Column labels of X grouped by dtype; both are consumed by the preprocessor cell.
# NOTE(review): a column whose dtype is in neither list (e.g. int32, bool,
# category) ends up in neither group.
numerical_cols = X.select_dtypes(include=NUMERIC_DTYPES).columns
categorical_cols = X.select_dtypes(include=TEXT_DTYPES).columns

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)


In [None]:
# Preprocessing for numerical features: a single standard-scaling step.
scaling_note = '''Brings all numerical features to the same scale

Improves ML model performance'''
print(scaling_note)

numeric_transformer = Pipeline([('scaler', StandardScaler())])


In [None]:
# Preprocessing for categorical features: one-hot encoding.
encoding_note = '''Converts text categories into numbers

Avoids errors with unseen values'''
print(encoding_note)

# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
one_hot = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline([('onehot', one_hot)])


In [None]:
# Route each column group to its own transformer inside one ColumnTransformer.
print('Combine numerical & categorical preprocessing')

column_steps = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [None]:
# ===============================
# Split Data
# ===============================

from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print("Training size:", X_train.shape)
print("Validation size:", X_val.shape)


In [None]:
# ===============================
# Logistic Regression Model
# ===============================
print('Logistic Regression Model')
# The description below is about logistic regression itself; the previous text
# ("Handles non-linear relationships ... Usually better than Logistic
# Regression") described a different model.
print('''Linear baseline classifier

Fast to train on large datasets

Outputs class probabilities''')

# LogisticRegression is the estimator; the two metric functions are used by
# the evaluation cells that follow.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score


In [None]:
# Chain the shared preprocessor with a logistic regression estimator.
print('Create pipeline with preprocessing + model')

logistic = LogisticRegression(max_iter=1000)
log_reg = Pipeline([
    ('preprocessor', preprocessor),
    ('model', logistic),
])


In [None]:
# Fit on the training split.
log_reg.fit(X_train, y_train)

# Hard labels and second-column probabilities for the validation split.
y_pred_lr = log_reg.predict(X_val)
y_pred_lr_proba = log_reg.predict_proba(X_val)[:, 1]

# Accuracy is computed from the labels, ROC-AUC from the probabilities.
lr_accuracy = accuracy_score(y_val, y_pred_lr)
lr_roc_auc = roc_auc_score(y_val, y_pred_lr_proba)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression ROC-AUC:", lr_roc_auc)


In [None]:
# ===============================
# Random Forest Model
# ===============================
section_title = 'Random Forest Model'
print(section_title)

# Estimator used by the pipeline built in the next cell.
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Random forest: 100 trees, depth capped at 10, fixed seed, n_jobs=-1.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)

# Same preprocessing as the logistic regression pipeline, different estimator.
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', forest),
])


In [None]:
# Fit on the training split.
rf_model.fit(X_train, y_train)

# Hard labels and second-column probabilities for the validation split.
y_pred_rf = rf_model.predict(X_val)
y_pred_rf_proba = rf_model.predict_proba(X_val)[:, 1]

# Accuracy is computed from the labels, ROC-AUC from the probabilities.
rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_roc_auc = roc_auc_score(y_val, y_pred_rf_proba)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest ROC-AUC:", rf_roc_auc)


In [None]:
# ===============================
# Compare Models
# ===============================
comparison_note = '''Side-by-side comparison

Helps choose the best model'''
print(comparison_note)

# (display name, validation labels, validation probabilities) for each model.
scored_models = [
    ('Logistic Regression', y_pred_lr, y_pred_lr_proba),
    ('Random Forest', y_pred_rf, y_pred_rf_proba),
]

# One row per model; both metrics are computed against the validation target.
results = pd.DataFrame({
    'Model': [name for name, _labels, _proba in scored_models],
    'Accuracy': [
        accuracy_score(y_val, labels) for _name, labels, _proba in scored_models
    ],
    'ROC_AUC': [
        roc_auc_score(y_val, proba) for _name, _labels, proba in scored_models
    ],
})

results


In [None]:
# ===============================
# Train Final Model on Full Dataset
# ===============================

from sklearn.base import clone

# Map the names used in the `results` table to the pipelines they describe.
candidates = {
    'Logistic Regression': log_reg,
    'Random Forest': rf_model,
}

# Pick the pipeline with the highest validation ROC-AUC instead of hard-coding it.
best_name = results.loc[results['ROC_AUC'].idxmax(), 'Model']
print("Final model:", best_name)

# clone() gives an unfitted copy with the same parameters, so refitting on all
# rows does not overwrite the pipeline that was validated on the hold-out split.
final_model = clone(candidates[best_name])

final_model.fit(X, y)


In [None]:
# ===============================
# Test Predictions
# ===============================

# Probability from the second predict_proba column, the same column the
# validation cells score with ROC-AUC. ROC-AUC ranks rows by score, so hard
# 0/1 labels would discard the ranking information.
# NOTE(review): assumes the submission is scored on ROC-AUC like the validation
# above — confirm on the competition's evaluation page; an accuracy-style
# metric would need final_model.predict(test) instead.
test_predictions = final_model.predict_proba(test)[:, 1]

# Preview of the first ten predicted probabilities.
test_predictions[:10]


In [None]:
# ===============================
# Submission File
# ===============================

# Start from the sample submission and overwrite its target column.
# The predictions are assigned by position, so this assumes the sample file
# lists rows in the same order as `test` — TODO confirm.
submission = pd.read_csv(
    '/kaggle/input/playground-series-s5e12/sample_submission.csv'
).assign(diagnosed_diabetes=test_predictions)

# Written to the current working directory, without the index column.
submission.to_csv('submission.csv', index=False)

submission.head()
