In [None]:
# ===================== Imports ===================== #
import pandas as pd
import numpy as np
import gradio as gr
import xgboost as xgb
import tensorflow as tf
import logging

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Conv1D, MaxPooling1D, Flatten, Add

import google.generativeai as genai

# ===================== Data Loading & Preprocessing ===================== #
# Order matters: the train/test split happens FIRST, and every fitted step
# (imputer, scaler, SMOTE) only ever sees training rows. Fitting them on the
# full dataset -- or oversampling before the split -- leaks test information
# into training and makes the reported metrics optimistically biased.
df = pd.read_csv("1-DATA/Water_Potability.csv")

# Rows without a label cannot be used for supervised learning; mean-imputing the
# target would invent fractional class labels.
df = df.dropna(subset=["Potability"])

# X / y hold the raw (unscaled, un-resampled) features and labels.
# X.shape[1] is used further down as the number of input features.
X = df.drop("Potability", axis=1)
y = df["Potability"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=100
)

# Fit imputation and scaling statistics on the training rows only, then apply
# the same fitted transforms to the held-out rows.
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train))
X_test = scaler.transform(imputer.transform(X_test))

# Oversample the minority class in the training set only; the test set keeps
# its natural class balance and contains no synthetic samples.
X_train, y_train = SMOTE(random_state=100).fit_resample(X_train, y_train)

# SMOTE appends its synthetic rows at the end. Keras' `validation_split` takes
# the LAST fraction of the data without shuffling, so shuffle here to keep
# both the training and validation portions class-balanced.
shuffle_order = np.random.default_rng(100).permutation(len(X_train))
X_train = np.asarray(X_train)[shuffle_order]
y_train = np.asarray(y_train)[shuffle_order]

# Conv1D expects (samples, steps, channels): treat each feature as one step
# with a single channel.
X_train_cnn = X_train.reshape(-1, X.shape[1], 1)
X_test_cnn = X_test.reshape(-1, X.shape[1], 1)

# ===================== Model Training ===================== #

# --- XGBoost --- #
# Gradient-boosted trees baseline; hyperparameters are gathered in one place.
xgb_params = {"n_estimators": 200, "learning_rate": 0.1, "max_depth": 5}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
xgb_class_probs = xgb_model.predict_proba(X_test)
xgb_probs = xgb_class_probs[:, 1]
xgb_is_positive = xgb_probs > 0.5
y_pred_xgb = xgb_is_positive.astype(int)

# --- MLP --- #
# Three shrinking ReLU layers (128 -> 64 -> 32), each followed by 30% dropout,
# ending in a single sigmoid unit for the binary label.
mlp = Sequential()
mlp.add(Input(shape=(X.shape[1],)))
for mlp_units in (128, 64, 32):
    mlp.add(Dense(mlp_units, activation="relu"))
    mlp.add(Dropout(0.3))
mlp.add(Dense(1, activation="sigmoid"))

mlp.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
mlp.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)

# predict() returns shape (n, 1); collapse it to a 1-D probability vector.
mlp_raw_output = mlp.predict(X_test)
mlp_probs = mlp_raw_output.flatten()
y_pred_mlp = (mlp_probs > 0.5).astype(int)

# --- CNN --- #
# 1-D convolution over the feature axis (each feature is one "time step" with a
# single channel), followed by a small dense head.
cnn_layers = [
    Input(shape=(X.shape[1], 1)),
    Conv1D(filters=64, kernel_size=3, activation="relu"),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation="relu"),
    Flatten(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
]
cnn = Sequential(cnn_layers)

cnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
cnn.fit(
    X_train_cnn,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)

# predict() returns shape (n, 1); collapse it to a 1-D probability vector.
cnn_raw_output = cnn.predict(X_test_cnn)
cnn_probs = cnn_raw_output.flatten()
y_pred_cnn = (cnn_probs > 0.5).astype(int)

# --- DNN --- #
# Wider dense network: two regularised stages (Dense -> BatchNorm -> 40% dropout)
# followed by a plain 64-unit layer and the sigmoid output.
dnn = Sequential()
dnn.add(Input(shape=(X.shape[1],)))
for dnn_units in (256, 128):
    dnn.add(Dense(dnn_units, activation="relu"))
    dnn.add(BatchNormalization())
    dnn.add(Dropout(0.4))
dnn.add(Dense(64, activation="relu"))
dnn.add(Dense(1, activation="sigmoid"))

dnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
dnn.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)

# predict() returns shape (n, 1); collapse it to a 1-D probability vector.
dnn_raw_output = dnn.predict(X_test)
dnn_probs = dnn_raw_output.flatten()
y_pred_dnn = (dnn_probs > 0.5).astype(int)

# --- ResNet Model --- #
def build_resnet(input_shape):
    """Build a dense binary classifier with a single residual (skip) connection.

    Layout: a 128-unit stem, one residual branch of two Dense+BatchNorm stages
    whose output is added back onto the stem, then a 64 -> 32 dense head and a
    one-unit sigmoid output.

    Args:
        input_shape: Shape tuple of one sample, passed straight to ``Input``.

    Returns:
        An uncompiled ``Model`` mapping the input to a sigmoid output.
    """
    net_input = Input(shape=input_shape)

    # Stem: project the input to the 128-wide space the skip connection uses.
    trunk = Dense(128, activation="relu")(net_input)
    trunk = BatchNormalization()(trunk)
    trunk = Dropout(0.3)(trunk)

    # Residual branch: two identical Dense -> BatchNorm stages. The width stays
    # at 128 so the element-wise Add below is shape-compatible.
    branch = trunk
    for _ in range(2):
        branch = Dense(128, activation="relu")(branch)
        branch = BatchNormalization()(branch)

    merged = Add()([trunk, branch])

    # Classification head.
    head = Dense(64, activation="relu")(merged)
    head = BatchNormalization()(head)
    head = Dropout(0.3)(head)
    head = Dense(32, activation="relu")(head)
    prediction = Dense(1, activation="sigmoid")(head)

    return Model(net_input, prediction)

# Train the residual network with the same recipe as the other Keras models.
resnet_input_shape = (X.shape[1],)
resnet = build_resnet(resnet_input_shape)
resnet.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
resnet.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)

# predict() returns shape (n, 1); collapse it to a 1-D probability vector.
resnet_raw_output = resnet.predict(X_test)
resnet_probs = resnet_raw_output.flatten()
y_pred_resnet = (resnet_probs > 0.5).astype(int)

# ===================== Evaluation ===================== #
# One row per model. Label-based metrics use the thresholded predictions;
# AUC uses the raw positive-class probabilities.
model_outputs = {
    "XGBoost": (y_pred_xgb, xgb_probs),
    "MLP": (y_pred_mlp, mlp_probs),
    "CNN": (y_pred_cnn, cnn_probs),
    "DNN": (y_pred_dnn, dnn_probs),
    "ResNet": (y_pred_resnet, resnet_probs),
}
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

metric_columns = {"Model": list(model_outputs)}
for metric_name, metric_fn in label_metrics.items():
    metric_columns[metric_name] = [
        metric_fn(y_test, predicted_labels)
        for predicted_labels, _ in model_outputs.values()
    ]
metric_columns["AUC"] = [
    roc_auc_score(y_test, positive_probs)
    for _, positive_probs in model_outputs.values()
]

metrics_df = pd.DataFrame(metric_columns)

# Round every numeric column (everything after "Model") to 3 decimals.
rounded_scores = metrics_df.iloc[:, 1:].round(3)
metrics_df.iloc[:, 1:] = rounded_scores

# ===================== Gemini API Setup ===================== #
# The API key is read from the environment so the secret never lives in the
# source file or in an exported notebook. Set GEMINI_API_KEY (or GOOGLE_API_KEY)
# before running. `model_gemini` is None when Gemini is unavailable; the code
# that builds the responses falls back to a placeholder message in that case.
#
# SECURITY: any key that was ever committed in this file must be treated as
# compromised and revoked in the Google AI Studio / Cloud console.
import os

gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")

model_gemini = None
if gemini_api_key:
    try:
        genai.configure(api_key=gemini_api_key)
        model_gemini = genai.GenerativeModel("gemini-1.5-flash")
    except Exception:
        # Best-effort: the dashboard still works without Gemini, but record why
        # the client could not be created instead of hiding the failure.
        logging.getLogger(__name__).exception("Could not initialise the Gemini client")
        model_gemini = None
else:
    logging.getLogger(__name__).warning(
        "No Gemini API key found in GEMINI_API_KEY / GOOGLE_API_KEY; "
        "Gemini insights are disabled."
    )

# ===================== Prompts & Gemini Responses ===================== #
# Plain-text rendering of the metrics table, embedded verbatim in every prompt.
summary = metrics_df.to_string(index=False)

explain_prompt = f"""
I trained 5 models (XGBoost, MLP, CNN, DNN, ResNet) to classify water potability. Here's the summary:

{summary}

Can you explain what these results mean in simple terms?
"""

best_model_prompt = f"""
Based on the following model performance metrics, which model would you recommend?

{summary}

Explain which one is best and why, in terms of generalization and real-world use.
"""

improve_weakest_prompt = f"""
Below is a summary of model performance on water potability classification:

{summary}

Which model performed the worst, and what strategies could I use to improve its performance?
"""

# Keys double as the labels shown in the UI; insertion order is display order.
predefined_prompts = {
    "Explain model performance": explain_prompt,
    "Which model is best and why?": best_model_prompt,
    "Suggest how to improve the weakest model": improve_weakest_prompt,
}

# Query Gemini once per prompt up front so the UI callback is a pure lookup.
if not model_gemini:
    # No client available: every prompt maps to the same placeholder.
    precomputed_responses = dict.fromkeys(
        predefined_prompts, "❌ Gemini not available. Check API key."
    )
else:
    precomputed_responses = {}
    for prompt_label, prompt_text in predefined_prompts.items():
        try:
            gemini_answer = model_gemini.generate_content(prompt_text).text.strip()
        except Exception as gemini_error:
            # A failure on one prompt must not prevent the others from loading.
            gemini_answer = f"⚠️ Gemini error: {gemini_error}"
        precomputed_responses[prompt_label] = gemini_answer

# ===================== Gradio Interface ===================== #
def get_response(prompt_key):
    """Return the cached Gemini answer for the selected prompt.

    Args:
        prompt_key: Key to look up in ``precomputed_responses``.

    Returns:
        The stored response text, or an "invalid selection" message when the
        key has no stored response.
    """
    try:
        return precomputed_responses[prompt_key]
    except KeyError:
        return "❌ Invalid selection."

# Only surface errors from Gradio's own logger.
gradio_logger = logging.getLogger("gradio")
gradio_logger.setLevel(logging.ERROR)

# Custom CSS applied to the page (targets elements tagged "full-width").
page_css = ".full-width .scroll-hide { overflow-x: hidden !important; }"
prompt_choices = list(predefined_prompts)

with gr.Blocks(css=page_css) as demo:
    gr.Markdown("## 💧 Water Potability Model Evaluation & Gemini Insights")

    # --- Metrics table (read-only) --- #
    gr.Markdown("### 📊 Model Evaluation Metrics")
    gr.Dataframe(
        value=metrics_df,
        label="Performance Summary",
        interactive=False,
        render=True,
        elem_classes="full-width",
    )

    # --- Prompt picker and response area --- #
    gr.Markdown("### 🤖 Ask Gemini About the Results")
    prompt_selector = gr.Radio(
        choices=prompt_choices,
        label="Select a Prompt",
        interactive=True,
    )
    response_box = gr.Markdown()

    # Selecting a prompt swaps in its precomputed answer.
    prompt_selector.change(
        fn=get_response,
        inputs=prompt_selector,
        outputs=response_box,
    )

    # share=True requests a temporary public URL for the app.
    demo.launch(share=True, inline=False, debug=False)

25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4b220d8cdd68bbc469.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
