In [1]:
from google.colab import files
files.upload()
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d adityakadiwal/water-potability
!unzip water-potability.zip
import pandas as pd

water_df = pd.read_csv("water_potability.csv")
print(water_df.head())
print(water_df.info())


ModuleNotFoundError: No module named 'google.colab'

In [7]:
# ==========================
# 1. Import Libraries
# ==========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Load Dataset
# Read from the working directory (the same relative location the download
# cell reads "water_potability.csv" from) instead of the Colab-only
# "/content" prefix, so the cell also runs outside Colab.
data = pd.read_csv("water_potability.csv")
print("Initial Shape:", data.shape)
print(data.head())

# 3. Add Synthetic Features (Symptoms + Season)
# NOTE: these columns are random draws that do not depend on Potability;
# they are placeholders for real symptom/season data.
np.random.seed(42)
data["Diarrhea_Cases"] = np.random.randint(0, 10, size=len(data))
data["Fever_Cases"] = np.random.randint(0, 15, size=len(data))
data["Vomiting_Cases"] = np.random.randint(0, 12, size=len(data))

# Add seasonality (simulate contamination effect)
data["Season"] = np.random.choice(["Summer", "Winter", "Monsoon"], size=len(data))
# drop_first=True leaves two indicator columns for the three seasons.
data = pd.get_dummies(data, columns=["Season"], drop_first=True)
print("Dataset after feature engineering:", data.shape)

# 4. Split Data
X = data.drop("Potability", axis=1)
y = data["Potability"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values AFTER the split, with medians taken from the training
# rows only. Filling from the full dataset's medians would let test rows
# influence the values the model is trained on (data leakage).
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Fit the scaler on the training split only, then reuse it for the test split.
# Both outputs are NumPy arrays, so column names live on X.columns from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 5. Rebalance the training split with SMOTE
# Only X_train / y_train are resampled; the test split is not passed to SMOTE.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Report the per-class row counts of the resampled training labels.
class_counts = pd.Series(y_train).value_counts()
print("After SMOTE:", class_counts)

# 6. Train XGBoost Model
# `use_label_encoder` is intentionally not passed: the argument is deprecated
# and recent XGBoost releases only warn that it is unused.
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,         # fraction of rows sampled for each tree
    colsample_bytree=0.8,  # fraction of columns sampled for each tree
    eval_metric="logloss",
    random_state=42
)
model.fit(X_train, y_train)

# Predictions
# y_pred holds hard class labels; y_prob holds the probability of class 1
# (second column of predict_proba), used for the ROC metrics.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# 7. Evaluation
print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_prob)
print(f"🔥 ROC-AUC: {roc_auc:.4f}")

# Confusion Matrix
# Rows/columns are pinned to label order [0, 1]. In this dataset
# Potability == 1 marks potable (safe) water, so index 0 is "Unsafe" and
# index 1 is "Safe" -- the tick labels must follow that order.
class_names = ["Unsafe", "Safe"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
ax = sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues",
    xticklabels=class_names, yticklabels=class_names,
)
# confusion_matrix puts true classes on rows and predictions on columns.
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f"XGBoost (AUC={roc_auc:.2f})")
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0,1], [0,1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

# 8. SHAP Explainability
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# X_test is a NumPy array after scaling, so column names come from X.
feature_names = list(X.columns)

# Feature Importance
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

# Force plot for a single prediction (first test row).
# matplotlib=True renders a static figure, so the JavaScript bootstrap
# (shap.initjs) is not needed; feature names are passed so the plot is labeled
# with column names instead of positional indices.
shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    X_test[0],
    feature_names=feature_names,
    matplotlib=True,
)

import joblib
# The model was trained on standardized features, so the fitted scaler is
# saved too; without it new raw samples cannot be transformed for prediction.
joblib.dump(scaler, 'sihscaler.pkl')
joblib.dump(model, 'sihmodel.pkl')

ModuleNotFoundError: No module named 'sklearn'