In [1]:
# === SETUP: Install Required Packages ===
!pip install xgboost --quiet

# === FILE UPLOAD ===
from google.colab import files
uploaded = files.upload()  # Upload your 'water_potability.csv'

# === IMPORTS ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# === LOAD & CLEAN DATA ===
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)
df.columns = df.columns.str.strip().str.lower()  # Standardize column names

# === PART 1: IRRIGATION SUITABILITY CLASSIFICATION ===
def classify_irrigation_suitability(row):
    ph = row['ph']
    hardness = row['hardness']
    sulfate = row['sulfate']
    solids = row['solids']
    turbidity = row['turbidity']
    if (6.5 <= ph <= 8.8) and (hardness <= 180) and (sulfate <= 250) and (solids <= 16000) and (1.0 <= turbidity <= 5.5):
        return 1
    else:
        return 0

df['irrigation_suitability'] = df.apply(classify_irrigation_suitability, axis=1)

# Introduce label noise
np.random.seed(42)
mask = np.random.rand(len(df)) < 0.20
df.loc[mask, 'irrigation_suitability'] = 1 - df.loc[mask, 'irrigation_suitability']

features_irrigation = ['ph', 'solids', 'hardness', 'sulfate', 'conductivity']
X_irr = df[features_irrigation]
y_irr = df['irrigation_suitability']

X_train_irr, X_test_irr, y_train_irr, y_test_irr = train_test_split(X_irr, y_irr, test_size=0.35, random_state=42)

model_irr = RandomForestClassifier(n_estimators=120, random_state=42, max_depth=8)
model_irr.fit(X_train_irr, y_train_irr)

y_pred_irr = model_irr.predict(X_test_irr)
print("=== Irrigation Suitability ===")
print(f"Accuracy: {accuracy_score(y_test_irr, y_pred_irr) * 100:.2f}%")
print(classification_report(y_test_irr, y_pred_irr))

# === PART 2: WATER POTABILITY PREDICTION ===
imputer = SimpleImputer(strategy="mean")
df_imputed = pd.DataFrame(imputer.fit_transform(df.drop(columns=['irrigation_suitability'])),
                          columns=df.drop(columns=['irrigation_suitability']).columns)

X = df_imputed.drop("potability", axis=1)
y = df_imputed["potability"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)

# Updated evaluation function
def evaluate_model(model, name):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"\n=== {name} ===")
    print(f"Training Accuracy: {train_acc * 100:.2f}%")
    print(f"Testing Accuracy: {test_acc * 100:.2f}%")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
    print("Classification Report:\n", classification_report(y_test, y_test_pred))

# Models list
models = [
    (KNeighborsClassifier(n_neighbors=5, p=2), "K-Nearest Neighbors"),
    (DecisionTreeClassifier(criterion='entropy', max_depth=8, splitter='best'), "Decision Tree"),
    (RandomForestClassifier(criterion='entropy', n_estimators=110, max_depth=10), "Random Forest"),
    (GaussianNB(), "Naive Bayes"),
    (LogisticRegression(max_iter=200, n_jobs=-1, random_state=0), "Logistic Regression"),
    (XGBClassifier(learning_rate=0.01, max_depth=5, n_estimators=110,
                   use_label_encoder=False, eval_metric='logloss', random_state=42), "XGBoost")
]

print("\n=== Water Potability Prediction ===")
for model, name in models:
    evaluate_model(model, name)



Saving water_potability (2).csv to water_potability (2).csv
=== Irrigation Suitability ===
Accuracy: 78.99%
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       905
           1       0.60      0.01      0.02       242

    accuracy                           0.79      1147
   macro avg       0.70      0.51      0.45      1147
weighted avg       0.75      0.79      0.70      1147


=== Water Potability Prediction ===

=== K-Nearest Neighbors ===
Training Accuracy: 75.71%
Testing Accuracy: 62.77%
Confusion Matrix:
 [[476 141]
 [225 141]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.68      0.77      0.72       617
         1.0       0.50      0.39      0.44       366

    accuracy                           0.63       983
   macro avg       0.59      0.58      0.58       983
weighted avg       0.61      0.63      0.62       983


=== Decision Tree ===
Training Accuracy: 70.30%
Testin

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
Training Accuracy: 69.78%
Testing Accuracy: 64.60%
Confusion Matrix:
 [[577  40]
 [308  58]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.65      0.94      0.77       617
         1.0       0.59      0.16      0.25       366

    accuracy                           0.65       983
   macro avg       0.62      0.55      0.51       983
weighted avg       0.63      0.65      0.58       983

