In [28]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer

# Step 2: Load dataset
df = pd.read_csv("water_potability.csv")

# Step 3: Feature columns
feature_columns = ['ph','Hardness','Solids','Chloramines','Sulfate',
                   'Conductivity','Organic_carbon','Trihalomethanes','Turbidity']

# Step 4: Fill missing values using KNN Imputer
imputer = KNNImputer(n_neighbors=5)
df[feature_columns] = imputer.fit_transform(df[feature_columns])

# Step 5: Define features and target
X = df[feature_columns]
y = df['Potability']

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 8: Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Step 9: Train Random Forest
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train_res, y_train_res)

# Step 10: Evaluate model
y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 11: Function to predict water potability reliably
def predict_water_potability(user_input, threshold=0.5):
    """
    Combines rule-based safety checks with Random Forest prediction.
    user_input: list of 9 values in order of feature_columns
    threshold: probability threshold for predicting Drinkable
    """
    # Rule-based extreme unsafe water checks
    ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity = user_input

    # Basic safety rules (common safe limits)
    if ph < 6.5 or ph > 8.5:
        return "Not Drinkable ❌"
    if solids > 2000 or chloramines > 10 or sulfate > 400:
        return "Not Drinkable ❌"
    if turbidity > 5 or trihalomethanes > 100:
        return "Not Drinkable ❌"

    # If it passes safety rules, use Random Forest
    user_df = pd.DataFrame([user_input], columns=feature_columns)
    user_scaled = scaler.transform(user_df)
    prob = model.predict_proba(user_scaled)[0][1]  # probability of class 1
    return "Drinkable ✅" if prob >= threshold else "Not Drinkable ❌"

# Step 12: Test inputs
test_inputs = [
   [7.0, 160, 900, 3.5, 180, 350, 8, 30, 2.8],
    [5.8, 200, 1500, 5.0, 220, 400, 12, 50, 3.5],
    [8.2, 250, 2500, 6.0, 450, 600, 20, 60, 4.0],
    [7.4, 190, 1000, 2.5, 180, 370, 10, 45, 3.0],
    [9.0, 280, 3000, 12.0, 500, 800, 25, 110, 6.0],
    [6.8, 170, 1200, 4.0, 200, 360, 14, 40, 3.2],
    [7.5, 210, 1800, 3.0, 190, 380, 16, 55, 4.5],
    [8.7, 300, 3500, 11.0, 480, 900, 28, 120, 6.5]

]

# Step 13: Run predictions
for i, inp in enumerate(test_inputs):
    print(f"Test {i+1}: Prediction = {predict_water_potability(inp, threshold=0.5)}")


Accuracy: 0.6402439024390244

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.77      0.72       400
           1       0.55      0.44      0.49       256

    accuracy                           0.64       656
   macro avg       0.62      0.60      0.60       656
weighted avg       0.63      0.64      0.63       656

Test 1: Prediction = Drinkable ✅
Test 2: Prediction = Not Drinkable ❌
Test 3: Prediction = Not Drinkable ❌
Test 4: Prediction = Drinkable ✅
Test 5: Prediction = Not Drinkable ❌
Test 6: Prediction = Drinkable ✅
Test 7: Prediction = Drinkable ✅
Test 8: Prediction = Not Drinkable ❌
