In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load dataset
df = pd.read_csv("water_potability_Project.csv")

# Select only relevant 6 features
selected_features = ["ph", "Hardness", "Solids", "Chloramines", "Sulfate", "Turbidity"]
X = df[selected_features]
y = df["Potability"]

# Impute missing values
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42, stratify=y)

# Build model
model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[356  44]
 [171  85]]
              precision    recall  f1-score   support

           0       0.68      0.89      0.77       400
           1       0.66      0.33      0.44       256

    accuracy                           0.67       656
   macro avg       0.67      0.61      0.60       656
weighted avg       0.67      0.67      0.64       656



In [5]:
import pickle

# Save model
with open("water_potability_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save imputer or scaler
with open("water_imputer_model.pkl", "wb") as f:
    pickle.dump(imputer, f)