Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer

Loading Dataset

In [2]:
# Path of the raw CSV, relative to the notebook's working directory.
dataset = "./water_potability.csv"

# Read the whole file into a DataFrame; the preview below already shows NaN
# entries in some feature columns (e.g. `ph`, `Sulfate`).
df = pd.read_csv(dataset)

# Show the first five rows as a quick sanity check of columns and values.
print(df.head(n=5))

         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.558279        31.997993   4.075075           0  


Data Cleaning and Preprocessing

In [3]:
# Remove exact duplicate rows *before* separating features from the target so
# that X and y always stay row-aligned. (Dropping duplicates from X in place
# after imputation, as was done before, never reached X_imputed / X_scaled and
# would have left X with fewer rows than y.)
df_dedup = df.drop_duplicates().reset_index(drop=True)

X = df_dedup.drop('Potability', axis=1)
y = df_dedup['Potability']

# Fill missing feature values using the 5 nearest neighbouring rows.
imputer = KNNImputer(n_neighbors=5)
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,  # keep the same index as y so later joins/alignment are safe
)

# Scale every numeric feature with RobustScaler; non-numeric columns (if any)
# are carried over unchanged from the imputed frame.
scaler = RobustScaler()
numeric = X_imputed.select_dtypes(include=[np.number]).columns
X_scaled = X_imputed.copy()
X_scaled[numeric] = scaler.fit_transform(X_imputed[numeric])

# Guard against any feature/target misalignment introduced by preprocessing.
assert len(X_scaled) == len(y), "features and target must have the same number of rows"

# NOTE(review): the imputer and scaler are fitted on the full dataset, i.e.
# before the train/test split performed in the training cell, so test-set
# statistics leak into preprocessing. Consider splitting first and fitting
# both on the training portion only (or wrapping them in a Pipeline).

Training and Evaluating the Model

In [4]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random forest of 100 trees with a fixed seed; `fit` returns the estimator
# itself, so construction and training can be chained.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out rows and report the standard classification metrics.
y_pred = rf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy:  0.6524390243902439
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.88      0.75       400
           1       0.61      0.30      0.40       256

    accuracy                           0.65       656
   macro avg       0.64      0.59      0.58       656
weighted avg       0.64      0.65      0.62       656

Confusion Matrix:
 [[351  49]
 [179  77]]


Saving Model

In [6]:
# Persist every fitted object needed to reproduce the training-time
# preprocessing at inference: the classifier, the KNN imputer (previously not
# saved, so inputs with missing values could not be imputed the same way), and
# the scaler. The scaler is dumped last so the cell's displayed result is
# unchanged; the trailing `;` suppresses the intermediate return values.
joblib.dump(rf, 'water_potability_model.pkl');
joblib.dump(imputer, 'imputer.pkl');
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']