Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

Loading Dataset

In [10]:
# Path to the raw CSV, relative to the notebook's working directory.
dataset = "./water_potability.csv"

# Read the whole file into a DataFrame; later cells expect the global `df`.
df = pd.read_csv(filepath_or_buffer=dataset)

# Preview the first five rows as plain text.
preview = df.head(n=5)
print(preview)

         ph    Hardness        Solids  ...  Trihalomethanes  Turbidity  Potability
0       NaN  204.890455  20791.318981  ...        86.990970   2.963135           0
1  3.716080  129.422921  18630.057858  ...        56.329076   4.500656           0
2  8.099124  224.236259  19909.541732  ...        66.420093   3.055934           0
3  8.316766  214.373394  22018.417441  ...       100.341674   4.628771           0
4  9.092223  181.101509  17978.986339  ...        31.997993   4.075075           0

[5 rows x 10 columns]


Data Cleaning and Preprocessing

In [36]:
# Separate the feature columns from the target label.
X = df.drop(columns='Potability')
y = df['Potability']

# Columns that receive imputation and scaling.
numeric = X.select_dtypes(include=["float64", "int64"]).columns

# NOTE(review): the imputer and the scaler below are fit on the full dataset,
# while the train/test split happens in a later cell. Test-set statistics
# therefore leak into preprocessing; consider fitting both on the training
# portion only (e.g. inside a sklearn Pipeline) once the split is moved up.

# Replace missing values with the per-column median.
imputer = SimpleImputer(strategy="median")
X[numeric] = imputer.fit_transform(X[numeric])

# Remove duplicated feature rows (first occurrence is kept) from X *and* y.
# Dropping them from X alone would leave y with extra rows, and the later
# train_test_split(X, y, ...) would fail on mismatched sample counts.
is_duplicate = X.duplicated()
X = X.loc[~is_duplicate].copy()  # copy: avoid chained-assignment warnings below
y = y.loc[~is_duplicate]
assert X.index.equals(y.index), "features and target must stay row-aligned"

# Standardise every numeric feature to zero mean and unit variance.
scaler = StandardScaler()
X[numeric] = scaler.fit_transform(X[numeric])
print(X.head())

         ph  Hardness    Solids  ...  Organic_carbon  Trihalomethanes  Turbidity
0 -0.025474  0.259195 -0.139471  ...       -1.180651         1.305434  -1.286298
1 -2.284717 -2.036414 -0.385987  ...        0.270597        -0.639186   0.684218
2  0.697319  0.847665 -0.240047  ...        0.781117         0.000800  -1.167365
3  0.845393  0.547651  0.000493  ...        1.255134         2.152154   0.848412
4  1.372982 -0.464429 -0.460249  ...       -0.824357        -2.182297   0.138786

[5 rows x 9 columns]


Train Model

In [37]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a 100-tree random forest on the training partition
# (fit() returns the estimator itself, so `rf` is the fitted model).
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = rf.predict(X_test)

# Report overall accuracy, per-class precision/recall/F1 and the confusion matrix.
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy:  0.6585365853658537
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.89      0.76       400
           1       0.63      0.30      0.41       256

    accuracy                           0.66       656
   macro avg       0.65      0.59      0.58       656
weighted avg       0.65      0.66      0.62       656

Confusion Matrix:
 [[355  45]
 [179  77]]
