In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (classification_report,confusion_matrix,accuracy_score,mean_squared_error,r2_score)

In [28]:
import io

from google.colab import files

# files.upload() opens Colab's upload widget and returns a dict that maps each
# uploaded file's name to its raw bytes.
uploaded=files.upload()
if uploaded:
    # Parse the bytes that were just uploaded instead of re-reading the fixed
    # path "water_potability.csv". When a file with that name already exists,
    # Colab stores the new upload under a suffixed name (for example
    # "water_potability (2).csv"), so the fixed path would silently load a
    # stale copy left over from an earlier upload.
    uploaded_bytes=next(iter(uploaded.values()))
    df=pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a
    # copy that is already on disk, which is what this cell used to read.
    df=pd.read_csv("water_potability.csv")
df.head()


Saving water_potability.csv to water_potability (2).csv


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [29]:
# Show how many values are missing in each column before imputing.
missing_counts=df.isna().sum()
print(missing_counts)

# Fill every gap with the median of its own column.
column_medians=df.median()
df=df.fillna(value=column_medians)

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64


In [30]:
# Separate the Potability label from the feature columns, then hold out 20%
# of the rows for testing (seeded so the split is repeatable).
y=df["Potability"]
X=df.drop(columns=["Potability"])
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit the seeded random forest in one step; fit() returns the
# estimator itself.
clf=RandomForestClassifier(random_state=42).fit(X_train,y_train)

# Evaluate on the held-out rows: overall accuracy, confusion matrix, and the
# per-class precision/recall report.
y_pred=clf.predict(X_test)
test_accuracy=accuracy_score(y_test,y_pred)
print("Accuracy:",test_accuracy)
test_confusion=confusion_matrix(y_test,y_pred)
print(test_confusion)
test_report=classification_report(y_test,y_pred)
print(test_report)


Accuracy: 0.6737804878048781
[[350  62]
 [152  92]]
              precision    recall  f1-score   support

           0       0.70      0.85      0.77       412
           1       0.60      0.38      0.46       244

    accuracy                           0.67       656
   macro avg       0.65      0.61      0.61       656
weighted avg       0.66      0.67      0.65       656



In [31]:
def compute_safety_score(row):
    """Rate one water sample on a 1-10 scale using fixed threshold rules.

    The sample starts at 10 points. Two points are deducted when ``ph`` lies
    outside the 6.5-8.5 band, and further points are deducted for every
    measurement that is strictly above its upper limit (see the table in the
    body). The result is never lower than 1.

    Args:
        row: Mapping-like object (for example a DataFrame row) indexable by
            the column names 'ph', 'Hardness', 'Solids', 'Chloramines',
            'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes'
            and 'Turbidity'.

    Returns:
        Integer score between 1 and 10 inclusive.
    """
    # (column, upper limit, points deducted when the value exceeds the limit)
    upper_limits=(
        ('Hardness',300,1),
        ('Solids',500,2),
        ('Chloramines',4,1),
        ('Sulfate',400,1),
        ('Conductivity',500,1),
        ('Organic_carbon',5,1),
        ('Trihalomethanes',80,1),
        ('Turbidity',5,1),
    )

    # Written as two strict comparisons on purpose: a NaN ph fails both and
    # therefore costs nothing.
    ph_value=row['ph']
    deductions=2 if (ph_value < 6.5 or ph_value > 8.5) else 0

    for column,limit,penalty in upper_limits:
        if row[column] > limit:
            deductions+=penalty

    # Clamp so that a sample failing every rule still scores 1, not below.
    return max(1,10-deductions)

# Score every sample row by row and store the result as a new column.
safety_scores=df.apply(compute_safety_score,axis="columns")
df["SafetyScore"]=safety_scores

# Preview the label next to the rule-based score.
df.loc[:,["Potability","SafetyScore"]].head()


Unnamed: 0,Potability,SafetyScore
0,0,4
1,0,3
2,0,6
3,0,5
4,0,4


In [32]:
# Regression features: every column except the classification label and the
# regression target itself.
Xr=df.drop(["Potability","SafetyScore"],axis=1)
yr=df["SafetyScore"]
Xr_train,Xr_test,yr_train,yr_test=train_test_split(Xr,yr,test_size=0.2,random_state=42)

# Seed the forest so the metrics below are reproducible from run to run,
# matching the seeded classifier and the seeded train/test splits.
reg=RandomForestRegressor(random_state=42)
reg.fit(Xr_train,yr_train)

yr_pred=reg.predict(Xr_test)

# Evaluating the regression model
# NOTE(review): SafetyScore is computed by compute_safety_score from these
# same feature columns, so a high R² here mostly shows that the forest can
# recover that fixed rule; it is not evidence about unseen water quality.
print("R² Score:",r2_score(yr_test, yr_pred))
print("MSE:",mean_squared_error(yr_test, yr_pred))

# Side-by-side look at the first ten test rows (predictions rounded to 0.1).
pd.DataFrame({
    "Actual Score":yr_test.values,
    "Predicted Score":yr_pred.round(1)
}).head(10)


R² Score: 0.9800784650452841
MSE: 0.029630792682926833


Unnamed: 0,Actual Score,Predicted Score
0,6,6.0
1,6,6.0
2,6,6.0
3,6,5.3
4,6,6.0
5,3,3.0
6,3,3.0
7,3,3.0
8,5,5.0
9,6,6.0
