In [1]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

In [3]:

df = pd.read_csv('water_quality.csv')

X = df.drop('Potability', axis=1)
y = df['Potability']

X = X.fillna(X.mean())

In [4]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
report_before = classification_report(y_test, y_pred, output_dict=True)


undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

mlp_balanced = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
mlp_balanced.fit(X_train_resampled, y_train_resampled)
y_pred_balanced = mlp_balanced.predict(X_test)
report_after = classification_report(y_test, y_pred_balanced, output_dict=True)


In [6]:
metrics = ['precision', 'recall', 'f1-score']

data = {
    'Metric': metrics,
    'Before Balancing': [report_before['macro avg'][metric] for metric in metrics],
    'After Balancing': [report_after['macro avg'][metric] for metric in metrics],
}

df_results = pd.DataFrame(data)
print(df_results)

for metric in metrics:
    improvement = report_after['macro avg'][metric] - report_before['macro avg'][metric]
    print(f"Improvement in {metric}: {improvement:.4f}")


      Metric  Before Balancing  After Balancing
0  precision          0.729757         0.571658
1     recall          0.634558         0.751334
2   f1-score          0.667474         0.570115
Improvement in precision: -0.1581
Improvement in recall: 0.1168
Improvement in f1-score: -0.0974
