In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import re

# Load the raw water-quality measurements.
file_path = 'adjusted_water_quality_data_v2.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Columns that are scaled and KNN-imputed below.
imputation_columns = ['pH', 'TDS', 'Cl', 'SO4', 'NO3', 'TH', 'Ca', 'Mg', 'Na', 'K', 'F']

# Columns singled out for explicit numeric conversion (name kept so that any
# later code referring to it keeps working).
columns_to_convert = ['pH', 'SO4', 'NO3', 'Mg', 'K', 'F']

# Coerce every column that is fed to the scaler/imputer, not just the subset
# above: a non-numeric entry in any of them would otherwise make the scaler
# raise. Unparseable entries become NaN and are filled in by the imputer;
# columns that are already numeric pass through unchanged.
for col in dict.fromkeys(columns_to_convert + imputation_columns):
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Scale to a common range before imputing so that the neighbour search is not
# dominated by the columns with the largest magnitudes.
# NOTE(review): scaler and imputer are fitted on the full dataset, i.e. before
# any train/test split — confirm this is acceptable for the evaluation.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data[imputation_columns])

# Fill missing values from the 5 nearest rows (in scaled space).
knn_imputer = KNNImputer(n_neighbors=5)
data_imputed_scaled = knn_imputer.fit_transform(data_scaled)

# Undo the scaling so the values are back in their original units, and reuse
# the index of `data` so the assignment below lines up row by row even when
# `data` does not have a default 0..n-1 index.
data_imputed = pd.DataFrame(
    scaler.inverse_transform(data_imputed_scaled),
    columns=imputation_columns,
    index=data.index,
)
data[imputation_columns] = data_imputed

# Function to reclassify water quality
def reclassify_water_quality(row):
    """Grade one sample by how many parameters exceed their limit.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            'pH', 'TDS', 'Cl', 'SO4', 'NO3', 'TH', 'Ca' and 'Mg'.

    Returns:
        'Excellent' for 0 exceedances, 'Good' for 1-2, 'Moderate' for 3-4,
        'Poor' for 5-6 and 'Worst' for 7 or more.
    """
    # Parameters that are out of range only when strictly above the limit.
    upper_limits = (
        ('TDS', 1000),
        ('Cl', 250),
        ('SO4', 250),
        ('NO3', 50),
        ('TH', 500),
        ('Ca', 75),
        ('Mg', 50),
    )

    # pH is the only two-sided check: acceptable range is 6.5 to 8.5 inclusive.
    ph = row['pH']
    violations = 1 if (ph < 6.5 or ph > 8.5) else 0

    for column, limit in upper_limits:
        if row[column] > limit:
            violations += 1

    # First tier whose ceiling covers the violation count wins.
    tiers = ((0, 'Excellent'), (2, 'Good'), (4, 'Moderate'), (6, 'Poor'))
    for ceiling, grade in tiers:
        if violations <= ceiling:
            return grade
    return 'Worst'

# Label every sample with its rule-based quality grade.
data['Water_Quality_New'] = data.apply(reclassify_water_quality, axis=1)

# Model inputs and target.
# NOTE(review): 'SO4' is used by the labelling rule but is not among the
# features, so the model cannot see that criterion — confirm this is intended.
features = ['pH', 'TDS', 'Cl', 'TH', 'NO3', 'Ca', 'Mg']
X, y = data[features], data['Water_Quality_New']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with default hyper-parameters (seeded for repeatability).
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report: raw predictions first, then the summary metrics.
print(y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

['Excellent' 'Excellent' 'Excellent' ... 'Excellent' 'Good' 'Excellent']
Accuracy: 0.9599141016463851
Classification Report:
               precision    recall  f1-score   support

   Excellent       1.00      1.00      1.00       749
        Good       0.95      0.98      0.97       411
    Moderate       0.82      0.77      0.79       121
        Poor       0.89      0.84      0.86       101
       Worst       0.81      0.87      0.84        15

    accuracy                           0.96      1397
   macro avg       0.89      0.89      0.89      1397
weighted avg       0.96      0.96      0.96      1397

