In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

import pickle

In [2]:
df = pd.read_csv('/kaggle/input/water-quality-data/waterquality.csv')


In [3]:
df.isna().sum()

Date                        5
Salinity (ppt)            130
DissolvedOxygen (mg/L)    851
pH                         95
SecchiDepth (m)            73
WaterDepth (m)             71
WaterTemp (C)             121
AirTemp (C)                 0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2371 entries, 0 to 2370
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    2366 non-null   object 
 1   Salinity (ppt)          2241 non-null   float64
 2   DissolvedOxygen (mg/L)  1520 non-null   float64
 3   pH                      2276 non-null   float64
 4   SecchiDepth (m)         2298 non-null   float64
 5   WaterDepth (m)          2300 non-null   float64
 6   WaterTemp (C)           2250 non-null   float64
 7   AirTemp (C)             2371 non-null   float64
dtypes: float64(7), object(1)
memory usage: 148.3+ KB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Salinity (ppt),2241.0,0.717068,1.230819,0.0,0.0,0.0,1.0,9.0
DissolvedOxygen (mg/L),1520.0,6.646263,2.506608,0.0,4.8,6.5,8.5,15.1
pH,2276.0,7.168212,0.788485,0.3,6.5,7.0,7.5,9.9
SecchiDepth (m),2298.0,0.524898,0.473663,0.0,0.3,0.4,0.65,9.0
WaterDepth (m),2300.0,0.762559,0.62114,0.01,0.4,0.65,0.95,12.0
WaterTemp (C),2250.0,18.062138,8.298246,0.0,11.0,19.0,25.0,74.0
AirTemp (C),2371.0,15.66305,10.31472,-17.777778,8.888889,16.666667,23.888889,33.5


In [6]:
df.fillna(df.mean(numeric_only=True), inplace=True)
df.isnull().sum()


Date                      5
Salinity (ppt)            0
DissolvedOxygen (mg/L)    0
pH                        0
SecchiDepth (m)           0
WaterDepth (m)            0
WaterTemp (C)             0
AirTemp (C)               0
dtype: int64

In [7]:
df.shape

(2371, 8)

In [8]:
# df = df.dropna(subset=[
#     'Salinity (ppt)', 'DissolvedOxygen (mg/L)', 'pH', 
#     'SecchiDepth (m)', 'WaterDepth (m)', 'WaterTemp (C)', 'AirTemp (C)'
# ])
# df.head(20)


In [9]:
def label_environment(row):
    if 6.5 <= row['pH'] <= 8.5 and row['DissolvedOxygen (mg/L)'] >= 5 and 0.5 <= row['Salinity (ppt)'] <= 30:
        return 'Good'
    elif (row['pH'] < 6.5 or row['pH'] > 8.5 or row['DissolvedOxygen (mg/L)'] < 3 or row['Salinity (ppt)'] < 0.5):
        return 'Poor'
    else:
        return 'Moderate'

df['EnvironmentQuality'] = df.apply(label_environment, axis=1)

In [10]:
def simulate_bacteria_level(row):
    if row['DissolvedOxygen (mg/L)'] < 3 or row['pH'] < 6.5 or row['WaterTemp (C)'] > 30:
        return 'High'
    elif row['DissolvedOxygen (mg/L)'] < 5 or row['WaterTemp (C)'] > 28:
        return 'Moderate'
    else:
        return 'Low'

df['BacteriaLevel'] = df.apply(simulate_bacteria_level, axis=1)


In [11]:
features = ['Salinity (ppt)', 'DissolvedOxygen (mg/L)', 'pH', 'SecchiDepth (m)',
            'WaterDepth (m)', 'WaterTemp (C)', 'AirTemp (C)']

X = df[features]
le_env = LabelEncoder()
le_bact = LabelEncoder()
y_env = le_env.fit_transform(df['EnvironmentQuality'])
y_bact = le_bact.fit_transform(df['BacteriaLevel'])

In [12]:
Y = np.column_stack((y_env, y_bact))

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [14]:
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_model)
model.fit(X_train, Y_train)

In [15]:
y_pred = model.predict(X_test)

# Separate multi-output columns
y_env_test = Y_test[:, 0]
y_bact_test = Y_test[:, 1]

y_env_pred = y_pred[:, 0]
y_bact_pred = y_pred[:, 1]

# Environment Quality Accuracy & Report
print("Environment Quality Prediction:")
print(f"Accuracy: {accuracy_score(y_env_test, y_env_pred) * 100:.2f}%")
print(classification_report(y_env_test, y_env_pred, target_names=le_env.classes_))

# Bacteria Level Accuracy & Report
print("Bacteria Level Prediction:")
print(f"Accuracy: {accuracy_score(y_bact_test, y_bact_pred) * 100:.2f}%")
print(classification_report(y_bact_test, y_bact_pred, target_names=le_bact.classes_))

Environment Quality Prediction:
Accuracy: 99.79%
              precision    recall  f1-score   support

        Good       1.00      1.00      1.00       148
    Moderate       0.93      1.00      0.96        13
        Poor       1.00      1.00      1.00       314

    accuracy                           1.00       475
   macro avg       0.98      1.00      0.99       475
weighted avg       1.00      1.00      1.00       475

Bacteria Level Prediction:
Accuracy: 99.79%
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        39
         Low       1.00      1.00      1.00       365
    Moderate       1.00      0.99      0.99        71

    accuracy                           1.00       475
   macro avg       1.00      1.00      1.00       475
weighted avg       1.00      1.00      1.00       475



In [16]:
with open('multioutput_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [17]:
def predict_env_and_bact_quality(new_data):
    import numpy as np
    import pandas as pd
    import pickle

    # Re-create the encoders (you can also save/load them from file)
    env_labels = ['Poor', 'Moderate', 'Good']
    bact_labels = ['Low', 'Medium', 'High']

    # Convert input dict to DataFrame
    df_input = pd.DataFrame([new_data])

    # Load the model
    with open('multioutput_model.pkl', 'rb') as f:
        loaded_model = pickle.load(f)

    # Predict
    y_pred = loaded_model.predict(df_input)[0]

    # Decode numerical predictions
    env_quality = env_labels[y_pred[0]]
    bact_level = bact_labels[y_pred[1]]

    return {
        "Environment Quality": env_quality,
        "Bacteria Level": bact_level
    }


In [18]:
sample_input = {
    'Salinity (ppt)': 9.5,
    'DissolvedOxygen (mg/L)': 3.1,
    'pH': 5.8,
    'SecchiDepth (m)': 0.7,
    'WaterDepth (m)': 1.2,
    'WaterTemp (C)': 30.5,
    'AirTemp (C)': 34.2
}


result = predict_env_and_bact_quality(sample_input)

print("🐟 Prediction Result:")
print(f" - Environment Quality: {result['Environment Quality']}")
print(f" - Bacteria Level: {result['Bacteria Level']}")


🐟 Prediction Result:
 - Environment Quality: Good
 - Bacteria Level: Low
