In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
df = pd.read_csv('/kaggle/input/water-quality-data/waterquality.csv')


In [3]:
df.isna().sum()

Date                        5
Salinity (ppt)            130
DissolvedOxygen (mg/L)    851
pH                         95
SecchiDepth (m)            73
WaterDepth (m)             71
WaterTemp (C)             121
AirTemp (C)                 0
dtype: int64

In [4]:
df = df.dropna(subset=[
    'Salinity (ppt)', 'DissolvedOxygen (mg/L)', 'pH', 
    'SecchiDepth (m)', 'WaterDepth (m)', 'WaterTemp (C)', 'AirTemp (C)'
])
df.head(20)


Unnamed: 0,Date,Salinity (ppt),DissolvedOxygen (mg/L),pH,SecchiDepth (m),WaterDepth (m),WaterTemp (C),AirTemp (C)
19,1990-01-22,1.0,9.7,7.5,0.15,0.65,10.0,12.0
21,1990-02-07,3.4,10.8,7.5,0.2,0.6,10.0,10.0
22,1990-02-12,3.2,10.6,7.0,0.2,0.7,10.0,9.0
23,1990-02-20,4.2,11.6,7.5,0.15,0.39,5.0,5.0
24,1990-02-27,2.8,13.8,7.0,0.5,1.5,2.5,1.0
25,1990-03-05,2.9,11.6,7.0,0.2,0.4,7.0,8.5
26,1990-03-12,2.9,9.0,7.0,0.15,0.69,17.0,21.0
27,1990-03-19,2.8,7.8,7.0,0.22,0.92,15.0,14.0
28,1990-03-26,3.0,10.8,7.0,0.25,0.4,9.5,9.0
29,1990-04-02,1.8,9.6,8.0,0.15,0.7,5.0,15.0


In [5]:
def label_environment(row):
    if 6.5 <= row['pH'] <= 8.5 and row['DissolvedOxygen (mg/L)'] >= 5 and 0.5 <= row['Salinity (ppt)'] <= 30:
        return 'Good'
    elif (row['pH'] < 6.5 or row['pH'] > 8.5 or row['DissolvedOxygen (mg/L)'] < 3 or row['Salinity (ppt)'] < 0.5):
        return 'Poor'
    else:
        return 'Moderate'

df['EnvironmentQuality'] = df.apply(label_environment, axis=1)

In [6]:
features = ['Salinity (ppt)', 'DissolvedOxygen (mg/L)', 'pH', 'SecchiDepth (m)',
            'WaterDepth (m)', 'WaterTemp (C)', 'AirTemp (C)']

X = df[features]
y = df['EnvironmentQuality']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(" Classification", classification_report(y_test, y_pred))

Accuracy: 97.73%
 Classification               precision    recall  f1-score   support

        Good       0.99      1.00      0.99        83
    Moderate       0.69      1.00      0.81        11
        Poor       1.00      0.96      0.98       170

    accuracy                           0.98       264
   macro avg       0.89      0.99      0.93       264
weighted avg       0.98      0.98      0.98       264



In [10]:
with open('environment_quality_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [11]:
def predict_environment_quality(new_data):
    """
    Predicts environment quality from new data
    new_data should be a dict with the required 7 features
    """
    df_new = pd.DataFrame([new_data])
    with open('environment_quality_model.pkl', 'rb') as f:
        loaded_model = pickle.load(f)
    prediction = loaded_model.predict(df_new)[0]
    return prediction



sample_input = {
    'Salinity (ppt)': 5.0,
    'DissolvedOxygen (mg/L)': 6.2,
    'pH': 7.2,
    'SecchiDepth (m)': 1.5,
    'WaterDepth (m)': 3.0,
    'WaterTemp (C)': 24.0,
    'AirTemp (C)': 25.5
}

print("🐟 Environment Status:", predict_environment_quality(sample_input))

🐟 Environment Status: Good
