<a href="https://colab.research.google.com/github/Pranaiu/EduNet-Internship-Aiml/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# pip install pandas numpy matplotlib seaborn scikit-learn joblib

import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ---------------------------------------------------------------------------
# Train one random-forest regressor per pollutant from (station id, year),
# report test-set metrics, run a sample prediction, and persist the model
# together with the exact feature-column layout it was trained on.
# ---------------------------------------------------------------------------

# File locations gathered in one place so they are easy to adjust.
DATA_PATH = '/content/afa2e701598d20110228.csv'
MODEL_PATH = 'pollution_model.pkl'
COLUMNS_PATH = 'model_columns.pkl'

# Load dataset (semicolon-separated)
df = pd.read_csv(DATA_PATH, sep=';')

# Convert date to datetime (source format is day.month.year)
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

# Sort by id and date
df = df.sort_values(by=['id', 'date'])

# Extract year and month (only 'year' is used as a feature below)
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Drop rows with missing target pollutant values
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
df = df.dropna(subset=pollutants)

# Define features and target
X = df[['id', 'year']]
y = df[pollutants]

# Encode station ID using one-hot encoding.
# drop_first=True removes the dummy column of the first station, so that
# station is represented by an all-zero set of id_* columns.
X_encoded = pd.get_dummies(X, columns=['id'], drop_first=True)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Train the model (MultiOutputRegressor fits one forest per target column)
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Model Performance on the Test Data:")
for i, pollutant in enumerate(pollutants):
    print(f'{pollutant}:')
    print('   MSE:', mean_squared_error(y_test.iloc[:, i], y_pred[:, i]))
    print('   R2:', r2_score(y_test.iloc[:, i], y_pred[:, i]))
    print()

# Prediction for a new input
station_id = '22'
year_input = 2024

# An id that never appeared in training would end up with every id_* column
# set to 0 after alignment, which is exactly how the dropped first station is
# encoded. Fail loudly instead of silently predicting for the wrong station.
known_stations = set(df['id'].astype(str))
if str(station_id) not in known_stations:
    raise ValueError(
        f"Unknown station id '{station_id}'; it does not occur in the training data."
    )

input_data = pd.DataFrame({'year': [year_input], 'id': [station_id]})
input_encoded = pd.get_dummies(input_data, columns=['id'])

# Align with the training layout in one step: columns the input lacks are
# created and filled with 0, and the result follows the training column order.
input_encoded = input_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Predict
predicted_pollutants = model.predict(input_encoded)[0]
print(f"\nPredicted pollutant levels for station '{station_id}' in {year_input}:")
for p, val in zip(pollutants, predicted_pollutants):
    print(f"  {p}: {val:.2f}")

# Save model and feature columns (the column list is needed at inference
# time to rebuild the same one-hot layout)
joblib.dump(model, MODEL_PATH)
joblib.dump(X_encoded.columns.tolist(), COLUMNS_PATH)
print('\nModel and column structure saved!')


Model Performance on the Test Data:
O2:
   MSE: 22.21825046040189
   R2: -0.01674257045728833

NO3:
   MSE: 18.153106746365886
   R2: 0.5162032171220556

NO2:
   MSE: 10.607352172601502
   R2: -78.42066512350873

SO4:
   MSE: 2412.139350033052
   R2: 0.4118345603876148

PO4:
   MSE: 0.38496938017964155
   R2: 0.3221189891402043

CL:
   MSE: 34882.81433245622
   R2: 0.7357918194149974


Predicted pollutant levels for station '22' in 2024:
  O2: 12.60
  NO3: 6.90
  NO2: 0.13
  SO4: 143.08
  PO4: 0.50
  CL: 67.33

Model and column structure saved!


In [3]:
%%writefile app.py
import pandas as pd
import numpy as np
import joblib
import streamlit as st

# Title and Description
st.title("💧 Water Pollutants Predictor")
st.write("📊 Predict common water pollutant levels based on Year and Station ID using a pre-trained machine learning model.")

# Load the trained model and feature structure.
# These must be the joblib artifacts produced by training (the fitted model
# and the list of one-hot feature columns), not the raw CSV dataset.
# Paths are relative to the directory the app is started from.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
model = joblib.load("pollution_model.pkl")
model_cols = joblib.load("model_columns.pkl")

# List of pollutant labels, in the order the model outputs them
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']

# User Input Section
st.sidebar.header("📥 Input Parameters")
year_input = st.sidebar.number_input("Enter Year", min_value=2000, max_value=2100, value=2024)
station_id = st.sidebar.text_input("Enter Station ID (e.g. '1', '5', '22')", value='1')

# Predict Button
if st.sidebar.button('🔍 Predict'):
    # Normalise once so stray whitespace cannot produce a dummy column name
    # (e.g. 'id_ 22') that fails to match any training column.
    station_id = station_id.strip()

    if not station_id:
        st.warning('⚠️ Please enter a valid Station ID')
    else:
        # Step 1: Prepare Input Data
        input_df = pd.DataFrame({'year': [year_input], 'id': [station_id]})
        input_encoded = pd.get_dummies(input_df, columns=['id'])

        # Step 2: Align with model's training columns; columns the input
        # lacks are filled with 0 and the training column order is restored.
        input_encoded = input_encoded.reindex(columns=model_cols, fill_value=0)

        # Step 3: Predict pollutant levels (single input row -> first result)
        predicted_pollutants = model.predict(input_encoded)[0]

        # Step 4: Display results
        st.subheader(f"📡 Predicted pollutant levels for Station ID '{station_id}' in {year_input}:")
        # Cast to plain float so the values serialise cleanly as JSON.
        result_dict = {p: round(float(val), 2) for p, val in zip(pollutants, predicted_pollutants)}
        st.json(result_dict)


Overwriting app.py
