<a href="https://colab.research.google.com/github/Tauseef2707/Air-quality-detection-using-machine-learning/blob/main/Air_Quality_Detection_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import argparse
import json
from typing import Dict, Optional

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# --------------------------- Utility: AQI calculation (PM2.5-based simple) ---------------------------
# This is a simplified AQI calculator for PM2.5 based on US EPA breakpoints. For production use a
# full AQI calculator covering all pollutants per local standards.

PM25_BREAKPOINTS = [
    (0.0, 12.0, 0, 50),
    (12.1, 35.4, 51, 100),
    (35.5, 55.4, 101, 150),
    (55.5, 150.4, 151, 200),
    (150.5, 250.4, 201, 300),
    (250.5, 350.4, 301, 400),
    (350.5, 500.4, 401, 500),
]

def pm25_to_aqi(pm25: float) -> int:
    if np.isnan(pm25):
        return np.nan
    for (c_low, c_high, i_low, i_high) in PM25_BREAKPOINTS:
        if c_low <= pm25 <= c_high:
            aqi = ((i_high - i_low) / (c_high - c_low)) * (pm25 - c_low) + i_low
            return int(round(aqi))
    return 500

def aqi_to_category(aqi: float) -> str:
    if np.isnan(aqi):
        return 'Unknown'
    if aqi <= 50:
        return 'Good'
    if aqi <= 100:
        return 'Moderate'
    if aqi <= 150:
        return 'Unhealthy for Sensitive Groups'
    if aqi <= 200:
        return 'Unhealthy'
    if aqi <= 300:
        return 'Very Unhealthy'
    return 'Hazardous'

# --------------------------- Transformer: Datetime features ---------------------------
class DateTimeFeatures(BaseEstimator, TransformerMixin):
    """Extracts datetime features from a pandas Series or array-like of datetimes.
    Adds: hour, dayofweek, day, month, is_weekend
    """
    def __init__(self, datetime_col: str = 'Timestamp'):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X expected to be a DataFrame
        X = X.copy()
        if self.datetime_col in X.columns:
            dt = pd.to_datetime(X[self.datetime_col], errors='coerce')
            X['hour'] = dt.dt.hour
            X['dayofweek'] = dt.dt.dayofweek
            X['day'] = dt.dt.day
            X['month'] = dt.dt.month
            X['is_weekend'] = dt.dt.dayofweek.isin([5,6]).astype(int)
            X = X.drop(columns=[self.datetime_col])
        return X

# --------------------------- IO / Preprocessing / Pipeline ---------------------------

def load_data(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    return df


def prepare_dataset(df: pd.DataFrame, datetime_col: Optional[str] = 'Timestamp') -> pd.DataFrame:
    df = df.copy()
    # Standardize column names
    df.columns = [c.strip() for c in df.columns]

    # If AQI missing but PM2.5 present, compute AQI
    if 'AQI' not in df.columns and 'PM2.5' in df.columns:
        df['AQI'] = df['PM2.5'].apply(pm25_to_aqi)

    # Add AQI category if not present
    if 'AQI_Category' not in df.columns and 'AQI' in df.columns:
        df['AQI_Category'] = df['AQI'].apply(aqi_to_category)

    # Keep common pollutant and meteorological columns if present
    expected_columns = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Temperature', 'Humidity', 'WindSpeed', 'Pressure']
    available = [c for c in expected_columns if c in df.columns]

    # If timestamp exists, keep it for feature extraction
    if datetime_col and datetime_col in df.columns:
        available = [datetime_col] + available

    # Always keep AQI and category if present
    for c in ['AQI', 'AQI_Category']:
        if c in df.columns and c not in available:
            available.append(c) # Add if not already present

    return df[available]


def build_preprocessing_pipeline(df: pd.DataFrame, datetime_col: Optional[str] = 'Timestamp') -> ColumnTransformer:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # remove target columns if present
    numeric_cols = [c for c in numeric_cols if c not in ('AQI',)]

    # We'll allow datetime transformer to drop the original datetime column
    datetime_transformer = Pipeline(steps=[('dt', DateTimeFeatures(datetime_col=datetime_col))])

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # No categorical columns expected normally, but handle if present
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    categorical_cols = [c for c in categorical_cols if c not in ('AQI_Category',)] # Exclude target

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    transformers = []
    # Apply datetime transformer first if the column exists
    if datetime_col and datetime_col in df.columns:
         # The DateTimeFeatures transformer expects the dataframe as input, so we apply it to all columns initially
         transformers.append(('datetime', datetime_transformer, [datetime_col]))
         # After datetime transformation, the original numeric and categorical columns might change or new ones are added.
         # For simplicity, we will define the remaining transformations based on the original dataframe columns,
         # and the ColumnTransformer will apply them by name if they exist after the first step.
         # A more robust approach might involve a FeatureUnion or chaining transformers differently.
         # However, given the structure, applying numeric/categorical to original columns should work
         # provided the datetime transformer correctly handles/drops the timestamp col and doesn't interfere.


    # Add numeric and categorical pipelines
    # Ensure we apply these only to columns that are NOT the datetime column,
    # as the datetime transformer handles that.
    cols_for_num = [c for c in numeric_cols if c != datetime_col]
    cols_for_cat = [c for c in categorical_cols if c != datetime_col]

    if cols_for_num:
        transformers.append(('num', numeric_transformer, cols_for_num))
    if cols_for_cat:
        transformers.append(('cat', categorical_transformer, cols_for_cat))


    preproc = ColumnTransformer(transformers=transformers, remainder='passthrough') # Keep other columns if any

    return preproc

# --------------------------- Model training & evaluation ---------------------------

def train_models(df: pd.DataFrame, datetime_col: Optional[str] = 'Timestamp', output_dir: str = 'models') -> Dict[str, str]:
    os.makedirs(output_dir, exist_ok=True)

    # Ensure dataset prepared
    df = prepare_dataset(df, datetime_col=datetime_col)

    # Drop rows with no target for supervised training
    if 'AQI' not in df.columns:
        raise ValueError('No AQI column found. Provide AQI or PM2.5 to compute AQI.')

    # Select features and targets
    # Ensure we don't include target columns in features
    feature_columns = [c for c in df.columns if c not in ('AQI', 'AQI_Category')]

    if not feature_columns:
         raise ValueError("No features left after removing target columns. Check your dataset.")


    X = df[feature_columns]
    y_reg = df['AQI'].values
    y_clf = df['AQI_Category'].values if 'AQI_Category' in df.columns else np.array([aqi_to_category(v) for v in y_reg])

    # Drop rows where AQI is NaN (if any were generated from missing PM2.5 and no AQI was present)
    non_nan_indices = ~np.isnan(y_reg)
    X = X[non_nan_indices].reset_index(drop=True)
    y_reg = y_reg[non_nan_indices]
    y_clf = y_clf[non_nan_indices]


    # Check if enough samples remain after dropping NaNs
    if X.shape[0] < 2: # Need at least 2 samples for split
        raise ValueError("Not enough valid data points after removing rows with missing AQI.")


    preproc = build_preprocessing_pipeline(X, datetime_col=datetime_col)

    # Regression pipeline
    reg_pipeline = Pipeline(steps=[
        ('preproc', preproc),
        ('reg', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
    ])

    # Classification pipeline (predict AQI category)
    clf_pipeline = Pipeline(steps=[
        ('preproc', preproc),
        ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
    ])

    # Train/test split
    # Ensure stratified split for classification if possible (depends on category distribution)
    # If using shuffle=True with random_state, the split is reproducible.
    # If y_clf has very few samples per class, stratification might fail, so add error handling or simplify.
    try:
        X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
            X, y_reg, y_clf, test_size=0.2, random_state=42, shuffle=True, stratify=y_clf
        )
    except ValueError as e:
         print(f"Warning: Could not perform stratified split due to class distribution: {e}. Proceeding with non-stratified split.")
         X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
            X, y_reg, y_clf, test_size=0.2, random_state=42, shuffle=True
        )


    print('Training regression model...')
    reg_pipeline.fit(X_train, y_reg_train)
    print('Training classification model...')
    clf_pipeline.fit(X_train, y_clf_train)

    # Eval regression
    y_reg_pred = reg_pipeline.predict(X_test)
    mae = mean_absolute_error(y_reg_test, y_reg_pred)
    r2 = r2_score(y_reg_test, y_reg_pred)
    print(f'Regression MAE: {mae:.2f}, R2: {r2:.3f}')

    # Eval classification
    y_clf_pred = clf_pipeline.predict(X_test)
    acc = accuracy_score(y_clf_test, y_clf_pred)
    print(f'Classification Accuracy: {acc:.3f}')
    print('Classification report:')
    print(classification_report(y_clf_test, y_clf_pred))

    # Save models
    reg_path = os.path.join(output_dir, 'aqi_regressor.joblib')
    clf_path = os.path.join(output_dir, 'aqi_classifier.joblib')
    joblib.dump(reg_pipeline, reg_path)
    joblib.dump(clf_pipeline, clf_path)

    print(f'Models saved to {output_dir}')
    return {'regressor': reg_path, 'classifier': clf_path}

# --------------------------- Inference helpers ---------------------------


def load_models(model_dir: str = 'models') -> Dict[str, object]:
    reg_path = os.path.join(model_dir, 'aqi_regressor.joblib')
    clf_path = os.path.join(model_dir, 'aqi_classifier.joblib')
    if not os.path.exists(reg_path) or not os.path.exists(clf_path):
        raise FileNotFoundError('Model files not found. Train models first or provide correct model dir.')
    reg = joblib.load(reg_path)
    clf = joblib.load(clf_path)
    return {'regressor': reg, 'classifier': clf}


def predict_from_dict(sample: Dict, models: Dict[str, object], datetime_col: Optional[str] = 'Timestamp'):
    """Accepts a dict with keys matching training dataframe columns and returns predicted AQI and category."""
    df = pd.DataFrame([sample])
    # Ensure the dataframe has the datetime column if it was used during training
    if datetime_col and datetime_col not in df.columns:
        # If the original training data included a datetime column, the pipeline expects it.
        # If the sample doesn't have it, we need to decide how to handle it.
        # For now, we'll raise an error, but a more robust solution might impute or require the user to provide it.
        raise ValueError(f"Sample missing required datetime column: {datetime_col}")

    reg = models['regressor']
    clf = models['classifier']
    # The pipeline handles preprocessing, including datetime feature extraction
    pred_aqi = reg.predict(df)[0]
    pred_cat = clf.predict(df)[0]
    return {'predicted_aqi': float(pred_aqi), 'predicted_category': str(pred_cat)}

# --------------------------- CLI ---------------------------

def main(mode: str, data_path: Optional[str] = None, model_dir: str = 'models', input_json: Optional[str] = None):
    """Main function to run air quality detection tasks."""
    if mode == 'train':
        if not data_path:
            raise ValueError('data_path is required for training mode')
        print('Loading data...')
        df = load_data(data_path)
        print(f'Data shape: {df.shape}')
        train_models(df, datetime_col='Timestamp', output_dir=model_dir)

    elif mode == 'predict':
        if not input_json:
            raise ValueError('input_json is required for predict mode')
        models = load_models(model_dir)
        # Assume input_json is a path to a JSON file containing the sample data
        with open(input_json, 'r') as f:
            sample = json.load(f)
        # Pass datetime_col to predict_from_dict if needed for consistency
        result = predict_from_dict(sample, models, datetime_col='Timestamp')
        print('Prediction:')
        print(json.dumps(result, indent=2))


if __name__ == '__main__':
    # Example usage within Colab - replace 'path/to/your/data.csv' with your actual data file
    # To train:
    main(mode='train', data_path='/path/to/your/data.csv')

    # To predict (after training and saving models):
    # Create a sample.json file with your input data (e.g., {'Timestamp': '2023-01-01 10:00:00', 'Temperature': 25.0, ...})
    # main(mode='predict', input_json='sample.json')

In [2]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/content/city_day.csv')

# Show first 10 rows
df.head(10)


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,
5,Ahmedabad,2015-01-06,,,45.41,38.48,81.5,,45.41,45.76,46.51,5.42,10.83,1.93,,
6,Ahmedabad,2015-01-07,,,112.16,40.62,130.77,,112.16,32.28,33.47,0.0,0.0,0.0,,
7,Ahmedabad,2015-01-08,,,80.87,36.74,96.75,,80.87,38.54,31.89,0.0,0.0,0.0,,
8,Ahmedabad,2015-01-09,,,29.16,31.0,48.0,,29.16,58.68,25.75,0.0,0.0,0.0,,
9,Ahmedabad,2015-01-10,,,,7.04,0.0,,,8.29,4.55,0.0,0.0,0.0,,


In [3]:
# Remove rows where target AQI is missing
df = df.dropna(subset=['AQI'])

# Fill remaining missing values with column average
df = df.fillna(df.mean(numeric_only=True))

# Show updated info
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 24850 entries, 28 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        24850 non-null  object 
 1   Date        24850 non-null  object 
 2   PM2.5       24850 non-null  float64
 3   PM10        24850 non-null  float64
 4   NO          24850 non-null  float64
 5   NO2         24850 non-null  float64
 6   NOx         24850 non-null  float64
 7   NH3         24850 non-null  float64
 8   CO          24850 non-null  float64
 9   SO2         24850 non-null  float64
 10  O3          24850 non-null  float64
 11  Benzene     24850 non-null  float64
 12  Toluene     24850 non-null  float64
 13  Xylene      24850 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.2+ MB


In [4]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract year and month features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

df.head()


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket,Year,Month
28,Ahmedabad,2015-01-29,83.13,118.454435,6.93,28.71,33.72,23.848366,6.93,49.52,59.76,0.02,0.0,3.14,209.0,Poor,2015,1
29,Ahmedabad,2015-01-30,79.84,118.454435,13.85,28.68,41.08,23.848366,13.85,48.49,97.07,0.04,0.0,4.81,328.0,Very Poor,2015,1
30,Ahmedabad,2015-01-31,94.52,118.454435,24.39,32.66,52.61,23.848366,24.39,67.39,111.33,0.24,0.01,7.67,514.0,Severe,2015,1
31,Ahmedabad,2015-02-01,135.99,118.454435,43.48,42.08,84.57,23.848366,43.48,75.23,102.7,0.4,0.04,25.87,782.0,Severe,2015,2
32,Ahmedabad,2015-02-02,178.33,118.454435,54.56,35.31,72.8,23.848366,54.56,55.04,107.38,0.46,0.06,35.61,914.0,Severe,2015,2


In [5]:
# Select useful columns
features = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'AQI_Bucket', 'Year', 'Month']
target = 'AQI'

# Convert AQI_Bucket to numbers
df['AQI_Bucket'] = df['AQI_Bucket'].astype('category').cat.codes

X = df[features]
y = df[target]


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)


In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R2 Score:", r2)


Model Performance:
MAE: 14.599521653731914
RMSE: 32.15700077978749
R2 Score: 0.9435273057669016


In [9]:
# Example future prediction
sample_data = X_test.iloc[0:5]
predicted = model.predict(sample_data)

print("Sample Prediction:")
print(predicted)


Sample Prediction:
[122.795 222.415 120.18  119.555  78.135]


In [12]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib

model = joblib.load("air_quality_model.pkl")

st.title("Air Quality Prediction System")

PM25 = st.number_input("PM2.5")
PM10 = st.number_input("PM10")
NO = st.number_input("NO")
NO2 = st.number_input("NO2")
SO2 = st.number_input("SO2")
CO = st.number_input("CO")
O3 = st.number_input("O3")
NH3 = st.number_input("NH3")

if st.button("Predict AQI"):
    input_data = pd.DataFrame([[PM25, PM10, NO, NO2, SO2, CO, O3, NH3]],
                              columns=['PM2.5','PM10','NO','NO2','SO2','CO','O3','NH3'])
    result = model.predict(input_data)[0]
    st.success(f"Predicted AQI: {result}")


Overwriting app.py


In [14]:
!curl https://loca.lt/mytunnelpassword


^C


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

df = pd.read_csv("city_day.csv")

# Remove rows where AQI is missing
df = df.dropna(subset=["AQI"])

# Select features (use only numeric columns)
features = ["PM2.5", "PM10", "NO2", "SO2"]
df = df.dropna(subset=features)

X = df[features]
y = df["AQI"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)

joblib.dump(model, "air_quality_model.pkl")
print("✅ Model saved as air_quality_model.pkl")


✅ Model saved as air_quality_model.pkl


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

df = pd.read_csv("city_day.csv")

# Remove rows where AQI is missing
df = df.dropna(subset=["AQI"])

# Select features (use only numeric columns)
features = ["PM2.5", "PM10", "NO2", "SO2"]
df = df.dropna(subset=features)

X = df[features]
y = df["AQI"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)

joblib.dump(model, "air_quality_model.pkl")
print("✅ Model saved as air_quality_model.pkl")


✅ Model saved as air_quality_model.pkl


In [30]:
!streamlit run app.py & npx localtunnel --port 8501


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0Kyour url is: https://slow-peaches-mix.loca.lt
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.182.6:8501[0m
[0m
[34m  Stopping...[0m
^C


In [23]:
# run in a Colab cell
!curl -s https://loca.lt/mytunnelpassword || wget -q -O - https://loca.lt/mytunnelpassword


34.125.182.6

In [None]:
!jupyter nbconvert --to script Air_Quality_Detection.ipynb
