<a href="https://colab.research.google.com/github/PANDE380/AI-Coursework1/blob/main/AirQuality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Q1_air_quality_pipeline.py
# Requires: pandas, numpy, scipy, matplotlib, seaborn, scikit-learn, statsmodels
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score, roc_curve)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Load the dataset (replace path with your file).
# The file is ';'-separated, uses ',' as the decimal mark, and -200 / empty
# fields are treated as missing readings.
df = pd.read_csv('AirQualityUCI.csv', sep=';', decimal=',', na_values=[-200, ''])

# Quick inspect
print(df.dtypes)

# Catch any -200 sentinel that escaped na_values (e.g. a value that was
# parsed as the float -200.0 after decimal conversion).
df = df.replace(-200, np.nan)

# Build the timestamp index from the 'Date' and 'Time' columns.
# The time may be written as 'HH.MM.SS'; normalise it to 'HH:MM:SS' so that
# errors='coerce' does not silently turn every row into NaT.
time_str = df['Time'].astype(str).str.replace('.', ':', regex=False)
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + time_str, dayfirst=True, errors='coerce')
if df['Datetime'].isna().all():
    raise ValueError("Could not parse any timestamp from the 'Date'/'Time' columns")

# Rows without a valid timestamp cannot be resampled later; drop them and keep
# the index in chronological order.
df = df.dropna(subset=['Datetime']).set_index('Datetime').sort_index()
# Drop raw Date, Time now that they are encoded in the index
df = df.drop(columns=['Date', 'Time'])

# Missing value handling
# Entirely empty columns carry no information (NOTE(review): the UCI export
# usually has trailing empty 'Unnamed' columns -- confirm for your file).
# They must go before imputation: SimpleImputer discards all-NaN columns,
# which would make the column assignment below fail on a shape mismatch.
df = df.dropna(axis=1, how='all')

# Remove rows with >50% missing
df = df[df.isna().mean(axis=1) <= 0.5]

# The row filter may have emptied a sparse column; drop those as well.
df = df.dropna(axis=1, how='all')

# Impute numeric missing with median
num_cols = df.select_dtypes(include=[np.number]).columns
imp = SimpleImputer(strategy='median')
df[num_cols] = imp.fit_transform(df[num_cols])

# Standardize numeric features.
# NOTE: the scaler is fit on the full dataset, so every later statistic and
# model in this notebook sees z-scores rather than original units.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Now df is cleaned and standardized



In [None]:
# Summary statistics for CO, NO2 and O3 (use original units if available).
# Columns that are not present in df are skipped.
for pollutant in ('CO(GT)', 'NO2(GT)', 'O3'):
    if pollutant not in df.columns:
        continue
    values = df[pollutant].dropna()
    print(pollutant, 'mean', values.mean(), 'median', np.median(values),
          'var', values.var(), 'skew', stats.skew(values), 'kurt', stats.kurtosis(values))

# Distribution of CO with a kernel density overlay
plt.figure()
sns.histplot(df['CO(GT)'].dropna(), kde=True)
plt.title('CO histogram')

# CO split by weekday vs weekend (weekday index 5 and 6 = Saturday, Sunday)
df['weekday'] = df.index.weekday
df['is_weekend'] = df['weekday'] >= 5
plt.figure(figsize=(8, 4))
sns.boxplot(x='is_weekend', y='CO(GT)', data=df.reset_index())
plt.title('CO: Weekday vs Weekend')

# Daily mean CO as a time series
daily = df['CO(GT)'].resample('D').mean()
plt.figure(figsize=(12, 4))
daily.plot()
plt.title('Daily Mean CO')

# Pairwise correlations between the numeric columns
plt.figure(figsize=(8, 6))
sns.heatmap(df[num_cols].corr(), annot=True, fmt=".2f")
plt.title('Corr matrix')


In [None]:
# Daily aggregated features: mean / max / min of every column per calendar day
daily_df = df.resample('D').agg(['mean', 'max', 'min'])
# Flatten the (column, statistic) pairs into 'column_statistic' names
daily_df.columns = ['_'.join(col).strip() for col in daily_df.columns.values]

# Daily temperature range (only if a temperature column 'T' is available)
if 'T' in df.columns:
    temp_daily = df['T'].resample('D').agg(['max', 'min'])
    daily_df['temp_diff'] = temp_daily['max'] - temp_daily['min']

# Rolling averages (3-hour)
# Rows may have been dropped during cleaning, so three consecutive rows do not
# necessarily span three hours. Use a time-based window when the index allows
# it; min_periods=3 keeps the "full window only" behaviour of a 3-row window
# on gap-free hourly data.
if isinstance(df.index, pd.DatetimeIndex) and df.index.is_monotonic_increasing:
    df['CO_roll3h'] = df['CO(GT)'].rolling(pd.Timedelta(hours=3), min_periods=3).mean()
else:
    # Time-based windows need a sorted DatetimeIndex; fall back to a row window.
    df['CO_roll3h'] = df['CO(GT)'].rolling(window=3).mean()


In [None]:
# Welch t-test (unequal variances) on daily mean NO2: weekdays vs weekends
daily_no2 = df['NO2(GT)'].resample('D').mean().to_frame('no2')
daily_no2['is_weekend'] = daily_no2.index.weekday >= 5
weekend_mask = daily_no2['is_weekend']
w = daily_no2.loc[weekend_mask, 'no2'].dropna()
d = daily_no2.loc[~weekend_mask, 'no2'].dropna()
t_stat, p_val = stats.ttest_ind(d, w, equal_var=False)
print('t_stat', t_stat, 'p_val', p_val)

# 95% t-based confidence interval for the mean of the daily CO averages
co = df['CO(GT)'].resample('D').mean().dropna()
n_days = len(co)
m = co.mean()
se = co.std(ddof=1) / np.sqrt(n_days)  # standard error of the mean
ci = stats.t.interval(0.95, n_days - 1, loc=m, scale=se)
print('CO mean', m, '95% CI', ci)


In [None]:
# Build target on daily data: 1 when the daily mean CO exceeds its 90th percentile.
# Days without any observation have a NaN mean; drop them instead of silently
# labelling them as class 0.
co_daily = df['CO(GT)'].resample('D').mean().dropna()
threshold = np.nanpercentile(co_daily, 90)
target = (co_daily > threshold).astype(int)

# Features: use daily_df for aggregated features, align index.
# Columns derived from CO(GT) (its daily mean/max/min and any rolling CO
# aggregate) are excluded: the target is a threshold on the daily CO mean, so
# keeping them would leak the label into the features.
leaky_prefixes = ('CO(GT)', 'CO_roll')
feature_cols = [c for c in daily_df.columns if not c.startswith(leaky_prefixes)]
X = daily_df.loc[target.index, feature_cols].fillna(0)
y = target

# Train/test split stratified (keeps the ~10% positive rate in both parts)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Models
models = {
    'logreg': LogisticRegression(max_iter=1000),
    'tree': DecisionTreeClassifier(random_state=42, max_depth=5)
}

# Evaluate with 5-fold stratified CV and compute mean ± std for metrics
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def cv_metrics(model, X, y):
    """Cross-validate ``model`` and summarise classification metrics.

    The folds come from the module-level ``cv`` splitter. For every fold a
    fresh clone of ``model`` is fitted on the training part and scored on the
    validation part, so the estimator passed in is never mutated and no fitted
    state carries over between folds.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict`` (``predict_proba`` is used
        for ROC AUC when available, otherwise the hard predictions are used).
    X : pandas.DataFrame of features, indexed positionally via ``iloc``.
    y : pandas.Series of binary labels aligned with ``X``.

    Returns
    -------
    dict
        Maps ``'<metric>_mean_std'`` to a ``(mean, std)`` tuple over the folds
        for accuracy, precision, recall, f1 and roc_auc. Folds where ROC AUC
        is undefined contribute NaN and are ignored in the roc_auc summary.
    """
    from sklearn.base import clone  # local import keeps the cell self-contained

    accs, precs, recs, f1s, rocs = [], [], [], [], []
    for train_idx, val_idx in cv.split(X, y):
        fold_model = clone(model)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])

        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        preds = fold_model.predict(X_val)
        if hasattr(fold_model, 'predict_proba'):
            scores = fold_model.predict_proba(X_val)[:, 1]
        else:
            scores = preds

        accs.append(accuracy_score(y_val, preds))
        precs.append(precision_score(y_val, preds, zero_division=0))
        recs.append(recall_score(y_val, preds, zero_division=0))
        f1s.append(f1_score(y_val, preds, zero_division=0))
        try:
            rocs.append(roc_auc_score(y_val, scores))
        except ValueError:
            # ROC AUC is undefined when the validation fold holds one class only.
            rocs.append(np.nan)

    return {
        'accuracy_mean_std': (np.mean(accs), np.std(accs)),
        'precision_mean_std': (np.mean(precs), np.std(precs)),
        'recall_mean_std': (np.mean(recs), np.std(recs)),
        'f1_mean_std': (np.mean(f1s), np.std(f1s)),
        'roc_auc_mean_std': (np.nanmean(rocs), np.nanstd(rocs))
    }

# Cross-validate every candidate model and print the per-model metric summary
results = {}
for name, model in models.items():
    results[name] = cv_metrics(model, X, y)
print(results)


Please upload the `AirQualityUCI.csv` file using the cell below before running the data-loading cell.

In [None]:
# Upload helper for Google Colab: run this cell first so that
# 'AirQualityUCI.csv' is available to the cell that reads it with pd.read_csv.
# NOTE(review): google.colab is only importable inside a Colab runtime -- confirm
# before running this notebook elsewhere.
from google.colab import files
# The return value of files.upload() is kept in `uploaded`.
uploaded = files.upload()