In [None]:
!pip install kagglehub --upgrade




In [None]:
import pandas as pd

df = pd.read_csv("/kaggle/input/uci-air-quality-dataset/AirQualityUCI.csv", sep=None, engine='python')

df = df.dropna(axis=1, how='all')
df.columns = df.columns.str.strip().str.replace('"', '').str.replace('\ufeff', '')

print("✅ Auto-detected columns:")
print(df.columns.tolist())


✅ Auto-detected columns:
['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']


In [None]:
# Use Benzene concentration as target (C6H6(GT))
target_col = 'C6H6(GT)'

# Drop rows with missing target values
df = df.dropna(subset=[])

# Binary classification: high or low pollution
threshold = df[target_col].median()
df['AirQuality'] = (df[target_col] > threshold).astype(int)

# Prepare X and y
X = df.drop(columns=['Date', 'Time', 'AirQuality'], errors='ignore')
y = df['AirQuality']

# Drop any remaining NaNs
X = X.dropna()
y = y.loc[X.index]

print("✅ X shape:", X.shape, " | y shape:", y.shape)


✅ X shape: (9357, 13)  | y shape: (9357,)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import time

# Define pipeline
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train
start = time.time()
lr_pipeline.fit(X_train, y_train)
lr_time = time.time() - start

# Evaluate
y_pred_lr = lr_pipeline.predict(X_test)
lr_acc = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)

print(f"🔹 Logistic Regression Accuracy: {lr_acc:.3f}")
print(f"🔹 Logistic Regression F1 Score: {lr_f1:.3f}")
print(f"🔹 Training Time: {lr_time:.3f} seconds")


🔹 Logistic Regression Accuracy: 0.989
🔹 Logistic Regression F1 Score: 0.989
🔹 Training Time: 0.034 seconds


In [None]:
from xgboost import XGBClassifier

# Define pipeline
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # kept for MLOps consistency
    ('classifier', XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss'
    ))
])

# Train
start = time.time()
xgb_pipeline.fit(X_train, y_train)
xgb_time = time.time() - start

# Evaluate
y_pred_xgb = xgb_pipeline.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

print(f"🌳 XGBoost Accuracy: {xgb_acc:.3f}")
print(f"🌳 XGBoost F1 Score: {xgb_f1:.3f}")
print(f"🌳 Training Time: {xgb_time:.3f} seconds")


🌳 XGBoost Accuracy: 1.000
🌳 XGBoost F1 Score: 1.000
🌳 Training Time: 0.087 seconds


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
import time

# Logistic Regression runtime
start_time = time.time()
_ = lr_pipeline.predict(X_test)
lr_time = time.time() - start_time

# XGBoost runtime
start_time = time.time()
_ = xgb_pipeline.predict(X_test)
xgb_time = time.time() - start_time

print(f"\nRuntime (seconds):")
print(f"LR-Pipeline: {lr_time:.5f}")
print(f"XGB-Pipeline: {xgb_time:.5f}")




Runtime (seconds):
LR-Pipeline: 0.00340
XGB-Pipeline: 0.00651


In [None]:
%pip install memory_profiler



In [None]:
import joblib, os, psutil

# Save models
joblib.dump(lr_pipeline, "lr_pipeline.joblib")
joblib.dump(xgb_pipeline, "xgb_pipeline.joblib")

# Model sizes in KB
lr_size = os.path.getsize("lr_pipeline.joblib") / 1024
xgb_size = os.path.getsize("xgb_pipeline.joblib") / 1024

# Current system memory usage
mem_usage = psutil.virtual_memory().percent

print("\n💾 Resource Summary:")
print(f"• Logistic Regression model size: {lr_size:.2f} KB")
print(f"• XGBoost model size: {xgb_size:.2f} KB")
print(f"• Current system RAM usage: {mem_usage:.2f}%")



💾 Resource Summary:
• Logistic Regression model size: 2.09 KB
• XGBoost model size: 78.67 KB
• Current system RAM usage: 10.30%


In [None]:
# ============================
# TP4 - Model Preparation Script
# Generates: lr_pipeline.pkl, xgb_pipeline.pkl
# ============================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
import pickle

# ----------------------------
# 1. Load and clean dataset
# ----------------------------
df = pd.read_csv("/kaggle/input/uci-air-quality-dataset/AirQualityUCI.csv", sep=None, engine='python')

df = df.dropna(axis=1, how='all')
df.columns = df.columns.str.strip().str.replace('"', '').str.replace('\ufeff', '')

print("✅ Auto-detected columns:")
print(df.columns.tolist())

# df = pd.read_csv(file_path, sep=";", skiprows=1, names=columns)
df = df.dropna(axis=1, how="all")  # drop empty columns
df = df.dropna()  # drop missing rows

# Clean column names
df.columns = [c.strip().replace('"', "") for c in df.columns]

print("✅ Cleaned columns:", df.columns.tolist()[:10], "...")

# ----------------------------
# 2. Select features + target
# ----------------------------
# Target = Benzene concentration (C6H6(GT))
target_col = "C6H6(GT)"
if target_col not in df.columns:
    raise ValueError(f"Column '{target_col}' not found! Check dataset structure.")

df = df[df[target_col] != -200]  # remove invalid sensor values

# Create binary target: high pollution (1) if above median
threshold = df[target_col].median()
df["target"] = (df[target_col] > threshold).astype(int)

# Select numeric sensor features
features = [
    "CO(GT)", "PT08.S1(CO)", "NMHC(GT)",
    "PT08.S2(NMHC)", "NOx(GT)", "PT08.S3(NOx)",
    "NO2(GT)", "PT08.S4(NO2)", "PT08.S5(O3)",
    "T", "RH", "AH"
]

X = df[features]
y = df["target"]

# ----------------------------
# 3. Train/test split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ----------------------------
# 4. Logistic Regression Pipeline
# ----------------------------
lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=500))
])

lr_pipeline.fit(X_train, y_train)

y_pred_lr = lr_pipeline.predict(X_test)
print("\n🔹 Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))

# Save model
with open("TP4/models/lr_pipeline.pkl", "wb") as f:
    pickle.dump(lr_pipeline, f)
print("✅ Saved: TP4/models/lr_pipeline.pkl")

# ----------------------------
# 5. XGBoost Pipeline
# ----------------------------
xgb_pipeline = Pipeline([
    ("scaler", StandardScaler()),  # for consistency in MLOps
    ("model", XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=4,
        use_label_encoder=False,
        eval_metric="logloss"
    ))
])

xgb_pipeline.fit(X_train, y_train)

y_pred_xgb = xgb_pipeline.predict(X_test)
print("\n🔹 XGBoost Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("F1 Score:", f1_score(y_test, y_pred_xgb))

# Save model
with open("TP4/models/xgb_pipeline.pkl", "wb") as f:
    pickle.dump(xgb_pipeline, f)
print("✅ Saved: TP4/models/xgb_pipeline.pkl")

# ----------------------------
# 6. Optional: Model sizes
# ----------------------------
import os
lr_size = os.path.getsize("TP4/models/lr_pipeline.pkl") / 1024
xgb_size = os.path.getsize("TP4/models/xgb_pipeline.pkl") / 1024
print(f"\n📦 Model sizes -> Logistic: {lr_size:.2f} KB | XGBoost: {xgb_size:.2f} KB")


✅ Auto-detected columns:
['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']
✅ Cleaned columns: ['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)'] ...

🔹 Logistic Regression Results:
Accuracy: 0.9872151195108394
F1 Score: 0.9871866295264624


FileNotFoundError: [Errno 2] No such file or directory: 'TP4/models/lr_pipeline.pkl'

In [None]:
# ============================================================
# 🧠 TP4 - Logistic Regression & XGBoost Pipelines
# ============================================================

# 1️⃣ Install required libraries
!pip install xgboost pandas scikit-learn

# 2️⃣ Import libraries
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# 3️⃣ Load and clean dataset
# (Assuming your file name is AirQualityUCI.csv from Kaggle)
df = pd.read_csv("/kaggle/input/uci-air-quality-dataset/AirQualityUCI.csv", sep=None, engine='python')

df = df.dropna(axis=1, how='all')
df.columns = df.columns.str.strip().str.replace('"', '').str.replace('\ufeff', '')

# Clean column names
df.columns = df.columns.str.strip().str.replace('"', '')

# Remove empty column names
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

print("✅ Cleaned Columns:")
print(df.columns.tolist())

# Drop missing target values
df = df.dropna(subset=['C6H6(GT)'])

# Binary classification target
df['PollutionLevel'] = (df['C6H6(GT)'] > df['C6H6(GT)'].median()).astype(int)

# Feature selection
features = ['PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)',
            'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']
X = df[features]
y = df['PollutionLevel']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 4️⃣ Logistic Regression Pipeline
# ============================================================
lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=500))
])

lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)

print("\n🔹 Logistic Regression Results:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_lr), 4))
print("F1 Score:", round(f1_score(y_test, y_pred_lr), 4))

# ============================================================
# 5️⃣ XGBoost Pipeline
# ============================================================
xgb_pipeline = Pipeline([
    ("model", XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        eval_metric='logloss'
    ))
])

xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)

print("\n🔹 XGBoost Results:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_xgb), 4))
print("F1 Score:", round(f1_score(y_test, y_pred_xgb), 4))

# ============================================================
# 6️⃣ Save Models
# ============================================================
os.makedirs("TP4/models", exist_ok=True)

with open("TP4/models/lr_pipeline.pkl", "wb") as f:
    pickle.dump(lr_pipeline, f)

with open("TP4/models/xgb_pipeline.pkl", "wb") as f:
    pickle.dump(xgb_pipeline, f)

print("\n✅ Models saved successfully!")
print("📂 lr_pipeline.pkl and xgb_pipeline.pkl are in TP4/models/")

# ============================================================
# 7️⃣ Resource Analysis (File size, etc.)
# ============================================================
lr_size = os.path.getsize("TP4/models/lr_pipeline.pkl") / 1024
xgb_size = os.path.getsize("TP4/models/xgb_pipeline.pkl") / 1024

print(f"\n📦 Model Sizes:")
print(f"   • Logistic Regression: {lr_size:.2f} KB")
print(f"   • XGBoost: {xgb_size:.2f} KB")

# Optional: Download models from Colab
from google.colab import files
files.download("TP4/models/lr_pipeline.pkl")
files.download("TP4/models/xgb_pipeline.pkl")


✅ Cleaned Columns:
['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']

🔹 Logistic Regression Results:
Accuracy: 0.9936
F1 Score: 0.9936

🔹 XGBoost Results:
Accuracy: 0.9995
F1 Score: 0.9995

✅ Models saved successfully!
📂 lr_pipeline.pkl and xgb_pipeline.pkl are in TP4/models/

📦 Model Sizes:
   • Logistic Regression: 1.48 KB
   • XGBoost: 87.24 KB


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>