In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score,recall_score,precision_score
from lightgbm import LGBMClassifier
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split 

from sklearn import preprocessing 
import pickle

In [2]:
import mlflow

In [37]:
mlflow.set_experiment('air pollution experiment 2')

2025/01/01 17:13:46 INFO mlflow.tracking.fluent: Experiment with name 'air pollution experiment 2' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/MLOPS/Air%20pollution/mlruns/3', creation_time=1735740827242, experiment_id='3', last_update_time=1735740827242, lifecycle_stage='active', name='air pollution experiment 2', tags={}>

In [4]:
data=pd.read_csv("updated_pollution_dataset.csv")

In [4]:
data['Air Quality'].value_counts()

Good         2000
Moderate     1500
Poor         1000
Hazardous     500
Name: Air Quality, dtype: int64

# Helper

In [17]:
def train(x_train,y_train,model):
    return model.fit(x_train,y_train)
def classification_report_test(x_test,y_test,model):
    classes=['Good','Moderate','Poor','Hazardous']
    return classification_report(y_test,model.predict(x_test),target_names=classes,output_dict=True)
def f1_score_test(x_test,y_test,model):
    return f1_score(y_test,model.predict(x_test),average='weighted')
def recall_score_test(x_test,y_test,model):
    return recall_score(y_test,model.predict(x_test),average='weighted')
def precision_score_test(x_test,y_test,model):
    return precision_score(y_test,model.predict(x_test),average='weighted')

# Encoding

In [6]:
label_encoder = preprocessing.LabelEncoder()
data['Air Quality']=label_encoder.fit_transform(data['Air Quality'])

# Train models

In [7]:
Y=data['Air Quality']
X=data.drop(['Air Quality'],axis=1)

In [8]:
data['Air Quality'].value_counts()

0    2000
2    1500
3    1000
1     500
Name: Air Quality, dtype: int64

In [10]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

# models with no ubalanced classes handling

In [11]:
xgb=train(x_train,y_train,XGBClassifier())
cb=train(x_train,y_train,CatBoostClassifier())
lgb=train(x_train,y_train,LGBMClassifier())

Learning rate set to 0.084924
0:	learn: 1.2145458	total: 185ms	remaining: 3m 5s
1:	learn: 1.0728164	total: 215ms	remaining: 1m 47s
2:	learn: 0.9727965	total: 247ms	remaining: 1m 22s
3:	learn: 0.8896061	total: 283ms	remaining: 1m 10s
4:	learn: 0.8172542	total: 307ms	remaining: 1m 1s
5:	learn: 0.7542283	total: 320ms	remaining: 53s
6:	learn: 0.6957327	total: 334ms	remaining: 47.4s
7:	learn: 0.6462734	total: 349ms	remaining: 43.2s
8:	learn: 0.6048356	total: 366ms	remaining: 40.3s
9:	learn: 0.5645679	total: 382ms	remaining: 37.8s
10:	learn: 0.5306524	total: 399ms	remaining: 35.9s
11:	learn: 0.5004249	total: 424ms	remaining: 34.9s
12:	learn: 0.4714922	total: 443ms	remaining: 33.6s
13:	learn: 0.4468509	total: 469ms	remaining: 33s
14:	learn: 0.4261716	total: 494ms	remaining: 32.4s
15:	learn: 0.4053928	total: 510ms	remaining: 31.4s
16:	learn: 0.3873400	total: 524ms	remaining: 30.3s
17:	learn: 0.3697606	total: 541ms	remaining: 29.5s
18:	learn: 0.3530689	total: 561ms	remaining: 29s
19:	learn: 0.3

In [12]:
xgb_report=classification_report_test(x_test,y_test,xgb)
cb_report=classification_report_test(x_test,y_test,cb)
lgb_report=classification_report_test(x_test,y_test,lgb)

In [13]:
xgb_f1_score=f1_score_test(x_test,y_test,xgb)
cb_f1_score=f1_score_test(x_test,y_test,cb)
lgb_f1_score=f1_score_test(x_test,y_test,lgb)

In [28]:
xgb_recall_score=recall_score_test(x_test,y_test, xgb)
cb_recall_score=recall_score_test(x_test, y_test, cb)
lgb_recall_score=recall_score_test(x_test, y_test, lgb)

In [19]:
xgb_precision_score=precision_score_test(x_test,y_test, xgb)
cb_precision_score=precision_score_test(x_test, y_test, cb)
lgb_precision_score=precision_score_test(x_test, y_test, lgb)

# Unbalanced classes

In [21]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [22]:
over = SMOTE(sampling_strategy='minority')
under = RandomUnderSampler(sampling_strategy={0:1500,1:1500,2:1000,3:1000})
X_re, y_re = over.fit_resample(X, Y)
X_re, y_re=under.fit_resample(X_re,y_re)     

In [None]:
y_re.value_counts()

1    1500
0    1500
3    1000
2    1000
Name: Air Quality, dtype: int64

In [23]:
x_train_re,x_test_re,y_train_re,y_test_re=train_test_split(X_re,y_re,test_size=0.2,random_state=42)

In [116]:
print(type(X_re),type(y_re))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [24]:
xgb_balanced=train(x_train_re,y_train_re,XGBClassifier())
cb_balanced=train(x_train_re,y_train_re,CatBoostClassifier())
lgb_balanced=train(x_train_re,y_train_re,LGBMClassifier())

Learning rate set to 0.084924
0:	learn: 1.2339907	total: 15.3ms	remaining: 15.2s
1:	learn: 1.1066588	total: 28.9ms	remaining: 14.4s
2:	learn: 1.0056598	total: 42.8ms	remaining: 14.2s
3:	learn: 0.9218081	total: 66.3ms	remaining: 16.5s
4:	learn: 0.8501102	total: 85ms	remaining: 16.9s
5:	learn: 0.7894978	total: 100ms	remaining: 16.6s
6:	learn: 0.7321816	total: 114ms	remaining: 16.2s
7:	learn: 0.6849703	total: 128ms	remaining: 15.8s
8:	learn: 0.6455938	total: 140ms	remaining: 15.4s
9:	learn: 0.6066826	total: 155ms	remaining: 15.3s
10:	learn: 0.5734815	total: 168ms	remaining: 15.1s
11:	learn: 0.5443699	total: 181ms	remaining: 14.9s
12:	learn: 0.5170336	total: 195ms	remaining: 14.8s
13:	learn: 0.4925054	total: 212ms	remaining: 14.9s
14:	learn: 0.4695709	total: 230ms	remaining: 15.1s
15:	learn: 0.4486743	total: 256ms	remaining: 15.7s
16:	learn: 0.4306142	total: 288ms	remaining: 16.6s
17:	learn: 0.4115372	total: 310ms	remaining: 16.9s
18:	learn: 0.3937986	total: 327ms	remaining: 16.9s
19:	lear

In [25]:
xgb_balanced_report=classification_report_test(x_test_re,y_test_re,xgb_balanced)
cb_balanced_report=classification_report_test(x_test_re,y_test_re,cb_balanced)
lgb_balanced_report=classification_report_test(x_test_re,y_test_re,lgb_balanced)

In [26]:
xgb_f1_score_balanced=f1_score_test(x_test_re,y_test_re,xgb_balanced)
cb_f1_score_balanced=f1_score_test(x_test_re,y_test_re,cb_balanced)
lgb_f1_score_balanced=f1_score_test(x_test_re,y_test_re,lgb_balanced)

In [32]:
xgb_recall_score_balanced=recall_score_test(x_test_re,y_test_re,xgb_balanced)
cb_recall_score_balanced=recall_score_test(x_test_re,y_test_re,cb_balanced)
lgb_recall_score_balanced=recall_score_test(x_test_re,y_test_re,lgb_balanced)

In [33]:
xgb_precision_score_balanced=precision_score_test(x_test_re,y_test_re,xgb_balanced)
cb_precision_score_balanced=precision_score_test(x_test_re,y_test_re,xgb_balanced)
lgb_precision_score_balanced=precision_score_test(x_test_re,y_test_re,xgb_balanced)

# MLFLOW

In [38]:

expermints={
    'unbalanced data':{
        'xgboost':[xgb,xgb_report['accuracy'],xgb_f1_score,xgb_recall_score,
                   xgb_precision_score],
        'catboost':[cb,cb_report['accuracy'],cb_f1_score,cb_recall_score,
                    cb_precision_score],
        'lightgbm':[lgb,lgb_report['accuracy'],lgb_f1_score,lgb_recall_score,
                    lgb_precision_score]
        }
        ,
    'balanced data': {
        'xgboost':[xgb_balanced,xgb_balanced_report['accuracy'],xgb_f1_score_balanced,xgb_recall_score_balanced,
                   xgb_precision_score_balanced],
        'catboost':[cb_balanced,cb_balanced_report['accuracy'],cb_f1_score_balanced,cb_recall_score_balanced,
                    cb_precision_score_balanced],
        'lightgbm':[lgb_balanced,lgb_balanced_report['accuracy'],lgb_f1_score_balanced,lgb_recall_score_balanced,
                    lgb_precision_score_balanced]
        }
}


In [39]:
for key,exp in expermints.items(): 
    for m in ['xgboost','catboost','lightgbm']:
            with mlflow.start_run(nested=True,run_name=m+' '+key): 
                model=exp[m][0]
                acc=exp[m][1]       
                f1_sc=exp[m][2]  
                recall=exp[m][3]
                precision=exp[m][4]     
                mlflow.log_metric('accuracy',acc)
                mlflow.log_metric('f1_score',f1_sc) 
                mlflow.log_metric('recall_score',recall)
                mlflow.log_metric('precision_score',precision)                
                mlflow.sklearn.log_model(model, m+' '+key)

In [52]:
logged_model = 'runs:/0084390d6d85478bb8edd8f8317e7c6d/xgboost unbalanced data'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(pd.DataFrame(np.array([[1,0,2,1,0,1,0,0,0]])))

array([2], dtype=int64)

In [56]:
with open('pollution.pkl', 'wb') as f:
    pickle.dump(loaded_model, f)

In [57]:
X.columns

Index(['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density'],
      dtype='object')