In [73]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be opened by path from the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [74]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score
%matplotlib inline

In [75]:
# Location of the classification dataset on the mounted Drive.
csv_path = '/content/drive/MyDrive/Final_df_classification.csv'

# Read the whole CSV into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

Unnamed: 0,powiat_voivod,DATE,CC,FG,HU,RR,SD,TG,voivodship,[animal stock] total,...,dayofweek,season,CAQI_lag1,holiday_name,is_school_holiday,is_lockdown,is_have_coalplant,rolling7_CAQI,rolling30_CAQI,rolling90_CAQI
0,"powiat aleksandrowski, kujawsko-pomorskie",2017-01-01,4.74675,0.0,88.4,1.0,0.0,1.8,kujawsko-pomorskie,44.997895,...,6,Winter,24.968064,New Year's Day,0.0,0.0,0.0,24.968064,28.884791,47.077522
1,"powiat aleksandrowski, kujawsko-pomorskie",2017-01-02,6.167555,36.8,88.4,1.0,0.0,0.5,kujawsko-pomorskie,44.997895,...,0,Winter,24.968064,Non Holiday,0.0,0.0,0.0,24.968064,28.884791,47.077522
2,"powiat aleksandrowski, kujawsko-pomorskie",2017-01-03,5.198131,0.0,88.4,63.0,0.0,0.3,kujawsko-pomorskie,44.997895,...,1,Winter,17.943745,Non Holiday,0.0,0.0,0.0,21.455904,28.884791,47.077522
3,"powiat aleksandrowski, kujawsko-pomorskie",2017-01-04,8.432231,64.2,88.4,10.0,0.0,1.2,kujawsko-pomorskie,44.997895,...,2,Winter,17.5,Non Holiday,0.0,0.0,0.0,20.13727,28.884791,47.077522
4,"powiat aleksandrowski, kujawsko-pomorskie",2017-01-05,4.992977,0.0,82.4,2.0,1.0,-3.9,kujawsko-pomorskie,44.997895,...,3,Winter,23.941667,Non Holiday,0.0,0.0,0.0,21.088369,28.884791,47.077522


In [76]:
# Summarise the frame: index range, number of columns, dtype counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361548 entries, 0 to 361547
Columns: 131 entries, powiat_voivod to rolling90_CAQI
dtypes: float64(120), int64(5), object(6)
memory usage: 361.3+ MB


In [77]:
# Show every column name as a plain Python list (the frame is too wide for head()).
list(df.columns)

['powiat_voivod',
 'DATE',
 'CC',
 'FG',
 'HU',
 'RR',
 'SD',
 'TG',
 'voivodship',
 '[animal stock] total',
 '[area by land] utilised agricultural area',
 '[area by land] forests',
 '[area by land] residential areas',
 '[area by land] industrial lands',
 '[area by land] recreational and rest areas',
 '[area by land] lands under waters',
 '[area by land] wasteland',
 '[area by land] other area',
 '[emission of particulates] fugitive',
 '[emission of particulates] fuel combustion',
 '[emission of particulates] cement/lime and refractory materials',
 '[emission of particulates] silicon',
 '[emission of particulates] chemical fertilizers',
 '[emission of particulates] surface-active agents',
 '[emission of particulates] carbon and graphite, soot',
 '[emission of pollutant gases] fugitive',
 '[emission of pollutant gases] sulphur dioxide',
 '[emission of pollutant gases] nitrogen oxides',
 '[emission of pollutant gases] carbon monoxide',
 '[emission of pollutant gases] carbon dioxide',
 '[

In [78]:
# Collapse every run of characters other than ASCII letters and digits in a
# column name into a single underscore (e.g. '[area by land] forests' becomes
# '_area_by_land_forests').
non_alnum_run = re.compile('[^0-9a-zA-Z]+')
df.columns = [non_alnum_run.sub('_', name) for name in df.columns]

The use of special JSON characters in column names is not supported by LightGBM, so it's important to remove them.

In [79]:
# Parse the DATE strings into pandas datetimes so the column can be compared
# against date bounds when the data is split chronologically.
parsed_dates = pd.to_datetime(df['DATE'])
df['DATE'] = parsed_dates

In [80]:
# Turn the CAQI_level category labels into integer class codes.
# The fitted encoder stays available as `le`, so the label <-> code mapping
# can be inspected afterwards through `le.classes_`.
le = LabelEncoder()
le.fit(df['CAQI_level'])
df['CAQI_level_encoded'] = le.transform(df['CAQI_level'])

In [81]:
# Print which integer code the encoder assigned to each category label.
# A label's code is its position in `le.classes_`.
print('Label encoding mapping:')
code_by_label = {label: code for code, label in enumerate(le.classes_)}
for label, code in code_by_label.items():
    print(f'{label} -> {code}')

Label encoding mapping:
high -> 0
low -> 1
medium -> 2
vhigh -> 3
vlow -> 4


In [82]:
# Remove two columns in a single step:
#   - 'CAQI_level'   : the raw string target, superseded by 'CAQI_level_encoded'
#   - 'holiday_name' : dropped because it throws an error when encoding
df = df.drop(columns=['CAQI_level', 'holiday_name'])

In [83]:
# Chronological split into train / validation / test, selected by DATE.
# Both bounds of every window are inclusive.
# NOTE(review): 2020-06-01 and 2021-03-01 fall into none of the three windows,
# so rows for those two days are left out of all sets — confirm that the
# one-day gaps between the windows are intentional.
dates = df['DATE']
train_data = df[dates.between('2017-01-01', '2020-05-31')]
val_data = df[dates.between('2020-06-02', '2021-02-28')]
test_data = df[dates.between('2021-03-02', '2021-12-31')]

In [84]:
# DATE was only needed to define the split windows; remove it from every
# split so it is not passed to the model as a feature.
train_data, val_data, test_data = (
    split.drop(columns=['DATE'])
    for split in (train_data, val_data, test_data)
)

In [85]:
# Integer-encode the remaining categorical (string) feature columns.
# Each encoder is fitted on the training split only and then reused for the
# validation and test splits, so all three share one label -> integer mapping.
# `transform` raises ValueError for a label that was not seen during fitting.
#
# The per-column encoders are kept in `feature_encoders` under their own name
# instead of rebinding `le`: `le` holds the encoder fitted on the CAQI_level
# target, and overwriting it here would make `le.classes_` /
# `le.inverse_transform` silently refer to the 'season' column afterwards.
cat_cols = ['powiat_voivod', 'voivodship', 'season']
feature_encoders = {}
for col in cat_cols:
    col_encoder = LabelEncoder()
    train_data[col] = col_encoder.fit_transform(train_data[col])
    val_data[col] = col_encoder.transform(val_data[col])
    test_data[col] = col_encoder.transform(test_data[col])
    feature_encoders[col] = col_encoder

In [86]:
# Separate each split into its feature matrix (every column except the
# encoded target) and its target vector.
target_col = 'CAQI_level_encoded'

X_train, y_train = train_data.drop(columns=[target_col]), train_data[target_col]
X_val, y_val = val_data.drop(columns=[target_col]), val_data[target_col]
X_test, y_test = test_data.drop(columns=[target_col]), test_data[target_col]

LightGBM is a gradient boosting framework based on decision trees.

In [87]:
from lightgbm import LGBMClassifier
# Create the classifier; no arguments are passed, so the library's default
# hyperparameters are used.
lgbm = LGBMClassifier()

# Train on the training split only; the validation and test splits are not
# passed to fit() and are used purely for evaluation in the cells below.
lgbm.fit(X_train, y_train)

In [88]:
# Predict a class code for every validation row with the fitted model.
y_pred = lgbm.predict(X_val)

# Share of validation rows whose predicted class equals the true class.
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.7416258169934641


In [89]:
# Per-class precision / recall / F1 / support on the validation split,
# printed below a one-line heading.
val_report = classification_report(y_val, y_pred)
print("Validation set Classification report:", val_report, sep="\n")

Validation set Classification report:
              precision    recall  f1-score   support

           0       0.45      0.22      0.29      1023
           1       0.66      0.65      0.66     18611
           2       0.50      0.48      0.49      4671
           3       0.06      0.12      0.08        34
           4       0.83      0.86      0.84     29517

    accuracy                           0.74     53856
   macro avg       0.50      0.46      0.47     53856
weighted avg       0.74      0.74      0.74     53856



---
- The model achieved the highest precision, recall, and F1-score for class 4, with values of 0.83, 0.86, and 0.84, respectively. This indicates that the model performed well in correctly identifying class 4 instances.
- Class 1 also had a relatively high precision, recall, and F1-score of 0.66, 0.65, and 0.66, respectively, indicating that the model performed reasonably well in identifying class 1 instances.
- The model did not perform as well on the other classes, with precision, recall, and F1-score values ranging from 0.06 to 0.50. For example, the precision, recall, and F1-score for class 0 were 0.45, 0.22, and 0.29, respectively, indicating that the model had difficulty identifying class 0 instances.
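- The macro averages of precision, recall, and F1-score are 0.50, 0.46, and 0.47, respectively. Because the macro average weights every class equally, these values are pulled well below the weighted averages (0.74) by the weak results on the minority classes 0, 2, and 3.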
- The overall accuracy of the model on this dataset is 0.74, meaning that the model correctly classified 74% of the instances in the dataset.


---



Label encoding mapping:
high -> 0, 
low -> 1, 
medium -> 2, 
vhigh -> 3, 
vlow -> 4

In [90]:
# Predict a class code for every test row with the fitted model.
y_pred_test = lgbm.predict(X_test)

# Share of test rows whose predicted class equals the true class.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.7540818016227853


In [91]:
# Per-class precision / recall / F1 / support on the test split,
# printed below a one-line heading.
test_report = classification_report(y_test, y_pred_test)
print("Test set Classification report:", test_report, sep="\n")

Test set Classification report:
              precision    recall  f1-score   support

           0       0.34      0.20      0.25       435
           1       0.69      0.75      0.72     24346
           2       0.55      0.39      0.45      3683
           3       0.00      0.00      0.00        13
           4       0.83      0.81      0.82     31913

    accuracy                           0.75     60390
   macro avg       0.48      0.43      0.45     60390
weighted avg       0.75      0.75      0.75     60390



---
- The model performs relatively well for classes 1 and 4: class 4 has precision, recall, and F1-score of 0.83, 0.81, and 0.82, and class 1 has 0.69, 0.75, and 0.72.
- Class 0 has a low precision and recall value, indicating that the model struggles to correctly identify instances of this class
- The macro average F1-score is 0.45, indicating that the model struggles to perform well across all classes.
- Class 2 has a lower F1-score, which suggests that the model has some difficulty identifying instances of this class correctly.
- The overall accuracy of the model is 0.75, which means that it correctly predicted the class of 75% of instances in the test set.

---

- The F1 score is the harmonic mean of precision and recall, $F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$; its best value is 1.0 and its worst is 0.0.

## **Cross Validation:**

In [92]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# 5-fold stratified splitter: rows are shuffled (fixed seed 42 for repeatable
# folds) before being assigned to folds.
# NOTE(review): because the folds are shuffled, rows from later dates can end
# up in the training part of a fold while earlier dates are scored; together
# with the lag/rolling CAQI features this may make the CV accuracy optimistic
# — confirm this is acceptable for the analysis.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score the classifier by accuracy on each fold, using the training split
# only, so the validation and test splits stay untouched.
cv_scores = cross_val_score(
    estimator=lgbm,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=kfold,
)

# Report the per-fold accuracies and their mean.
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Cross-validation scores: [0.75199465 0.75306292 0.75877362 0.75049108 0.75543225]
Mean CV accuracy: 0.753950904215573


---
- For this cross-validation I used stratified k-fold (`StratifiedKFold`) with 5 shuffled splits, so each fold keeps approximately the same class proportions as the full training set.
- We split the data into 5 folds, then trained the model on 4 folds and evaluated it on the 5th fold, and repeated this process for all the 5 folds.
---

- The cross-validation scores range from **0.75199465** to **0.75877362**, and the mean CV accuracy is **0.753950904215573**. 
- This means that the model is performing consistently across all the folds and the estimated generalization performance is around 75.4%.
