In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

def preprocess_air_quality_data(file_path):
    """Load an air-quality CSV and return a cleaned, feature-enriched DataFrame.

    Steps, in order:
      1. Read the CSV and drop the 'City' and 'NH3' columns.
      2. Fill missing values in every float64 column with that column's mean.
      3. Parse 'Datetime' (unparseable entries become NaT) and derive the
         'Year', 'Month', 'Day' and 'Hour' columns from it.
      4. Bin 'PM2.5' into the categorical column 'PM2.5_Category'.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read. It must contain the columns 'City', 'NH3',
        'Datetime' and 'PM2.5'; a missing one raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame (a new object on every call).

    Notes
    -----
    * 'PM2.5' is itself mean-imputed and 'PM2.5_Category' is derived from it,
      so the category must not be used as a predictor of 'PM2.5'.
    * Column means are computed over the whole file, i.e. before any
      train/validation split the caller may perform afterwards.
    * NOTE(review): ``format='mixed'`` is presumably only understood by
      pandas >= 2.0 -- confirm against the pinned pandas version.
    """
    # Load data
    df = pd.read_csv(file_path, low_memory=False)
    df = df.drop(columns=['City', 'NH3'])

    # Handle missing values: per-column mean fill done in pandas. Columns that
    # are entirely NaN have no mean; they are left untouched instead of
    # breaking the assignment with a column-count mismatch.
    numeric_columns = df.select_dtypes(include=['float64']).columns
    if len(numeric_columns) > 0:
        column_means = df[numeric_columns].mean().dropna()
        df[numeric_columns] = df[numeric_columns].fillna(column_means)

    # Process datetime and create time features
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed', errors='coerce')
    df['Year'] = df['Datetime'].dt.year
    df['Month'] = df['Datetime'].dt.month
    df['Day'] = df['Datetime'].dt.day
    df['Hour'] = df['Datetime'].dt.hour

    # Categorize PM2.5 values. include_lowest=True closes the first interval
    # on the left so that a reading of exactly 0 is labelled 'Good' instead of
    # falling outside every bin and becoming NaN.
    bins = [0, 12, 35.4, 55.4, 150.4, 250.4, float('inf')]
    labels = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
    df['PM2.5_Category'] = pd.cut(df['PM2.5'], bins=bins, labels=labels, include_lowest=True)

    return df


# Load and preprocess the Ahmedabad dataset
df = preprocess_air_quality_data('../Data/Ahmedabad_data.csv')

# Report the number of rows in the preprocessed frame
# (no undersampling is applied, so this is the full row count)
print("Original sizes:", df.shape[0])

Original sizes: 48192


In [2]:
# Preview the first five rows. The repeated PM2.5 value 67.273271 equals the
# column mean reported by describe(), which suggests these rows were mean-imputed.
df.head()

Unnamed: 0,Datetime,PM2.5,PM10,NO,NO2,NOx,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket,Year,Month,Day,Hour,PM2.5_Category
0,2015-01-01 01:00:00,67.273271,111.493008,1.0,40.01,36.37,1.0,122.07,39.067821,0.0,0.0,0.0,453.542641,,2015,1,1,1,Unhealthy
1,2015-01-01 02:00:00,67.273271,111.493008,0.02,27.75,19.73,0.02,85.9,39.067821,0.0,0.0,0.0,453.542641,,2015,1,1,2,Unhealthy
2,2015-01-01 03:00:00,67.273271,111.493008,0.08,19.32,11.08,0.08,52.83,39.067821,0.0,0.0,0.0,453.542641,,2015,1,1,3,Unhealthy
3,2015-01-01 04:00:00,67.273271,111.493008,0.3,16.45,9.2,0.3,39.53,153.58,0.0,0.0,0.0,453.542641,,2015,1,1,4,Unhealthy
4,2015-01-01 05:00:00,67.273271,111.493008,0.12,14.9,7.85,0.12,32.63,39.067821,0.0,0.0,0.0,453.542641,,2015,1,1,5,Unhealthy


In [3]:
# Summary statistics of the numeric and datetime columns. Several quartiles
# coincide with the column mean (e.g. PM10), a visible side effect of filling
# missing values with the mean.
df.describe()

Unnamed: 0,Datetime,PM2.5,PM10,NO,NO2,NOx,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,Year,Month,Day,Hour
count,48192,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0
mean,2017-10-01 00:30:00,67.273271,111.493008,22.236505,59.4706,47.308052,21.999914,52.796652,39.067821,5.29488,27.464264,4.198443,453.542641,2017.271518,6.249626,15.723108,11.5
min,2015-01-01 01:00:00,0.34,5.65,0.01,0.01,0.0,0.01,0.01,0.02,0.0,0.0,0.0,34.0,2015.0,1.0,1.0,0.0
25%,2016-05-17 00:45:00,44.8975,111.493008,6.98,27.87,22.69,6.92,29.45,18.79,1.25,7.83,0.65,255.0,2016.0,3.0,8.0,5.75
50%,2017-10-01 00:30:00,67.273271,111.493008,21.4,59.4706,47.308052,21.355,52.796652,39.067821,4.12,27.464264,3.21,453.542641,2017.0,6.0,16.0,11.5
75%,2019-02-15 00:15:00,67.273271,111.493008,22.236505,59.4706,47.308052,21.999914,52.796652,39.067821,5.29488,32.2,4.198443,453.542641,2019.0,9.0,23.0,17.25
max,2020-07-01 00:00:00,999.99,999.99,498.57,494.15,498.61,498.57,199.96,199.83,381.58,488.53,461.39,3133.0,2020.0,12.0,31.0,23.0
std,,43.330042,23.884558,35.045166,48.802689,42.603398,35.110546,35.997898,29.746733,9.838525,31.058501,10.115803,367.804415,1.600125,3.439819,8.79654,6.922258


In [4]:
# Column dtypes and non-null counts of the preprocessed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48192 entries, 0 to 48191
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Datetime        48192 non-null  datetime64[ns]
 1   PM2.5           48192 non-null  float64       
 2   PM10            48192 non-null  float64       
 3   NO              48192 non-null  float64       
 4   NO2             48192 non-null  float64       
 5   NOx             48192 non-null  float64       
 6   CO              48192 non-null  float64       
 7   SO2             48192 non-null  float64       
 8   O3              48192 non-null  float64       
 9   Benzene         48192 non-null  float64       
 10  Toluene         48192 non-null  float64       
 11  Xylene          48192 non-null  float64       
 12  AQI             48192 non-null  float64       
 13  AQI_Bucket      30921 non-null  object        
 14  Year            48192 non-null  int32         
 15  Mo

In [5]:
# Missing values per column; only the object-typed AQI_Bucket column, which is
# not covered by the float64 imputation, still contains NaNs.
df.isna().sum()

Datetime              0
PM2.5                 0
PM10                  0
NO                    0
NO2                   0
NOx                   0
CO                    0
SO2                   0
O3                    0
Benzene               0
Toluene               0
Xylene                0
AQI                   0
AQI_Bucket        17271
Year                  0
Month                 0
Day                   0
Hour                  0
PM2.5_Category        0
dtype: int64

In [6]:
# Work on a copy so that the preprocessed frame `df` stays untouched
df_tmp = df.copy()

In [7]:
# Drop the columns that are not used for modelling: 'AQI_Bucket' still has
# missing values and 'Datetime' is already decomposed into Year/Month/Day/Hour.
# The frame is rebuilt from `df` rather than mutated in place, so this cell
# can be re-run without raising KeyError on the already-dropped columns.
df_tmp = df.drop(columns=['AQI_Bucket', 'Datetime'])
df_tmp.head()

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,Year,Month,Day,Hour,PM2.5_Category
0,67.273271,111.493008,1.0,40.01,36.37,1.0,122.07,39.067821,0.0,0.0,0.0,453.542641,2015,1,1,1,Unhealthy
1,67.273271,111.493008,0.02,27.75,19.73,0.02,85.9,39.067821,0.0,0.0,0.0,453.542641,2015,1,1,2,Unhealthy
2,67.273271,111.493008,0.08,19.32,11.08,0.08,52.83,39.067821,0.0,0.0,0.0,453.542641,2015,1,1,3,Unhealthy
3,67.273271,111.493008,0.3,16.45,9.2,0.3,39.53,153.58,0.0,0.0,0.0,453.542641,2015,1,1,4,Unhealthy
4,67.273271,111.493008,0.12,14.9,7.85,0.12,32.63,39.067821,0.0,0.0,0.0,453.542641,2015,1,1,5,Unhealthy


In [8]:
# Ensure the PM2.5 bucket column is an ordered categorical so that its integer
# codes follow the order of the bins (Good < Moderate < ... < Hazardous).
df_tmp['PM2.5_Category'] = df_tmp['PM2.5_Category'].astype("category").cat.as_ordered()

In [9]:
# Confirm the remaining columns and that PM2.5_Category has the category dtype
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48192 entries, 0 to 48191
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   PM2.5           48192 non-null  float64 
 1   PM10            48192 non-null  float64 
 2   NO              48192 non-null  float64 
 3   NO2             48192 non-null  float64 
 4   NOx             48192 non-null  float64 
 5   CO              48192 non-null  float64 
 6   SO2             48192 non-null  float64 
 7   O3              48192 non-null  float64 
 8   Benzene         48192 non-null  float64 
 9   Toluene         48192 non-null  float64 
 10  Xylene          48192 non-null  float64 
 11  AQI             48192 non-null  float64 
 12  Year            48192 non-null  int32   
 13  Month           48192 non-null  int32   
 14  Day             48192 non-null  int32   
 15  Hour            48192 non-null  int32   
 16  PM2.5_Category  48192 non-null  category
dtypes: category(

In [10]:
# List the category labels in their stored order
df_tmp['PM2.5_Category'].cat.categories

Index(['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy',
       'Very Unhealthy', 'Hazardous'],
      dtype='object')

In [11]:
# Integer code of each row's category, i.e. its position in the categories index
df_tmp['PM2.5_Category'].cat.codes

0        3
1        3
2        3
3        3
4        3
        ..
48187    2
48188    2
48189    1
48190    3
48191    2
Length: 48192, dtype: int8

In [12]:
# Fraction of missing values per column (mean of the boolean null mask)
df_tmp.isna().mean()

PM2.5             0.0
PM10              0.0
NO                0.0
NO2               0.0
NOx               0.0
CO                0.0
SO2               0.0
O3                0.0
Benzene           0.0
Toluene           0.0
Xylene            0.0
AQI               0.0
Year              0.0
Month             0.0
Day               0.0
Hour              0.0
PM2.5_Category    0.0
dtype: float64

In [13]:
np.random.seed(42)

# Features are every column except the regression target 'PM2.5'.
# NOTE(review): 'PM2.5_Category' is binned directly from 'PM2.5', so keeping it
# in X leaks the target into the features and inflates every score reported
# further down. It is left in place here only because later cells index
# X_train["PM2.5_Category"]; TODO drop it from X (and adapt those cells)
# before trusting the metrics.
X = df_tmp.drop(columns="PM2.5")
Y = df_tmp["PM2.5"]

# 80/20 train/validation split. random_state is passed explicitly so the split
# no longer depends on the state of the global NumPy RNG.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, train_size=0.8, random_state=42
)

X_train.shape, X_valid.shape

((38553, 16), (9639, 16))

In [14]:
# Print the name of every numeric feature column in the training set
numeric_labels = [
    label
    for label, dtype in X_train.dtypes.items()
    if pd.api.types.is_numeric_dtype(dtype)
]
for label in numeric_labels:
    print(label)


PM10
NO
NO2
NOx
CO
SO2
O3
Benzene
Toluene
Xylene
AQI
Year
Month
Day
Hour


In [15]:
# Report numeric training columns that still contain missing values
# (prints nothing when there are none)
missing_per_column = X_train.isnull().sum()
for label, n_missing in missing_per_column.items():
    if pd.api.types.is_numeric_dtype(X_train[label]) and n_missing:
        print(label, "--> has", n_missing, "null values")

In [16]:
# Preview the integer codes of the PM2.5 bucket for the training rows
# (the encoding loop that follows stores these codes shifted by +1)
pd.Categorical(X_train["PM2.5_Category"]).codes

array([3, 1, 3, ..., 3, 3, 1], dtype=int8)

In [17]:
# Replace every non-numeric training column with its integer category codes.
# The +1 shift moves the code used for missing values (-1) to 0.
non_numeric_labels = [
    label
    for label, dtype in X_train.dtypes.items()
    if not pd.api.types.is_numeric_dtype(dtype)
]
for label in non_numeric_labels:
    X_train[label] = pd.Categorical(X_train[label]).codes + 1

In [18]:
# Verify that every training column is numeric after the encoding step
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38553 entries, 180 to 15795
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PM10            38553 non-null  float64
 1   NO              38553 non-null  float64
 2   NO2             38553 non-null  float64
 3   NOx             38553 non-null  float64
 4   CO              38553 non-null  float64
 5   SO2             38553 non-null  float64
 6   O3              38553 non-null  float64
 7   Benzene         38553 non-null  float64
 8   Toluene         38553 non-null  float64
 9   Xylene          38553 non-null  float64
 10  AQI             38553 non-null  float64
 11  Year            38553 non-null  int32  
 12  Month           38553 non-null  int32  
 13  Day             38553 non-null  int32  
 14  Hour            38553 non-null  int32  
 15  PM2.5_Category  38553 non-null  int8   
dtypes: float64(11), int32(4), int8(1)
memory usage: 4.2 MB


In [19]:
# Apply the same integer encoding (category code + 1) to the validation set
valid_dtypes = X_valid.dtypes
for label in list(X_valid.columns):
    if pd.api.types.is_numeric_dtype(valid_dtypes[label]):
        continue
    X_valid[label] = pd.Categorical(X_valid[label]).codes + 1

In [20]:
# Sanity check: the encoding must not change the shape of either split
X_train.shape, X_valid.shape

((38553, 16), (9639, 16))

In [21]:
%%time
# Let's build a machine learning model
from sklearn.ensemble import RandomForestRegressor

CPU times: total: 0 ns
Wall time: 74.1 ms


In [22]:
np.random.seed(42)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score, mean_squared_error

def rmsle(y_test, y_preds):
    """Return the root mean squared logarithmic error.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_preds : array-like
        Predicted target values.

    Returns
    -------
    The square root of ``mean_squared_log_error(y_test, y_preds)``.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

# create a function to evaluate model on a few different levels

def show_scores(model):
    """Evaluate a fitted model on the training and validation splits.

    The splits are read from the notebook-level variables ``X_train``,
    ``Y_train``, ``X_valid`` and ``Y_valid``.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.

    Returns
    -------
    dict
        MAE, RMSE and R^2 for the training and the validation split.
    """
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)

    def _rmse(y_true, y_hat):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(y_true, y_hat))

    scores = {}
    scores["Training MAE"] = mean_absolute_error(Y_train, train_preds)
    scores["valid MAE"] = mean_absolute_error(Y_valid, val_preds)
    scores["Training RMSE"] = _rmse(Y_train, train_preds)
    scores["valid RMSE"] = _rmse(Y_valid, val_preds)
    scores["Training R^2"] = r2_score(Y_train, train_preds)
    scores["valid R^2"] = r2_score(Y_valid, val_preds)
    return scores

In [23]:
# Baseline random forest with default hyperparameters; n_jobs=-1 uses all
# available cores and random_state=42 makes the fit repeatable.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [24]:
%%time
# Fit the baseline forest on the training split (timed by the cell magic)
model.fit(X_train, Y_train)

CPU times: total: 54.3 s
Wall time: 6.72 s


In [25]:
# Training vs. validation metrics of the baseline forest; a large gap between
# the two hints at overfitting.
show_scores(model)

{'Training MAE': 2.3865479174120674,
 'valid MAE': 6.671595034477475,
 'Training RMSE': 5.75670173189663,
 'valid RMSE': 16.740012504528877,
 'Training R^2': 0.9821958947011048,
 'valid R^2': 0.85568996537681}

In [26]:
%%time

# most ideal parameter 

ideal_model = RandomForestRegressor(n_estimators=90,
                                    min_samples_leaf=18,
                                    min_samples_split=3,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)  # random state so that our reult is reproducable

# FIT THE MODEL

ideal_model.fit(X_train, Y_train)

CPU times: total: 13.2 s
Wall time: 1.56 s


In [27]:
# Same metrics for the tuned forest, for comparison with the baseline scores
show_scores(ideal_model)

{'Training MAE': 6.097120356037825,
 'valid MAE': 7.14408276881962,
 'Training RMSE': 14.214104830921338,
 'valid RMSE': 16.960165558686224,
 'Training R^2': 0.8914545243933532,
 'valid R^2': 0.8518692746431766}

# Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
# Linear-regression baseline trained on the same encoded training features
linreg = LinearRegression()
linreg.fit(X_train,Y_train)

In [29]:
# Predictions of the linear model on the validation split
y_pred = linreg.predict(X_valid)

In [30]:
from sklearn.model_selection import cross_val_score

# Mean R^2 over 10 cross-validation folds of the training split
cv_r2_scores = cross_val_score(linreg, X_train, Y_train, cv=10, scoring="r2")
print('Cross_val_score', cv_r2_scores.mean())

# Hold-out metrics computed from the validation predictions
valid_r2 = r2_score(Y_valid, y_pred)
valid_mae = mean_absolute_error(Y_valid, y_pred)
valid_rmse = np.sqrt(mean_squared_error(Y_valid, y_pred))
print('r2_score', valid_r2)
print("valid MAE ", valid_mae)
print("valid RMSE", valid_rmse)

Cross_val_score 0.586734962650266
r2_score 0.5614218692876063
valid MAE  16.646218530249527
valid RMSE 29.183071468928862
