In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

def preprocess_air_quality_data(file_path):
    """Load an air-quality CSV and derive time and PM2.5-category features.

    Steps, in order:
      1. Read the CSV and drop the 'Xylene', 'PM10' and 'City' columns.
      2. Mean-impute every float64 column. This includes 'PM2.5' itself
         whenever that column is parsed as float64.
      3. Parse 'Datetime' (unparseable values become NaT) and add
         'Year', 'Month', 'Day' and 'Hour' columns.
      4. Bin 'PM2.5' into six labelled categories in 'PM2.5_Category'.

    No rows are removed and no resampling is performed.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It must contain the columns named above.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.
    """
    # Load data
    df = pd.read_csv(file_path, low_memory=False)
    df = df.drop(columns=['Xylene', 'PM10', 'City'])

    # Handle missing values: only float64 columns are imputed, so object
    # columns keep their missing entries.
    numeric_columns = df.select_dtypes(include=['float64']).columns
    imputer = SimpleImputer(strategy='mean')
    df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

    # Process datetime and create time features
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed', errors='coerce')
    df['Year'] = df['Datetime'].dt.year
    df['Month'] = df['Datetime'].dt.month
    df['Day'] = df['Datetime'].dt.day
    df['Hour'] = df['Datetime'].dt.hour

    # Categorize PM2.5 values.
    # NOTE(review): thresholds resemble the US EPA PM2.5 breakpoints -- TODO confirm.
    bins = [0, 12, 35.4, 55.4, 150.4, 250.4, float('inf')]
    labels = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
    # Intervals are right-closed; include_lowest=True makes the first bin [0, 12]
    # so a reading of exactly 0 is labelled 'Good' instead of becoming NaN.
    df['PM2.5_Category'] = pd.cut(df['PM2.5'], bins=bins, labels=labels, include_lowest=True)

    return df


# Load and preprocess the Gurugram dataset (the only file processed here)
df = preprocess_air_quality_data('../Data/Gurugram_data.csv')

# Report the row count; the preprocessing function does not remove or undersample rows
print("Original sizes:", len(df))

Original sizes: 40258


In [2]:
# Preview the first five rows of the preprocessed frame
df.head()

Unnamed: 0,Datetime,PM2.5,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,AQI_Bucket,Year,Month,Day,Hour,PM2.5_Category
0,2015-11-27 15:00:00,114.449409,17.923547,12.5,5.85,26.679585,0.08,9.518058,17.65,1.753423,5.394773,223.877409,,2015,11,27,15,Unhealthy
1,2015-11-27 16:00:00,114.449409,4.02,10.03,10.34,26.679585,1.4,8.38,13.42,1.753423,5.394773,223.877409,,2015,11,27,16,Unhealthy
2,2015-11-27 17:00:00,114.449409,11.25,7.07,15.14,26.679585,1.75,16.69,5.42,1.753423,5.394773,223.877409,,2015,11,27,17,Unhealthy
3,2015-11-27 18:00:00,114.449409,4.55,7.06,8.44,26.679585,2.91,18.85,3.22,1.753423,5.394773,223.877409,,2015,11,27,18,Unhealthy
4,2015-11-27 19:00:00,114.449409,5.68,7.8,9.67,26.679585,4.58,21.52,5.15,1.753423,5.394773,223.877409,,2015,11,27,19,Unhealthy


In [3]:
# Summary statistics for the numeric and datetime columns
df.describe()

Unnamed: 0,Datetime,PM2.5,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,Year,Month,Day,Hour
count,40258,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0,40258.0
mean,2018-03-15 07:30:00,114.449409,17.923547,23.117616,30.032255,26.679585,1.199674,9.518058,34.747413,1.753423,5.394773,223.877409,2017.719186,6.305182,15.754682,11.501391
min,2015-11-27 15:00:00,0.05,0.01,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,31.0,2015.0,1.0,1.0,0.0
25%,2017-01-19 23:15:00,55.7225,5.77,11.12,14.96,26.679585,0.47,4.43,12.71,0.0325,2.82,135.0,2017.0,3.0,8.0,6.0
50%,2018-03-15 07:30:00,100.325,9.68,21.55,30.032255,26.679585,0.79,8.84,30.315,1.753423,5.394773,223.877409,2018.0,6.0,16.0,12.0
75%,2019-05-08 15:45:00,127.1,17.923547,25.35,30.032255,26.679585,1.199674,9.78,44.9675,1.97,5.394773,305.0,2019.0,9.0,23.0,18.0
max,2020-07-01 00:00:00,999.99,499.99,495.56,485.42,103.0,49.27,190.9,199.8,315.21,70.27,966.0,2020.0,12.0,31.0,23.0
std,,102.580994,29.830501,20.800775,29.608619,4.659342,2.569179,9.558732,28.992327,5.716766,6.860723,110.948377,1.359297,3.495001,8.810321,6.922666


In [4]:
# Column dtypes and non-null counts after preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40258 entries, 0 to 40257
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Datetime        40258 non-null  datetime64[ns]
 1   PM2.5           40258 non-null  float64       
 2   NO              40258 non-null  float64       
 3   NO2             40258 non-null  float64       
 4   NOx             40258 non-null  float64       
 5   NH3             40258 non-null  float64       
 6   CO              40258 non-null  float64       
 7   SO2             40258 non-null  float64       
 8   O3              40258 non-null  float64       
 9   Benzene         40258 non-null  float64       
 10  Toluene         40258 non-null  float64       
 11  AQI             40258 non-null  float64       
 12  AQI_Bucket      33420 non-null  object        
 13  Year            40258 non-null  int32         
 14  Month           40258 non-null  int32         
 15  Da

In [5]:
# Remaining nulls per column. Preprocessing imputes only float64 columns,
# so non-float columns such as AQI_Bucket can still contain missing values.
df.isna().sum()

Datetime             0
PM2.5                0
NO                   0
NO2                  0
NOx                  0
NH3                  0
CO                   0
SO2                  0
O3                   0
Benzene              0
Toluene              0
AQI                  0
AQI_Bucket        6838
Year                 0
Month                0
Day                  0
Hour                 0
PM2.5_Category       0
dtype: int64

In [6]:
# Work on a copy so the preprocessed `df` is not modified by the modelling steps
df_tmp = df.copy()

In [7]:
# Build the modelling frame directly from `df` instead of dropping in place,
# so re-running this cell does not raise KeyError on already-removed columns.
# AQI_Bucket and Datetime are excluded from the features; the Year/Month/Day/Hour
# columns derived from Datetime are kept.
df_tmp = df.drop(columns=['AQI_Bucket', 'Datetime'])
df_tmp.head()

Unnamed: 0,PM2.5,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,Year,Month,Day,Hour,PM2.5_Category
0,114.449409,17.923547,12.5,5.85,26.679585,0.08,9.518058,17.65,1.753423,5.394773,223.877409,2015,11,27,15,Unhealthy
1,114.449409,4.02,10.03,10.34,26.679585,1.4,8.38,13.42,1.753423,5.394773,223.877409,2015,11,27,16,Unhealthy
2,114.449409,11.25,7.07,15.14,26.679585,1.75,16.69,5.42,1.753423,5.394773,223.877409,2015,11,27,17,Unhealthy
3,114.449409,4.55,7.06,8.44,26.679585,2.91,18.85,3.22,1.753423,5.394773,223.877409,2015,11,27,18,Unhealthy
4,114.449409,5.68,7.8,9.67,26.679585,4.58,21.52,5.15,1.753423,5.394773,223.877409,2015,11,27,19,Unhealthy


In [8]:
# Make PM2.5_Category an ordered categorical so its integer codes follow the label order.
# NOTE(review): pd.cut with labels normally already returns an ordered categorical,
# so this may be redundant -- TODO confirm.
df_tmp['PM2.5_Category'] = df_tmp['PM2.5_Category'].astype("category").cat.as_ordered()

In [9]:
# Confirm dtypes after dropping columns and casting the category column
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40258 entries, 0 to 40257
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   PM2.5           40258 non-null  float64 
 1   NO              40258 non-null  float64 
 2   NO2             40258 non-null  float64 
 3   NOx             40258 non-null  float64 
 4   NH3             40258 non-null  float64 
 5   CO              40258 non-null  float64 
 6   SO2             40258 non-null  float64 
 7   O3              40258 non-null  float64 
 8   Benzene         40258 non-null  float64 
 9   Toluene         40258 non-null  float64 
 10  AQI             40258 non-null  float64 
 11  Year            40258 non-null  int32   
 12  Month           40258 non-null  int32   
 13  Day             40258 non-null  int32   
 14  Hour            40258 non-null  int32   
 15  PM2.5_Category  40258 non-null  category
dtypes: category(1), float64(11), int32(4)
memory usage: 4.0 MB

In [10]:
# Category labels in order; a label's position in this index is its integer code
df_tmp['PM2.5_Category'].cat.categories

Index(['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy',
       'Very Unhealthy', 'Hazardous'],
      dtype='object')

In [11]:
# Integer code of each row's category (0-based; pandas uses -1 for a missing category)
df_tmp['PM2.5_Category'].cat.codes

0        3
1        3
2        3
3        3
4        3
        ..
40253    2
40254    2
40255    2
40256    2
40257    3
Length: 40258, dtype: int8

In [12]:
# Fraction of missing values per column (the mean of a boolean mask equals sum / row count)
df_tmp.isnull().mean()

PM2.5             0.0
NO                0.0
NO2               0.0
NOx               0.0
NH3               0.0
CO                0.0
SO2               0.0
O3                0.0
Benzene           0.0
Toluene           0.0
AQI               0.0
Year              0.0
Month             0.0
Day               0.0
Hour              0.0
PM2.5_Category    0.0
dtype: float64

In [13]:
np.random.seed(42)

# Features are every column except the regression target PM2.5.
# NOTE(review): X still contains PM2.5_Category, which preprocessing bins directly
# from PM2.5, so the models receive a coarse copy of the target (target leakage).
# Dropping it would also require updating the later cells that encode that column.
X = df_tmp.drop("PM2.5", axis=1)
Y = df_tmp["PM2.5"]

# 80/20 split. random_state pins the shuffle to this call instead of relying on
# the global NumPy seed set above; with the same seed value it should reproduce
# the same partition.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, train_size=0.8, random_state=42)

X_train.shape, X_valid.shape

((32206, 15), (8052, 15))

In [14]:
# List the training features that pandas considers numeric
numeric_labels = [
    name for name, column in X_train.items()
    if pd.api.types.is_numeric_dtype(column)
]
for name in numeric_labels:
    print(name)


NO
NO2
NOx
NH3
CO
SO2
O3
Benzene
Toluene
AQI
Year
Month
Day
Hour


In [15]:
# Report any numeric training feature that still contains nulls
for name, column in X_train.items():
    if not pd.api.types.is_numeric_dtype(column):
        continue
    n_missing = pd.isnull(column).sum()
    if n_missing:
        print(name, "--> has", n_missing, "null values")

In [16]:
# Preview the 0-based integer codes of the category column in the training split
pd.Categorical(X_train["PM2.5_Category"]).codes

array([5, 3, 3, ..., 1, 3, 2], dtype=int8)

In [17]:
# Replace every non-numeric training column with its 1-based category code.
# pd.Categorical codes start at 0 and use -1 for missing, so +1 maps missing to 0.
categorical_labels = [
    name for name, column in X_train.items()
    if not pd.api.types.is_numeric_dtype(column)
]
for name in categorical_labels:
    X_train[name] = pd.Categorical(X_train[name]).codes + 1

In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32206 entries, 17526 to 15795
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   NO              32206 non-null  float64
 1   NO2             32206 non-null  float64
 2   NOx             32206 non-null  float64
 3   NH3             32206 non-null  float64
 4   CO              32206 non-null  float64
 5   SO2             32206 non-null  float64
 6   O3              32206 non-null  float64
 7   Benzene         32206 non-null  float64
 8   Toluene         32206 non-null  float64
 9   AQI             32206 non-null  float64
 10  Year            32206 non-null  int32  
 11  Month           32206 non-null  int32  
 12  Day             32206 non-null  int32  
 13  Hour            32206 non-null  int32  
 14  PM2.5_Category  32206 non-null  int8   
dtypes: float64(10), int32(4), int8(1)
memory usage: 3.2 MB


In [19]:
# Encode non-numeric validation columns as 1-based category codes.
# The codes line up with the training split because the column carries the
# categorical dtype (and therefore the full category list) assigned before the split.
for name in list(X_valid.columns):
    column = X_valid[name]
    if pd.api.types.is_numeric_dtype(column):
        continue
    X_valid[name] = pd.Categorical(column).codes + 1

In [20]:
# Sanity check: encoding must not change the number of rows or columns
X_train.shape, X_valid.shape

((32206, 15), (8052, 15))

In [22]:
%%time
# Let's build a machine learning model
from sklearn.ensemble import RandomForestRegressor

CPU times: total: 0 ns
Wall time: 0 ns


In [23]:
# Re-seed NumPy's global RNG before the modelling cells
np.random.seed(42)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """Return the root mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_preds : array-like
        Predicted target values.

    Returns
    -------
    float
        Square root of scikit-learn's mean_squared_log_error for the inputs.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

# Evaluate a fitted model on both the training and validation splits

def show_scores(model):
    """Score `model` on the notebook-level train/validation splits.

    Reads the global X_train, Y_train, X_valid and Y_valid.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing `predict(X)`.

    Returns
    -------
    dict
        MAE, RMSLE and R^2, each for the training and the validation split.
    """
    fitted_preds = model.predict(X_train)
    held_out_preds = model.predict(X_valid)

    report = {}
    report["Training MAE"] = mean_absolute_error(Y_train, fitted_preds)
    report["valid MAE"] = mean_absolute_error(Y_valid, held_out_preds)
    report["Training RMSLE"] = rmsle(Y_train, fitted_preds)
    report["valid RMSLE"] = rmsle(Y_valid, held_out_preds)
    report["Training R^2"] = r2_score(Y_train, fitted_preds)
    report["valid R^2"] = r2_score(Y_valid, held_out_preds)
    return report

In [24]:
# Baseline random forest with default hyperparameters; random_state fixes the forest's randomness
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [25]:
%%time
# Fit the baseline forest on the training split
model.fit(X_train, Y_train)

CPU times: total: 50 s
Wall time: 4.96 s


In [26]:
# Training vs. validation metrics for the baseline forest
show_scores(model)

{'Training MAE': 5.256153851794852,
 'valid MAE': 13.990707574042688,
 'Training RMSLE': 0.06889481232745796,
 'valid RMSLE': 0.1732676702102284,
 'Training R^2': 0.985705547467682,
 'valid R^2': 0.8970571986738166}

In [27]:
%%time

# most ideal parameter 

ideal_model = RandomForestRegressor(n_estimators=90,
                                    min_samples_leaf=18,
                                    min_samples_split=3,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)  # random state so that our reult is reproducable

# FIT THE MODEL

ideal_model.fit(X_train, Y_train)

CPU times: total: 10.8 s
Wall time: 1.14 s


In [28]:
# Training vs. validation metrics for the constrained forest; compare with the baseline scores
show_scores(ideal_model)

{'Training MAE': 14.69828186086594,
 'valid MAE': 16.218414904815983,
 'Training RMSLE': 0.18848686504084666,
 'valid RMSLE': 0.21489869669767978,
 'Training R^2': 0.8843749132682114,
 'valid R^2': 0.8658747954276853}

# Linear Regression

In [29]:
# Linear baseline fitted on the same training split used for the random forests
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,Y_train)

In [30]:
# Predict PM2.5 for the validation split with the linear model
y_pred = linreg.predict(X_valid)

In [31]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R^2 of the linear model on the training split
fold_r2 = cross_val_score(linreg, X_train, Y_train, cv=10, scoring="r2")
print('Cross_val_score', fold_r2.mean())

# Hold-out metrics from the validation predictions computed earlier
holdout_r2 = r2_score(Y_valid, y_pred)
holdout_mae = mean_absolute_error(Y_valid, y_pred)
print('r2_score', holdout_r2)
print("valid MAE ", holdout_mae)

Cross_val_score 0.5965174665497764
r2_score 0.6087792296926573
valid MAE  32.81855433445258
