In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

def preprocess_air_quality_data(file_path):
    """Load an air-quality CSV and return a cleaned frame with time and PM2.5-band features.

    Steps, in order:
      1. Read the CSV and drop the 'Xylene', 'PM10' and 'City' columns.
      2. Fill missing values in every float64 column with that column's mean.
         A column with no valid values at all has no mean and is left as NaN.
      3. Parse 'Datetime' (unparseable values become NaT) and derive
         'Year', 'Month', 'Day' and 'Hour' from it.
      4. Bin 'PM2.5' into six labelled bands stored in 'PM2.5_Category'.

    No rows are removed or resampled.

    Caveats for modelling:
      * 'PM2.5' is itself a float64 column, so its gaps are mean-filled
        before the bands are computed.
      * The means are computed over the whole file, i.e. before any
        train/validation split is made.
      * 'PM2.5_Category' is a direct function of 'PM2.5'.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing at least the columns 'Xylene', 'PM10', 'City',
        'Datetime' and 'PM2.5'.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the added 'Year', 'Month', 'Day', 'Hour' and
        'PM2.5_Category' columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    # Load data (non-mutating drop keeps the function free of in-place edits)
    df = pd.read_csv(file_path, low_memory=False).drop(columns=['Xylene', 'PM10', 'City'])

    # Handle missing values: per-column mean for float columns only.
    # Columns whose mean is undefined (all NaN) are skipped rather than failing.
    numeric_columns = df.select_dtypes(include=['float64']).columns
    column_means = df[numeric_columns].mean().dropna()
    if not column_means.empty:
        df = df.fillna(column_means)

    # Process datetime and create time features
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed', errors='coerce')
    df['Year'] = df['Datetime'].dt.year
    df['Month'] = df['Datetime'].dt.month
    df['Day'] = df['Datetime'].dt.day
    df['Hour'] = df['Datetime'].dt.hour

    # Categorize PM2.5 values. include_lowest=True makes the first band cover
    # a reading of exactly 0, which would otherwise fall outside every bin.
    bins = [0, 12, 35.4, 55.4, 150.4, 250.4, float('inf')]
    labels = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
    df['PM2.5_Category'] = pd.cut(df['PM2.5'], bins=bins, labels=labels, include_lowest=True)

    return df


# Load and preprocess the Chennai air-quality dataset
df = preprocess_air_quality_data('../Data/Chennai_data.csv')

# Report the number of rows in the preprocessed frame
print("Original sizes:", len(df))

Original sizes: 48192


In [7]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,Datetime,PM2.5,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,AQI_Bucket,Year,Month,Day,Hour,PM2.5_Category
0,2015-01-01 01:00:00,50.201152,41.56,47.97,57.25,6.36,0.92,7.19,7.65,0.11,2.047557,114.113485,,2015,1,1,1,Unhealthy for Sensitive
1,2015-01-01 02:00:00,50.201152,15.29,12.31,20.94,4.93,0.77,8.03,10.96,0.17,2.047557,114.113485,,2015,1,1,2,Unhealthy for Sensitive
2,2015-01-01 03:00:00,50.201152,14.49,13.72,19.29,3.92,1.06,10.53,12.14,0.19,2.047557,114.113485,,2015,1,1,3,Unhealthy for Sensitive
3,2015-01-01 04:00:00,50.201152,14.23,14.45,18.89,4.54,1.54,10.04,11.95,0.18,2.047557,114.113485,,2015,1,1,4,Unhealthy for Sensitive
4,2015-01-01 05:00:00,50.201152,13.62,20.16,18.4,4.85,1.03,9.19,11.64,0.18,2.047557,114.113485,,2015,1,1,5,Unhealthy for Sensitive


In [8]:
# Summary statistics for the numeric and datetime columns.
# NOTE(review): the recorded output shows a PM2.5 maximum of 999.99, which looks
# like a sensor cap or sentinel value - confirm before treating it as a real reading.
df.describe()

Unnamed: 0,Datetime,PM2.5,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,Year,Month,Day,Hour
count,48192,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0,48192.0
mean,2017-10-01 00:30:00,50.201152,9.205325,16.822431,17.697846,62.957465,1.065728,7.881015,32.449558,0.97098,2.047557,114.113485,2017.271518,6.249626,15.723108,11.5
min,2015-01-01 01:00:00,0.02,0.01,0.05,0.0,0.01,0.0,0.03,0.01,0.0,0.0,21.0,2015.0,1.0,1.0,0.0
25%,2016-05-17 00:45:00,28.84,4.94,11.14,11.87,27.82,0.52,4.18,15.34,0.0,0.0,77.0,2016.0,3.0,8.0,5.75
50%,2017-10-01 00:30:00,44.78,7.23,15.08,16.35,62.957465,0.81,6.2,27.56,0.22,0.85,101.0,2017.0,6.0,16.0,11.5
75%,2019-02-15 00:15:00,59.1,10.09,19.52,20.56,62.957465,1.08,9.05,42.4725,0.97098,2.047557,131.0,2019.0,9.0,23.0,17.25
max,2020-07-01 00:00:00,999.99,202.49,472.53,256.47,499.97,46.8,141.26,198.37,391.88,265.98,490.0,2020.0,12.0,31.0,23.0
std,,41.799111,9.89808,11.068333,9.938286,57.492055,2.02096,7.112794,23.60223,4.649722,4.889071,55.526992,1.600125,3.439819,8.79654,6.922258


In [9]:
# Column dtypes and non-null counts after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48192 entries, 0 to 48191
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Datetime        48192 non-null  datetime64[ns]
 1   PM2.5           48192 non-null  float64       
 2   NO              48192 non-null  float64       
 3   NO2             48192 non-null  float64       
 4   NOx             48192 non-null  float64       
 5   NH3             48192 non-null  float64       
 6   CO              48192 non-null  float64       
 7   SO2             48192 non-null  float64       
 8   O3              48192 non-null  float64       
 9   Benzene         48192 non-null  float64       
 10  Toluene         48192 non-null  float64       
 11  AQI             48192 non-null  float64       
 12  AQI_Bucket      44649 non-null  object        
 13  Year            48192 non-null  int32         
 14  Month           48192 non-null  int32         
 15  Da

In [10]:
# Missing values per column. The float64 columns were mean-filled in
# preprocess_air_quality_data, so any remaining gaps are in other columns.
df.isna().sum()

Datetime             0
PM2.5                0
NO                   0
NO2                  0
NOx                  0
NH3                  0
CO                   0
SO2                  0
O3                   0
Benzene              0
Toluene              0
AQI                  0
AQI_Bucket        3543
Year                 0
Month                0
Day                  0
Hour                 0
PM2.5_Category       0
dtype: int64

In [11]:
# Work on a copy so the modelling steps below do not modify `df`.
df_tmp = df.copy()

In [12]:
# Build the modelling frame from `df` instead of dropping in place: an in-place
# drop raises KeyError when this cell is re-run, because the columns are already gone.
df_tmp = df.drop(columns=['AQI_Bucket', 'Datetime'])
df_tmp.head()

Unnamed: 0,PM2.5,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,Year,Month,Day,Hour,PM2.5_Category
0,50.201152,41.56,47.97,57.25,6.36,0.92,7.19,7.65,0.11,2.047557,114.113485,2015,1,1,1,Unhealthy for Sensitive
1,50.201152,15.29,12.31,20.94,4.93,0.77,8.03,10.96,0.17,2.047557,114.113485,2015,1,1,2,Unhealthy for Sensitive
2,50.201152,14.49,13.72,19.29,3.92,1.06,10.53,12.14,0.19,2.047557,114.113485,2015,1,1,3,Unhealthy for Sensitive
3,50.201152,14.23,14.45,18.89,4.54,1.54,10.04,11.95,0.18,2.047557,114.113485,2015,1,1,4,Unhealthy for Sensitive
4,50.201152,13.62,20.16,18.4,4.85,1.03,9.19,11.64,0.18,2.047557,114.113485,2015,1,1,5,Unhealthy for Sensitive


In [13]:
# Make sure the PM2.5 band column is an ordered categorical.
# NOTE(review): the column comes from pd.cut with explicit labels, which presumably
# already yields an ordered categorical, so this may be a no-op - confirm.
df_tmp['PM2.5_Category'] = df_tmp['PM2.5_Category'].astype("category").cat.as_ordered()

In [14]:
# Column dtypes of the modelling frame after the category conversion.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48192 entries, 0 to 48191
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   PM2.5           48192 non-null  float64 
 1   NO              48192 non-null  float64 
 2   NO2             48192 non-null  float64 
 3   NOx             48192 non-null  float64 
 4   NH3             48192 non-null  float64 
 5   CO              48192 non-null  float64 
 6   SO2             48192 non-null  float64 
 7   O3              48192 non-null  float64 
 8   Benzene         48192 non-null  float64 
 9   Toluene         48192 non-null  float64 
 10  AQI             48192 non-null  float64 
 11  Year            48192 non-null  int32   
 12  Month           48192 non-null  int32   
 13  Day             48192 non-null  int32   
 14  Hour            48192 non-null  int32   
 15  PM2.5_Category  48192 non-null  category
dtypes: category(1), float64(11), int32(4)
memory usage: 4.8 MB

In [15]:
# List the category labels of the PM2.5 band column.
df_tmp['PM2.5_Category'].cat.categories

Index(['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy',
       'Very Unhealthy', 'Hazardous'],
      dtype='object')

In [16]:
# Integer code assigned to each row's PM2.5 band.
df_tmp['PM2.5_Category'].cat.codes

0        2
1        2
2        2
3        2
4        2
        ..
48187    1
48188    1
48189    1
48190    1
48191    1
Length: 48192, dtype: int8

In [17]:
# Fraction of missing values per column in the modelling frame.
df_tmp.isnull().sum()/len(df_tmp)

PM2.5             0.0
NO                0.0
NO2               0.0
NOx               0.0
NH3               0.0
CO                0.0
SO2               0.0
O3                0.0
Benzene           0.0
Toluene           0.0
AQI               0.0
Year              0.0
Month             0.0
Day               0.0
Hour              0.0
PM2.5_Category    0.0
dtype: float64

In [18]:
np.random.seed(42)
from sklearn.model_selection import train_test_split

# Features are every column except the regression target PM2.5.
# NOTE(review): "PM2.5_Category" is binned directly from PM2.5, so keeping it in X
# leaks the target into the features and inflates every score below. It is left
# in place here only because later cells read X_train["PM2.5_Category"].
X = df_tmp.drop("PM2.5", axis=1)
Y = df_tmp["PM2.5"]

# Pass the seed explicitly so the 80/20 split does not depend on the state of
# NumPy's global random generator at the moment this cell runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, train_size=0.8, random_state=42)

X_train.shape, X_valid.shape

((38553, 15), (9639, 15))

In [19]:
# Print the name of every numeric feature column in the training split.
for column_name, column_dtype in X_train.dtypes.items():
    if not pd.api.types.is_numeric_dtype(column_dtype):
        continue
    print(column_name)


NO
NO2
NOx
NH3
CO
SO2
O3
Benzene
Toluene
AQI
Year
Month
Day
Hour


In [20]:
# Report each numeric training column that still contains missing values.
for column_name, column in X_train.items():
    if not pd.api.types.is_numeric_dtype(column):
        continue
    missing_count = pd.isnull(column).sum()
    if missing_count:
        print(column_name, "--> has", missing_count, "null values")

In [21]:
# Inspect the integer codes behind PM2.5_Category in the training split.
pd.Categorical(X_train["PM2.5_Category"]).codes

array([2, 2, 2, ..., 2, 2, 3], dtype=int8)

In [22]:
# Replace every non-numeric training column with its categorical codes shifted by one.
non_numeric_columns = [
    column_name
    for column_name, column_dtype in X_train.dtypes.items()
    if not pd.api.types.is_numeric_dtype(column_dtype)
]
for column_name in non_numeric_columns:
    # pandas codes start at 0 and use -1 for missing values, so +1 maps missing to 0
    X_train[column_name] = pd.Categorical(X_train[column_name]).codes + 1

In [23]:
# Confirm that every training column is numeric after the encoding step.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38553 entries, 180 to 15795
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   NO              38553 non-null  float64
 1   NO2             38553 non-null  float64
 2   NOx             38553 non-null  float64
 3   NH3             38553 non-null  float64
 4   CO              38553 non-null  float64
 5   SO2             38553 non-null  float64
 6   O3              38553 non-null  float64
 7   Benzene         38553 non-null  float64
 8   Toluene         38553 non-null  float64
 9   AQI             38553 non-null  float64
 10  Year            38553 non-null  int32  
 11  Month           38553 non-null  int32  
 12  Day             38553 non-null  int32  
 13  Hour            38553 non-null  int32  
 14  PM2.5_Category  38553 non-null  int8   
dtypes: float64(10), int32(4), int8(1)
memory usage: 3.9 MB


In [24]:
# Apply the same shifted categorical-code encoding to the validation split.
for column_name in list(X_valid.columns):
    column = X_valid[column_name]
    if pd.api.types.is_numeric_dtype(column):
        continue
    # turn category into number (codes start at 0, so shift by one)
    X_valid[column_name] = pd.Categorical(column).codes + 1

In [25]:
# Check that encoding left the row and column counts of both splits unchanged.
X_train.shape, X_valid.shape

((38553, 15), (9639, 15))

In [26]:
%%time
# Let's build a machine learning model
from sklearn.ensemble import RandomForestRegressor

CPU times: total: 46.9 ms
Wall time: 53.5 ms


In [27]:
np.random.seed(42)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """Return the root mean squared log error of `y_preds` against `y_test`.

    This is the square root of scikit-learn's `mean_squared_log_error`.
    """
    squared_log_error = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(squared_log_error)

# create a function to evaluate model on a few different levels

def show_scores(model):
    """Score a fitted model on the notebook's training and validation splits.

    Reads the module-level X_train, Y_train, X_valid and Y_valid, predicts on
    both feature sets, and returns a dict with MAE, RMSLE and R^2 for each
    split (training entry first, then validation, for every metric).
    """
    splits = (("Training", X_train, Y_train), ("valid", X_valid, Y_valid))
    metrics = (("MAE", mean_absolute_error), ("RMSLE", rmsle), ("R^2", r2_score))

    # Predict once per split, training first, and reuse the result for every metric.
    predictions = {split_name: model.predict(features) for split_name, features, _ in splits}

    return {
        f"{split_name} {metric_name}": metric(targets, predictions[split_name])
        for metric_name, metric in metrics
        for split_name, _, targets in splits
    }

In [28]:
# Baseline random forest: default hyperparameters, parallel training, fixed seed.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [29]:
%%time
# Fit the baseline forest on the training split.
model.fit(X_train, Y_train)

CPU times: total: 1min 12s
Wall time: 6.88 s


In [30]:
# Training and validation MAE, RMSLE and R^2 for the baseline forest.
show_scores(model)

{'Training MAE': 2.450594132213749,
 'valid MAE': 6.64356108187817,
 'Training RMSLE': 0.07242557458896745,
 'valid RMSLE': 0.1837082938155696,
 'Training R^2': 0.9847112618978787,
 'valid R^2': 0.886697050117501}

In [31]:
%%time

# Hyperparameters for the regularised random forest
# (how these values were chosen is not shown in this notebook)

ideal_rf_params = {
    "n_estimators": 90,
    "min_samples_leaf": 18,
    "min_samples_split": 3,
    "max_features": 0.5,
    "n_jobs": -1,
    "max_samples": None,
    "random_state": 42,  # fixed seed so that the result is reproducible
}
ideal_model = RandomForestRegressor(**ideal_rf_params)

# Fit the model

ideal_model.fit(X_train, Y_train)

CPU times: total: 16.2 s
Wall time: 1.77 s


In [32]:
# Training and validation MAE, RMSLE and R^2 for the regularised forest.
show_scores(ideal_model)

{'Training MAE': 6.2690524414039075,
 'valid MAE': 7.153757642871835,
 'Training RMSLE': 0.1802845300669528,
 'valid RMSLE': 0.19774716611747403,
 'Training R^2': 0.8873050742968958,
 'valid R^2': 0.8558087510530453}

# Linear Regression

In [33]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model fitted on the same training split as the forests.
linreg = LinearRegression()
linreg.fit(X_train,Y_train)

In [34]:
# Predict PM2.5 for the validation split with the linear model.
y_pred = linreg.predict(X_valid)

In [35]:
from sklearn.model_selection import cross_val_score

# Mean R^2 over 10 cross-validation folds of the training split
cv_r2_scores = cross_val_score(linreg, X_train, Y_train, cv=10, scoring="r2")
print('Cross_val_score', cv_r2_scores.mean())

# Hold-out R^2 and MAE on the validation split
valid_r2 = r2_score(Y_valid, y_pred)
print('r2_score', valid_r2)
valid_mae = mean_absolute_error(Y_valid, y_pred)
print("valid MAE ", valid_mae)

Cross_val_score 0.5756642561308302
r2_score 0.5394380578504832
valid MAE  13.35688383083081
