In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

def preprocess_air_quality_data(file_path):
    """Load an air-quality CSV and derive the columns used for modelling.

    Steps, in order:
      1. Read the CSV and drop the ``StationId`` column when it is present.
      2. Replace missing values in every ``float64`` column with that
         column's mean (columns that are entirely missing stay missing).
      3. Parse ``Datetime`` (unparseable values become ``NaT``) and add
         ``Year``, ``Month``, ``Day`` and ``Hour`` columns.
      4. Bin ``PM2.5`` into six labelled categories in ``PM2.5_Category``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It must contain ``Datetime`` and
        ``PM2.5`` columns.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.

    Raises
    ------
    KeyError
        If the ``Datetime`` or ``PM2.5`` column is missing.
    """
    # Load data
    df = pd.read_csv(file_path, low_memory=False)
    # errors='ignore' lets the same function handle files without StationId.
    df = df.drop(columns=['StationId'], errors='ignore')

    # Handle missing values: mean-impute float columns. A pandas fill keeps
    # the column set intact even when a column has no observed values.
    numeric_columns = df.select_dtypes(include=['float64']).columns
    if len(numeric_columns):
        column_means = df[numeric_columns].mean()
        df[numeric_columns] = df[numeric_columns].fillna(column_means)

    # Process datetime and create time features
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed', errors='coerce')
    df['Year'] = df['Datetime'].dt.year
    df['Month'] = df['Datetime'].dt.month
    df['Day'] = df['Datetime'].dt.day
    df['Hour'] = df['Datetime'].dt.hour

    # Categorize PM2.5 values. include_lowest=True puts a reading of exactly
    # 0 into 'Good' instead of leaving it uncategorised; negative readings
    # still fall outside every bin and get a missing category.
    bins = [0, 12, 35.4, 55.4, 150.4, 250.4, float('inf')]
    labels = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
    df['PM2.5_Category'] = pd.cut(df['PM2.5'], bins=bins, labels=labels, include_lowest=True)

    return df


# Preprocess each dataset. Only the first file is active; the two Kaggle
# inputs below are commented out.
df1 = preprocess_air_quality_data('../Data/new_data.csv')
# df2 = preprocess_air_quality_data('/kaggle/input/all-other-set/second_set.csv')
# df3 = preprocess_air_quality_data('/kaggle/input/all-other-set/third_set.csv')

# Combine all datasets (disabled together with df2 / df3)
# final_df = pd.concat([df1, df2, df3], ignore_index=True)

# Report the row count of the preprocessed frame (no row reduction happens here)
print("Original sizes:", len(df1))

Original sizes: 1048575


In [22]:
# Work on the first 355,000 rows only (positional slice of the full frame).
df = df1.iloc[:355000]
df.head()

Unnamed: 0,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket,Year,Month,Day,Hour,PM2.5_Category
0,2017-11-24 17:00:00,60.5,98.0,2.35,30.8,18.25,8.5,0.1,11.85,126.4,0.1,6.1,0.1,226.952556,,2017,11,24,17,Unhealthy
1,2017-11-24 18:00:00,65.5,111.25,2.7,24.2,15.07,9.77,0.1,13.17,117.12,0.1,6.25,0.15,226.952556,,2017,11,24,18,Unhealthy
2,2017-11-24 19:00:00,80.0,132.0,2.1,25.18,15.15,12.02,0.1,12.08,98.98,0.2,5.98,0.18,226.952556,,2017,11,24,19,Unhealthy
3,2017-11-24 20:00:00,81.5,133.25,1.95,16.25,10.23,11.58,0.1,10.47,112.2,0.2,6.72,0.1,226.952556,,2017,11,24,20,Unhealthy
4,2017-11-24 21:00:00,75.25,116.0,1.43,17.48,10.43,12.03,0.1,9.12,106.35,0.2,5.75,0.08,226.952556,,2017,11,24,21,Unhealthy


In [23]:
# Summary statistics of the 355,000-row subset.
df.describe()

Unnamed: 0,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,Year,Month,Day,Hour
count,355000,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0,355000.0
mean,2018-07-14 15:04:26.237745664,101.642875,195.390573,31.648493,41.611555,51.719014,32.712946,1.2733,15.876018,37.261115,3.317754,17.619067,2.476366,218.692769,2018.05733,6.235318,15.771915,11.501876
min,2015-01-01 01:00:00,0.01,0.01,0.01,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,16.0,2015.0,1.0,1.0,0.0
25%,2017-06-17 02:00:00,45.25,107.5,4.9,18.59,14.62,24.62,0.64,11.6,16.91,0.75,3.19,1.78,118.0,2017.0,3.0,8.0,6.0
50%,2018-11-12 16:00:00,102.0,201.51981,20.6,41.28,41.05,35.916707,1.21,14.909659,36.65,3.276346,20.037383,2.232788,226.952556,2018.0,6.0,16.0,12.0
75%,2019-10-19 08:00:00,105.015741,201.51981,32.150033,43.681179,52.392202,35.916707,1.393665,14.909659,43.310098,3.276346,20.037383,2.232788,269.0,2019.0,9.0,23.0,18.0
max,2020-07-01 00:00:00,999.99,1000.0,500.0,499.97,500.0,482.53,30.0,199.77,199.98,491.51,499.8,319.95,1109.0,2020.0,12.0,31.0,23.0
std,,89.971786,134.85351,49.906033,34.869068,62.346805,17.895725,1.093254,11.29974,28.394095,5.880905,25.518871,4.688799,125.378323,1.530799,3.438159,8.794425,6.922708


In [24]:
# Column dtypes and non-null counts of the subset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355000 entries, 0 to 354999
Data columns (total 20 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Datetime        355000 non-null  datetime64[ns]
 1   PM2.5           355000 non-null  float64       
 2   PM10            355000 non-null  float64       
 3   NO              355000 non-null  float64       
 4   NO2             355000 non-null  float64       
 5   NOx             355000 non-null  float64       
 6   NH3             355000 non-null  float64       
 7   CO              355000 non-null  float64       
 8   SO2             355000 non-null  float64       
 9   O3              355000 non-null  float64       
 10  Benzene         355000 non-null  float64       
 11  Toluene         355000 non-null  float64       
 12  Xylene          355000 non-null  float64       
 13  AQI             355000 non-null  float64       
 14  AQI_Bucket      267211 non-null  obj

In [25]:
# Number of missing values per column after preprocessing.
df.isna().sum()

Datetime              0
PM2.5                 0
PM10                  0
NO                    0
NO2                   0
NOx                   0
NH3                   0
CO                    0
SO2                   0
O3                    0
Benzene               0
Toluene               0
Xylene                0
AQI                   0
AQI_Bucket        87789
Year                  0
Month                 0
Day                   0
Hour                  0
PM2.5_Category        0
dtype: int64

In [26]:
# Independent working copy so the following edits do not touch `df`.
df_tmp = df.copy()

In [27]:
# Remove the AQI_Bucket label (it still contains missing values) and the raw
# timestamp (already expanded into Year / Month / Day / Hour).
# Rebinding with errors='ignore' keeps the cell safe to re-run; the in-place
# drop raised KeyError on a second execution.
df_tmp = df_tmp.drop(columns=['AQI_Bucket', 'Datetime'], errors='ignore')
df_tmp.head()

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,Year,Month,Day,Hour,PM2.5_Category
0,60.5,98.0,2.35,30.8,18.25,8.5,0.1,11.85,126.4,0.1,6.1,0.1,226.952556,2017,11,24,17,Unhealthy
1,65.5,111.25,2.7,24.2,15.07,9.77,0.1,13.17,117.12,0.1,6.25,0.15,226.952556,2017,11,24,18,Unhealthy
2,80.0,132.0,2.1,25.18,15.15,12.02,0.1,12.08,98.98,0.2,5.98,0.18,226.952556,2017,11,24,19,Unhealthy
3,81.5,133.25,1.95,16.25,10.23,11.58,0.1,10.47,112.2,0.2,6.72,0.1,226.952556,2017,11,24,20,Unhealthy
4,75.25,116.0,1.43,17.48,10.43,12.03,0.1,9.12,106.35,0.2,5.75,0.08,226.952556,2017,11,24,21,Unhealthy


In [28]:
# Store the PM2.5 bins as an ordered categorical so that the integer codes
# follow the label order.
# NOTE(review): pd.cut presumably already yields an ordered categorical here,
# which would make this a no-op — confirm.
df_tmp['PM2.5_Category'] = df_tmp['PM2.5_Category'].astype("category").cat.as_ordered()

In [29]:
# Confirm the remaining columns and their dtypes after the drop / cast.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355000 entries, 0 to 354999
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   PM2.5           355000 non-null  float64 
 1   PM10            355000 non-null  float64 
 2   NO              355000 non-null  float64 
 3   NO2             355000 non-null  float64 
 4   NOx             355000 non-null  float64 
 5   NH3             355000 non-null  float64 
 6   CO              355000 non-null  float64 
 7   SO2             355000 non-null  float64 
 8   O3              355000 non-null  float64 
 9   Benzene         355000 non-null  float64 
 10  Toluene         355000 non-null  float64 
 11  Xylene          355000 non-null  float64 
 12  AQI             355000 non-null  float64 
 13  Year            355000 non-null  int32   
 14  Month           355000 non-null  int32   
 15  Day             355000 non-null  int32   
 16  Hour            355000 non-null  int32

In [30]:
# Category labels in their stored order.
df_tmp['PM2.5_Category'].cat.categories

Index(['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy',
       'Very Unhealthy', 'Hazardous'],
      dtype='object')

In [31]:
# Integer code of each row's category (position of its label in `.cat.categories`).
df_tmp['PM2.5_Category'].cat.codes

0         3
1         3
2         3
3         3
4         3
         ..
354995    3
354996    3
354997    3
354998    3
354999    3
Length: 355000, dtype: int8

In [32]:
# Fraction of missing values per column.
df_tmp.isnull().sum()/len(df_tmp)

PM2.5             0.0
PM10              0.0
NO                0.0
NO2               0.0
NOx               0.0
NH3               0.0
CO                0.0
SO2               0.0
O3                0.0
Benzene           0.0
Toluene           0.0
Xylene            0.0
AQI               0.0
Year              0.0
Month             0.0
Day               0.0
Hour              0.0
PM2.5_Category    0.0
dtype: float64

In [33]:
# Global seed kept for any later code that draws from NumPy's global RNG.
np.random.seed(42)
from sklearn.model_selection import train_test_split

# Features are every column except the regression target.
# NOTE(review): PM2.5_Category is binned directly from PM2.5, so keeping it in
# X leaks the target into the features and inflates the scores. It is left in
# place because later cells index X_train["PM2.5_Category"]; consider removing
# it from X together with those cells.
X = df_tmp.drop(columns="PM2.5")
Y = df_tmp["PM2.5"]

# 80 / 20 split. The seed is passed explicitly so the split does not depend on
# the state of the global RNG or on the order in which cells were executed.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, train_size=0.8, random_state=42
)

X_train.shape, X_valid.shape

((284000, 17), (71000, 17))

In [34]:
# List the training features that already have a numeric dtype.
for label, content in X_train.items():
    if not pd.api.types.is_numeric_dtype(content):
        continue  # non-numeric columns are handled separately
    print(label)


PM10
NO
NO2
NOx
NH3
CO
SO2
O3
Benzene
Toluene
Xylene
AQI
Year
Month
Day
Hour


In [35]:
# Report numeric training features that still contain missing values.
for label, content in X_train.items():
    if pd.api.types.is_numeric_dtype(content):
        # Count once and reuse the result for both the check and the message.
        null_count = content.isna().sum()
        if null_count:
            print(label, "--> has", null_count, "null values")

In [36]:
# Preview the integer codes of the PM2.5 category column in the training split.
pd.Categorical(X_train["PM2.5_Category"]).codes

array([3, 2, 1, ..., 3, 3, 3], dtype=int8)

In [37]:
# Replace each non-numeric training feature with 1-based integer category
# codes (pandas uses -1 for missing, so a missing value becomes 0).
for label, content in X_train.items():
    if pd.api.types.is_numeric_dtype(content):
        continue  # already numeric, nothing to encode
    X_train[label] = pd.Categorical(content).codes + 1

In [38]:
# Verify that every training feature is numeric after encoding.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 284000 entries, 353502 to 121958
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PM10            284000 non-null  float64
 1   NO              284000 non-null  float64
 2   NO2             284000 non-null  float64
 3   NOx             284000 non-null  float64
 4   NH3             284000 non-null  float64
 5   CO              284000 non-null  float64
 6   SO2             284000 non-null  float64
 7   O3              284000 non-null  float64
 8   Benzene         284000 non-null  float64
 9   Toluene         284000 non-null  float64
 10  Xylene          284000 non-null  float64
 11  AQI             284000 non-null  float64
 12  Year            284000 non-null  int32  
 13  Month           284000 non-null  int32  
 14  Day             284000 non-null  int32  
 15  Hour            284000 non-null  int32  
 16  PM2.5_Category  284000 non-null  int8   
dtypes: float64

In [39]:
# Apply the same 1-based category encoding to the validation features.
for label, content in X_valid.items():
    if pd.api.types.is_numeric_dtype(content):
        continue  # already numeric, nothing to encode
    X_valid[label] = pd.Categorical(content).codes + 1

In [40]:
# Both splits must still have the same number of feature columns.
X_train.shape, X_valid.shape

((284000, 17), (71000, 17))

In [41]:
%%time
# Baseline model: a random forest regressor fitted on the training split.
# NOTE(review): the same model is constructed and fitted again in the
# In [43] / In [44] cells further down, so this fit is repeated.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_jobs = -1, random_state = 42)
model.fit(X_train, Y_train)

CPU times: total: 10min 34s
Wall time: 1min 9s


In [42]:
# Re-seed NumPy's global RNG.
# NOTE(review): nothing in this cell draws random numbers — the seed looks
# redundant here; confirm before removing.
np.random.seed(42)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """Return the root mean squared log error of predictions.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_preds : array-like
        Predicted target values.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error(y_test, y_preds)``.
    """
    squared_log_error = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(squared_log_error)

# create a function to evaluate model on a few different levels

def show_scores(model):
    """Score a fitted model on the training and validation splits.

    Reads the notebook-level ``X_train``, ``X_valid``, ``Y_train`` and
    ``Y_valid`` variables.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.

    Returns
    -------
    dict
        MAE, RMSLE and R^2, each for the training and the validation split.
    """
    fitted_train = model.predict(X_train)
    fitted_valid = model.predict(X_valid)

    scores = {}
    scores["Training MAE"] = mean_absolute_error(Y_train, fitted_train)
    scores["valid MAE"] = mean_absolute_error(Y_valid, fitted_valid)
    scores["Training RMSLE"] = rmsle(Y_train, fitted_train)
    scores["valid RMSLE"] = rmsle(Y_valid, fitted_valid)
    scores["Training R^2"] = r2_score(Y_train, fitted_train)
    scores["valid R^2"] = r2_score(Y_valid, fitted_valid)
    return scores

In [43]:
# Fresh (unfitted) random forest with the same settings as the baseline above.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [44]:
%%time
# Fit the model on the training split (timed).
model.fit(X_train, Y_train)

CPU times: total: 10min 4s
Wall time: 1min 6s


In [45]:
# Training vs. validation metrics for the default random forest.
show_scores(model)

{'Training MAE': 3.229921143650981,
 'valid MAE': 8.626105523331365,
 'Training RMSLE': 0.06288664132726494,
 'valid RMSLE': 0.15188069348168307,
 'Training R^2': 0.9920512468257926,
 'valid R^2': 0.9445941650979641}

In [46]:
%%time

# most ideal parameter 

ideal_model = RandomForestRegressor(n_estimators=90,
                                    min_samples_leaf=18,
                                    min_samples_split=3,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)  # random state so that our reult is reproducable

# FIT THE MODEL

ideal_model.fit(X_train, Y_train)

CPU times: total: 2min 45s
Wall time: 17.9 s


In [47]:
# Training vs. validation metrics for the hand-tuned random forest.
show_scores(ideal_model)

{'Training MAE': 8.694859582297818,
 'valid MAE': 9.856649797123374,
 'Training RMSLE': 0.14998641470510804,
 'valid RMSLE': 0.16496718051831957,
 'Training R^2': 0.9442275932797072,
 'valid R^2': 0.9281161879790167}

In [48]:
from sklearn.linear_model import LinearRegression
# Linear regression on the same features, for comparison with the forests.
linreg = LinearRegression()
linreg.fit(X_train,Y_train)

In [49]:
# Linear-regression predictions for the validation split.
y_pred = linreg.predict(X_valid)

In [51]:
from sklearn.model_selection import cross_val_score
# Mean R^2 over 10-fold cross-validation on the training split, followed by
# R^2 and MAE of the linear model on the validation split.
# Relies on `r2_score` / `mean_absolute_error` imported in an earlier cell.
print('Cross_val_score', cross_val_score(linreg, X_train, Y_train, cv=10, scoring="r2").mean())
print('r2_score', r2_score(Y_valid, y_pred))
print("valid MAE ", mean_absolute_error(Y_valid, y_pred))

Cross_val_score 0.716068226934362
r2_score 0.7088907077473554
valid MAE  26.705010096884273
