In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

def preprocess_air_quality_data(file_path):
    # Load data
    df = pd.read_csv(file_path, low_memory=False)
    df.drop(columns=['City'], inplace=True)
    
    # Handle missing values
    numeric_columns = df.select_dtypes(include=['float64']).columns
    imputer = SimpleImputer(strategy='mean')
    df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
    
    # Process datetime and create time features
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed', errors='coerce')
    df['Year'] = df['Datetime'].dt.year
    df['Month'] = df['Datetime'].dt.month
    df['Day'] = df['Datetime'].dt.day
    df['Hour'] = df['Datetime'].dt.hour
    
    # Categorize PM2.5 values
    bins = [0, 12, 35.4, 55.4, 150.4, 250.4, float('inf')]
    labels = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
    df['PM2.5_Category'] = pd.cut(df['PM2.5'], bins=bins, labels=labels)
    
    # Perform undersampling
    return df


# For each dataset
df = preprocess_air_quality_data('../Data/Jorapokhar_Kochi_Kolkata_data.csv')

# Print sizes to see the reduction
print("Original sizes:", len(df))



df_tmp = df.copy()
df_tmp.drop(['AQI_Bucket', 'Datetime'], axis=1, inplace=True)

df_tmp['PM2.5_Category'] = df_tmp['PM2.5_Category'].astype("category").cat.as_ordered()

df_tmp['PM2.5_Category'].cat.categories

df_tmp['PM2.5_Category'].cat.codes
df_tmp.isnull().sum()/len(df_tmp)

np.random.seed(42)
from sklearn.model_selection import train_test_split

X = df_tmp.drop("PM2.5", axis=1)
Y = df_tmp["PM2.5"]

X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, train_size=0.8)

len(X_train), len(X_valid)

X_train.shape, X_valid.shape

pd.Categorical(X_train["PM2.5_Category"]).codes

for label, content in X_train.items():
    if not pd.api.types.is_numeric_dtype(content):
        #turn category into number
        X_train[label] = pd.Categorical(content).codes + 1


for label, content in X_valid.items():
    if not pd.api.types.is_numeric_dtype(content):
        X_valid[label] = pd.Categorical(content).codes + 1

Original sizes: 51382


In [3]:
%%time
# Lets build a machine learning model
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score, mean_squared_error

def rmsle(y_test, y_preds):
    """
    calculate root mean square log error between prediction and true labels    
    """

    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# create a function to evaluate model on a few different levels

def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)
    scores = {"Training MAE" : mean_absolute_error(Y_train,train_preds),
              "valid MAE" : mean_absolute_error(Y_valid, val_preds),
              "Training RMSE" : np.sqrt(mean_squared_error(Y_train,train_preds)),
              "valid RMSE" : np.sqrt(mean_squared_error(Y_valid, val_preds)),
              "Training R^2": r2_score(Y_train,train_preds),
              "valid R^2" : r2_score(Y_valid, val_preds)}
    
    return scores

model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(X_train, Y_train)

show_scores(model)

CPU times: total: 1min 5s
Wall time: 7.83 s


{'Training MAE': 1.1638056970641193,
 'valid MAE': 2.9651807244612036,
 'Training RMSE': 2.862725159309264,
 'valid RMSE': 6.287921775763495,
 'Training R^2': 0.9956208886188878,
 'valid R^2': 0.9775952794570182}

In [4]:
%%time

# most ideal parameter 

ideal_model = RandomForestRegressor(n_estimators=90,
                                    min_samples_leaf=18,
                                    min_samples_split=3,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)  # random state so that our reult is reproducable

# FIT THE MODEL

ideal_model.fit(X_train, Y_train)
show_scores(ideal_model)

CPU times: total: 13.9 s
Wall time: 1.83 s


{'Training MAE': 3.2334579320249657,
 'valid MAE': 3.5122467961292236,
 'Training RMSE': 8.278733960780183,
 'valid RMSE': 7.5135744370319335,
 'Training R^2': 0.9633769478796184,
 'valid R^2': 0.9680096893536677}

In [5]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,Y_train)
y_pred = linreg.predict(X_valid)
from sklearn.model_selection import cross_val_score
print('Cross_val_score', cross_val_score(linreg, X_train, Y_train, cv=10, scoring="r2").mean())
print('r2_score', r2_score(Y_valid, y_pred))
print("valid MAE ", mean_absolute_error(Y_valid, y_pred))
print("valid RMSE", np.sqrt(mean_squared_error(Y_valid, y_pred)))

Cross_val_score 0.765291649516906
r2_score 0.773230596968564
valid MAE  12.50005246412342
valid RMSE 20.004587154049016
