In [117]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [100]:
# Read the pre-cleaned New York air-quality CSV (the path is relative to the
# working directory) and preview its first five rows.
cleaned_data = pd.read_csv(
    filepath_or_buffer='./data/cleaned_new_york_air_quality.csv',
)
cleaned_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,pm25,o3,no2,co
0,0,2024-05-01,54.0,30.0,16.0,2.0
1,3,2024-05-04,27.0,22.0,10.0,2.0
2,5,2024-05-06,66.0,44.0,12.0,2.0
3,6,2024-05-07,34.0,41.0,18.0,2.0
4,8,2024-05-09,17.0,23.0,8.0,2.0


In [101]:
# Build the supervised-learning table for PM2.5 forecasting: the features of
# each row are the PM2.5 values of the N_LAGS preceding rows, the target is
# the row's own PM2.5 value.
N_LAGS = 7
lag_columns = [f'pm25_lag_{i}' for i in range(1, N_LAGS + 1)]

# Derive a NEW frame instead of mutating `cleaned_data` in place. With the
# previous in-place dropna, every re-run of this cell shifted the already
# trimmed data and silently discarded another N_LAGS rows.
# NOTE(review): shift() lags by row position, not by calendar day. The CSV
# preview shows gaps between consecutive dates (e.g. 2024-05-01 -> 2024-05-04),
# so "lag 1" is not always "the previous day" -- confirm this is intended.
lagged_data = cleaned_data.assign(
    **{
        column: cleaned_data['pm25'].shift(lag)
        for lag, column in enumerate(lag_columns, start=1)
    }
)

# Only the target and the lag columns feed the model, so only missing values
# in those columns justify dropping a row (the first N_LAGS rows always go,
# because they have no complete history).
lagged_data = lagged_data.dropna(subset=['pm25', *lag_columns])

# Features and target
X = lagged_data[lag_columns]
y = lagged_data['pm25']

# shuffle=False keeps the row order, so the test set is the final 20% of rows
# rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [106]:
# Baseline regressors, keyed by the display name used in the results table.
# The tree-based models are stochastic, so they are seeded (with the same
# value, 42, that the ensemble models in this notebook use); without a seed
# their scores change on every run and the comparison is not reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
    'Support Vector Regressor': SVR()
}

# Hold-out metrics per model: {model name: {'MAE': ..., 'MSE': ..., 'RMSE': ...}}
performance = {}

# Fit each model on the training rows and score it on the held-out rows.
# The fitted estimators stay in `models`, so they can be reused afterwards.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is in the same units as the target

    performance[model_name] = {'MAE': mae, 'MSE': mse, 'RMSE': rmse}

# One row per model, one column per metric
performance_df = pd.DataFrame(performance).T

performance_df

Unnamed: 0,MAE,MSE,RMSE
Linear Regression,9.734856,146.873468,12.119136
Decision Tree Regressor,14.156653,329.301428,18.146664
Random Forest Regressor,9.967672,158.489886,12.589277
Support Vector Regressor,10.400013,169.583429,13.02242


In [114]:
# Ensemble regressors. Every estimator that accepts a seed gets
# random_state=42 so repeated runs build the same ensembles.
bagging_regressor = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    random_state=42,
)
gradient_boosting_regressor = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
)
# Voting ensemble built from a linear model, a random forest and an SVR.
voting_regressor = VotingRegressor(
    estimators=[
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('svr', SVR()),
    ]
)

# Display name -> estimator; the names become the row labels of the table.
ensemble_models = {
    'Bagging Regressor': bagging_regressor,
    'Gradient Boosting Regressor': gradient_boosting_regressor,
    'Voting Regressor': voting_regressor,
}

# Hold-out metrics per ensemble: {name: {'MAE': ..., 'MSE': ..., 'RMSE': ...}}
ensemble_performance = {}

# Fit on the training rows, predict the held-out rows, record the errors.
for model_name, model in ensemble_models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    ensemble_performance[model_name] = dict(MAE=mae, MSE=mse, RMSE=rmse)

# Transpose so that each ensemble is a row and each metric a column.
ensemble_performance_df = pd.DataFrame(ensemble_performance).transpose()

ensemble_performance_df


Unnamed: 0,MAE,MSE,RMSE
Bagging Regressor,9.96966,157.588599,12.55343
Gradient Boosting Regressor,9.853147,154.230348,12.418951
Voting Regressor,9.866378,153.009225,12.36969


In [115]:
# Stack the baseline scores on top of the ensemble scores so that all models
# can be compared in a single table (row labels are the model names).
combined_performance_df = pd.concat(
    (performance_df, ensemble_performance_df),
    axis=0,
)

combined_performance_df

Unnamed: 0,MAE,MSE,RMSE
Linear Regression,9.734856,146.873468,12.119136
Decision Tree Regressor,14.156653,329.301428,18.146664
Random Forest Regressor,9.967672,158.489886,12.589277
Support Vector Regressor,10.400013,169.583429,13.02242
Bagging Regressor,9.96966,157.588599,12.55343
Gradient Boosting Regressor,9.853147,154.230348,12.418951
Voting Regressor,9.866378,153.009225,12.36969


In [118]:
# Persist the fitted linear-regression baseline; in the comparison table
# recorded in this notebook it has the lowest MAE, MSE and RMSE.
# The pickle is written to the current working directory. Only unpickle such
# files when they come from a trusted source.
with open('model.pkl', mode='wb') as file:
    pickle.dump(obj=models['Linear Regression'], file=file)

print("Model saved successfully as 'model.pkl'")

Model saved successfully as 'model.pkl'
