In [0]:
#Import all the libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score , classification_report, mean_squared_error, r2_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [0]:
# Load the training data into a DataFrame.
# NOTE(review): the path points into a Google Drive folder under /content/drive;
# presumably the drive must be mounted before this cell runs -- no mount call is visible here.
DATA_PATH = "/content/drive/My Drive/Training Data -v3.csv"
df = pd.read_csv(DATA_PATH)

In [0]:
# Build the correlation matrix and rank every attribute by its correlation with AQI.
# Only numeric/bool columns are correlated: the frame still contains the 'Date' column
# at this point (presumably a non-numeric string column -- confirm), and on
# pandas >= 2.0 DataFrame.corr() raises on non-numeric columns instead of silently
# skipping them. Selecting the dtypes explicitly behaves the same on old and new pandas.
numeric_df = df.select_dtypes(include=["number", "bool"])
corr_matrix = numeric_df.corr()
corr_matrix['AQI'].sort_values(ascending=False)

In [0]:
# Draw one AQI scatter plot per selected attribute.
# The low alpha makes dense regions of overlapping points stand out.
# NOTE(review): these are presumably the five attributes ranked highest in the
# correlation step -- confirm against that cell's output.
for feature_name in ("PM10", "CO", "Precip", "PM2.5", "Speed"):
    df.plot(kind="scatter", x=feature_name, y="AQI", alpha=0.1)

In [0]:
# Pull out the regression target, then strip the target and the Date column from
# the frame so that `df` holds only the model's input features from here on.
# The drop is in place, so re-running this cell without reloading `df` raises a
# KeyError because 'AQI' is already gone.
target_column = 'AQI'
non_feature_columns = [target_column, 'Date']
labels = df[target_column]
df.drop(columns=non_feature_columns, inplace=True)

In [0]:
# Hold out 30% of the rows for testing and keep the remaining 70% for training.
# The fixed random_state makes the split identical on every run.
split_parts = train_test_split(
    df,
    labels,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [0]:
# Tune the random forest with an exhaustive grid search.
# Two sub-grids are explored: the default (bootstrap) forest over 4 x 5 settings,
# and a smaller 2 x 3 grid with bootstrapping disabled -- 26 candidates in total,
# each scored with 5-fold cross-validation on negative MSE.
# NOTE(review): max_features goes up to 5, so this assumes the feature frame has
# at least 5 columns -- confirm against the dataset.
param_grid = [
    {'n_estimators': [3, 10, 30, 40], 'max_features': [2, 4, 3, 1, 5]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the forest: without random_state every run grows different trees, so the
# selected hyperparameters and all reported metrics would change between runs
# even though the train/test split itself is seeded with 42.
forest_reg = RandomForestRegressor(random_state=42)
gridSearch = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_mean_squared_error")
gridSearch.fit(X_train, y_train)

In [0]:
# Evaluate the best estimator found by the grid search on both splits.
# No additional fitting happens in this cell; it only predicts and scores.
final_model = gridSearch.best_estimator_

# Training split: predictions, MSE, RMSE and R^2.
y_train_pred = final_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
# Despite the name, this is the R^2 score returned by r2_score, not an accuracy.
Train_Accuracy = r2_score(y_train, y_train_pred)

# Held-out test split: same metrics.
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# Same caveat as above: this is the R^2 score on the test split.
Test_Accuracy = r2_score(y_test, final_predictions)

In [0]:
# Print the RMSE and R^2 values for the train and test splits, rounded to 2 decimals.
# Train_Accuracy / Test_Accuracy are computed with r2_score, so they are labelled
# as R^2 (scaled by 100) rather than "accuracy": R^2 is a regression
# goodness-of-fit measure and can be negative, which an accuracy cannot.
print("RMSE Train:", np.round(train_rmse, 2))
print("RMSE Test:", np.round(final_rmse, 2))
print("Train R^2 (%):", np.round(Train_Accuracy * 100, 2))
print("Test R^2 (%):", np.round(Test_Accuracy * 100, 2))