In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Ethans/Datasets/Real_Combine.csv')

In [3]:
df.head()

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,PM 2.5
0,7.4,9.8,4.8,1017.6,93.0,0.5,4.3,9.4,219.720833
1,7.8,12.7,4.4,1018.5,87.0,0.6,4.4,11.1,182.1875
2,6.7,13.4,2.4,1019.4,82.0,0.6,4.8,11.1,154.0375
3,8.6,15.5,3.3,1018.7,72.0,0.8,8.1,20.6,223.208333
4,12.4,20.9,4.4,1017.3,61.0,1.3,8.7,22.2,200.645833


**Dataset Description**

# T == Average Temperature (°C)

# TM == Maximum temperature (°C)

# Tm == Minimum temperature (°C)

# SLP == Atmospheric pressure at sea level (hPa)

# H == Average relative humidity (%)

# VV == Average visibility (Km)

# V == Average wind speed (Km/h)

# VM == Maximum sustained wind speed (Km/h)

# PM2.5== Fine particulate matter (PM2.5) is an air pollutant that is a concern for people's health when levels in air are high

**PM2.5 is the Target variable**

In [4]:
df.tail()

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,PM 2.5
1088,18.1,24.0,11.2,1015.4,56.0,1.8,15.9,25.9,288.416667
1089,17.8,25.0,10.7,1015.8,54.0,2.3,9.4,22.2,256.833333
1090,13.9,24.5,11.4,1015.0,95.0,0.6,8.7,14.8,169.0
1091,16.3,23.0,9.8,1016.9,78.0,1.1,7.4,16.5,186.041667
1092,16.3,23.4,9.0,1017.3,68.0,1.3,7.8,18.3,185.583333


In [5]:
# To get statistical data
df.describe()

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,PM 2.5
count,1093.0,1093.0,1093.0,1093.0,1093.0,1093.0,1093.0,1093.0,1092.0
mean,26.009241,32.482251,19.460201,1008.081885,62.918573,2.003111,6.75151,15.805124,109.090984
std,7.237401,6.679078,7.438653,7.529237,15.709816,0.747541,3.841137,7.308435,84.46579
min,6.7,9.8,0.0,991.5,20.0,0.3,0.4,1.9,0.0
25%,19.3,27.8,12.1,1001.1,54.0,1.6,3.7,11.1,41.833333
50%,28.2,34.2,21.2,1008.1,64.0,1.9,6.5,14.8,83.458333
75%,31.7,37.0,26.0,1015.0,74.0,2.6,9.1,18.3,158.291667
max,38.5,45.5,32.7,1023.2,98.0,5.8,24.4,57.6,404.5


In [6]:
df.isnull().sum()

T         0
TM        0
Tm        0
SLP       0
H         0
VV        0
V         0
VM        0
PM 2.5    1
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093 entries, 0 to 1092
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   T       1093 non-null   float64
 1   TM      1093 non-null   float64
 2   Tm      1093 non-null   float64
 3   SLP     1093 non-null   float64
 4   H       1093 non-null   float64
 5   VV      1093 non-null   float64
 6   V       1093 non-null   float64
 7   VM      1093 non-null   float64
 8   PM 2.5  1092 non-null   float64
dtypes: float64(9)
memory usage: 77.0 KB


In [8]:
# Repalcing the nan value with the mean value
mean_value = df['PM 2.5'].mean()
df['PM 2.5'] = df['PM 2.5'].fillna(mean_value)

In [9]:
df.isnull().sum()

T         0
TM        0
Tm        0
SLP       0
H         0
VV        0
V         0
VM        0
PM 2.5    0
dtype: int64

In [10]:
relation = df.corr()
relation_index = relation.index

In [11]:
relation

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,PM 2.5
T,1.0,0.96757,0.953703,-0.881698,-0.510593,0.640738,0.302349,0.288292,-0.630705
TM,0.96757,1.0,0.89197,-0.823447,-0.588017,0.606746,0.293309,0.29759,-0.567378
Tm,0.953703,0.89197,1.0,-0.917557,-0.288647,0.577382,0.296546,0.267249,-0.673415
SLP,-0.881698,-0.823447,-0.917557,1.0,0.242307,-0.518015,-0.330155,-0.311231,0.62241
H,-0.510593,-0.588017,-0.288647,0.242307,1.0,-0.465514,-0.380805,-0.362632,0.137802
VV,0.640738,0.606746,0.577382,-0.518015,-0.465514,1.0,0.377029,0.34265,-0.573864
V,0.302349,0.293309,0.296546,-0.330155,-0.380805,0.377029,1.0,0.775715,-0.268498
VM,0.288292,0.29759,0.267249,-0.311231,-0.362632,0.34265,0.775715,1.0,-0.215813
PM 2.5,-0.630705,-0.567378,-0.673415,0.62241,0.137802,-0.573864,-0.268498,-0.215813,1.0


In [12]:
relation_index

Index(['T', 'TM', 'Tm', 'SLP', 'H', 'VV', 'V', 'VM', 'PM 2.5'], dtype='object')

In [13]:
# Feature Scaling using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df.iloc[:,:-1] = scaler.fit_transform(df.iloc[:,:-1])

In [14]:
# After Scaling
df.head()

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,PM 2.5
0,-2.572437,-3.39757,-1.971716,1.264733,1.915694,-2.01166,-0.638517,-0.876803,219.720833
1,-2.517143,-2.963179,-2.025514,1.384321,1.533592,-1.877826,-0.612471,-0.644088,182.1875
2,-2.669201,-2.858326,-2.294503,1.50391,1.215174,-1.877826,-0.508288,-0.644088,154.0375
3,-2.406556,-2.543768,-2.173458,1.410897,0.578338,-1.61016,0.351226,0.656375,223.208333
4,-1.881265,-1.734903,-2.025514,1.22487,-0.122182,-0.940995,0.507501,0.8754,200.645833


In [15]:
# Train Test Splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.20, random_state=101)

In [16]:
X_train.shape

(874, 8)

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
reg_rf_model = RandomForestRegressor()
reg_rf_model.fit(X_train,y_train)

In [22]:
y_pred = reg_rf_model.predict(X_test)

In [23]:
reg_rf_model.score(X_test,y_test)

0.8160819006052049

In [24]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [25]:
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,y_pred)
print("Mean Absolute Error: ",MAE)
print("Mean Squared Error: ",MSE)
print("Root Mean Squared Error: ",RMSE)
print("R2 Score: ",R2)

Mean Absolute Error:  23.292472237902892
Mean Squared Error:  1458.7233087171057
Root Mean Squared Error:  38.193236426324304
R2 Score:  0.8160819006052049


In [26]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

In [27]:
# Hyper parameters range initialisation for Tuning
parameters = {"n_estimators":[90,100,120,150],
              "max_depth":[1,3,5,7],
              "min_samples_leaf":[1,2,3,4],
              "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4],
              "max_features":["sqrt", "log2"],
              "max_leaf_nodes":[None,10,20,30]}

In [28]:
tuned_model = GridSearchCV(reg_rf_model,param_grid = parameters,scoring = 'neg_mean_squared_error',cv = 5,verbose = 3)

In [29]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [30]:
tuned_model.fit(X,y)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 1/5] END max_depth=5, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=2, min_weight_fraction_leaf=0.3, n_estimators=90;, score=-4208.993 total time=   0.1s
[CV 2/5] END max_depth=5, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=2, min_weight_fraction_leaf=0.3, n_estimators=90;, score=-2982.343 total time=   0.1s
[CV 3/5] END max_depth=5, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=2, min_weight_fraction_leaf=0.3, n_estimators=90;, score=-3223.449 total time=   0.1s
[CV 4/5] END max_depth=5, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=2, min_weight_fraction_leaf=0.3, n_estimators=90;, score=-3590.863 total time=   0.1s
[CV 5/5] END max_depth=5, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=2, min_weight_fraction_leaf=0.3, n_estimators=90;, score=-5237.899 total time=   0.1s
[CV 1/5] END max_depth=5, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=2, min_

In [31]:
tuned_model.best_params_

{'max_depth': 5,
 'max_features': 'log2',
 'max_leaf_nodes': 10,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.1,
 'n_estimators': 90}

In [32]:
tuned_model.best_estimator_

In [33]:
tuned_hyper_model = RandomForestRegressor(max_depth=5, max_features='log2', max_leaf_nodes=10,min_weight_fraction_leaf=0.1, n_estimators=90)

In [34]:
tuned_hyper_model.fit(X_train,y_train)

In [35]:
tuned_pred = tuned_hyper_model.predict(X_test)

In [36]:
MAE = mean_absolute_error(y_test,tuned_pred)
MSE = mean_squared_error(y_test,tuned_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,tuned_pred)
print("Mean Absolute Error: ",MAE)
print("Mean Squared Error: ",MSE)
print("Root Mean Squared Error: ",RMSE)
print("R2 Score: ",R2)

Mean Absolute Error:  44.05545579813606
Mean Squared Error:  3507.5852388035373
Root Mean Squared Error:  59.2248701037287
R2 Score:  0.5577582076525982


In [37]:
tuned_hyper_model.score(X_test,y_test)

0.5577582076525982