In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [15]:
dataframe = pd.read_csv("csv/82000278_Toamnei_2022_05.csv")
dataframe_size = len(dataframe.index)
train = dataframe[dataframe.index < dataframe_size*(85/100)]
test = dataframe[dataframe.index >= dataframe_size*(85/100)]


y_train = train['co2']
y_test = test['co2']
x_train = train['time']
x_test = test['time']
# Convert to numpy array
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
# Reshape
x_train = np.reshape(x_train, (-1,1))
x_test = np.reshape(x_test, (-1,1))
y_train = np.reshape(y_train, (-1,1))
y_test = np.reshape(y_test, (-1,1))

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
y = train['co2']

ARMAmodel = ARIMA(y, order=(2,2,2))
ARMAmodel = ARMAmodel.fit()
y_pred = ARMAmodel.get_forecast(len(test.index))
y_pred_df = y_pred.conf_int(alpha=0.05)
y_pred_df['Predictions'] = ARMAmodel.predict(start = y_pred_df.index[0], end = y_pred_df.index[-1])
y_pred_df.index = test.index
y_pred_out = y_pred_df["Predictions"]


import matplotlib.pyplot as plt
plt.close("all")
plt.figure(figsize=(30,15))
# plt.plot(train.index, train.get('pm10'), color = "black", label = "Training set")
plt.plot(train.index, train.get('co2'), color = "black", label = "Training set")
# plt.plot(test.index, test.get('pm10'), color = "red", label = "Testing set")
plt.plot(test.index, test.get('co2'), color = "red", label = "Testing set")
plt.xlabel("Index", fontsize=16)
plt.ylabel("Value", fontsize=16)
# plt.title("Training/Testing/Predictions for PM10 on 1 month", fontsize=18)
plt.title("Training/Testing/Predictions for CO2 on 1 month", fontsize=18)
plt.plot(y_pred_out, color='purple', label = 'ARIMA Predictions')
plt.legend(fontsize=16)
plt.savefig('plots_from_csv/CSV_Tests_1month_ARIMA', bbox_inches = 'tight')
plt.show()


import numpy as np
from sklearn.metrics import mean_squared_error

# arima_rmse2 = np.sqrt(mean_squared_error(test["pm10"].values, y_pred_df["Predictions"]))
arima_rmse2 = np.sqrt(mean_squared_error(test["co2"].values, y_pred_df["Predictions"]))
print("RMSE: ", arima_rmse2)

In [22]:
# %%time

# Fit a base model
ARMAmodel = ARIMA(y_train)

In [25]:
ARMAmodel.param_names

['const', 'sigma2']

In [8]:
n_estimators = np.arange(100, 2000, step=100)
max_features = ["auto", "sqrt", "log2"]
max_depth = list(np.arange(10, 100, step=10)) + [None]
min_samples_split = np.arange(2, 10, step=2)
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

param_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

param_grid

{'n_estimators': array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
        1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900]),
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, None],
 'min_samples_split': array([2, 4, 6, 8]),
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [9]:
from sklearn.model_selection import RandomizedSearchCV

forest = RandomForestRegressor()

random_cv = RandomizedSearchCV(
    forest, param_grid, n_iter=100, cv=3, scoring="r2", n_jobs=-1
)

In [10]:
%%time

_ = random_cv.fit(X, y)

print("Best params:\n")
print(random_cv.best_params_)

random_cv.best_score_

Best params:

{'n_estimators': 700, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': True}
CPU times: total: 7.47 s
Wall time: 11min 37s


-1.1387588318038444

In [11]:
n_iterations = 1

for value in param_grid.values():
    n_iterations *= len(value)
    
n_iterations

13680

In [32]:
new_params = {
    "n_estimators": [600, 700, 800],
    "max_features": ['sqrt'],
    "max_depth": [55, 60, 65],
    "min_samples_split": [4, 6, 8],
    "min_samples_leaf": [3, 4, 5],
    "bootstrap": [True],
}

new_params

{'n_estimators': [600, 700, 800],
 'max_features': ['sqrt'],
 'max_depth': [55, 60, 65],
 'min_samples_split': [4, 6, 8],
 'min_samples_leaf': [3, 4, 5],
 'bootstrap': [True]}

In [16]:
n_iterations = 1

for value in new_params.values():
    n_iterations *= len(value)
    
n_iterations

81

In [17]:
from sklearn.model_selection import GridSearchCV

forest = RandomForestRegressor()

grid_cv = GridSearchCV(forest, new_params, n_jobs=-1)

In [18]:
%%time

_ = grid_cv.fit(X, y)

print('Best params:\n')
print(grid_cv.best_params_, '\n')

Best params:

{'bootstrap': True, 'max_depth': 55, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 600} 

CPU times: total: 6.36 s
Wall time: 8min 38s


In [19]:
grid_cv.best_score_

-1.2896261762348877