In [6]:
import numpy as np
from functions import splitting, evaluate_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import time
import pickle

In [7]:
# Obtain the train/test partitions from the project-local splitting() helper.
# NOTE(review): 'date' presumably selects the date-only feature set - confirm in functions.py.
X_train, X_test, y_train, y_test = splitting('date')

# Flatten both target arrays to 1-D in a single step; scikit-learn regressors
# expect a 1-D target for single-output problems.
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

0    2014010101
1    2014010102
2    2014010103
3    2014010104
4    2014010105
Name: datetime, dtype: int64


In [8]:
# Hyperparameter search space for the random forest:
# 3 * 4 * 3 * 3 * 2 = 216 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base random forest regressor handed to the grid search. max_features and the
# seed are fixed here and are not part of the search space above.
rf_model = RandomForestRegressor(random_state=42, max_features='sqrt')

In [9]:
total_time_start = time.time()

# Exhaustive grid search with 5-fold cross-validation. scikit-learn maximises
# scores, so MSE is supplied in its negated form; n_jobs=-1 uses all CPU cores.
grid_search_rf = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Hyperparameter combination with the best mean cross-validated score
best_params_rf = grid_search_rf.best_params_

start_time = time.time()
# Retrain on the full training set so the final fit can be timed on its own.
# The model starts from the searched estimator's complete configuration and
# overrides only the tuned values. Building it from best_params_ alone would
# silently drop the fixed settings of rf_model (e.g. max_features='sqrt') and
# train a different model than the one the grid search actually selected.
rf_model_best = RandomForestRegressor(**{**rf_model.get_params(), **best_params_rf})
rf_model_best.fit(X_train, y_train)

end_time = time.time()
training_time = end_time - start_time       # final fit only
total_time = end_time - total_time_start    # grid search + final fit

# NOTE(review): machine-specific absolute path - the notebook will not run
# elsewhere unchanged; consider a path relative to a configurable project root.
output_dir = '/Users/sahilnakrani/Documents/weather forecast/src/Machine-Learning/Regression-Models/trained_models/RandomForestRegression/with_only_date'

# Save the trained model to a .pickle file
with open(output_dir + '/RandomForestRegressor.pkl', 'wb') as f:
    pickle.dump(rf_model_best, f)

# Save training times to a text file
with open(output_dir + '/model_training_time.txt', 'w') as f:
    f.write("RandomForestRegressor Training Time : {:.6f} seconds\n".format(training_time))
    f.write("Total Time with GridSearchCV : {:.6f} seconds\n".format(total_time))


In [10]:
# Predict on the held-out test features with the retrained random forest.
# NOTE(review): y_pred_dt_best is not used again in this cell, and the "dt" in
# its name looks like a leftover from a decision-tree notebook - confirm whether
# a later cell needs it before renaming or removing it.
y_pred_dt_best = rf_model_best.predict(X_test)

# Evaluate the model's performance with the project-local evaluate_model() helper.
# It receives the fitted model and the test split rather than the predictions above.
# NOTE(review): the two string arguments look like a model label and the
# feature-set tag - confirm against functions.py.
evaluate_model(rf_model_best, X_test, y_test, 'RandomForestRegression', 'date')

Model: RandomForestRegression
Mean Squared Error: 51.223273996145785
Root Mean Squared Error: 7.157043663143727
R2 Score: -0.0030972969933891203
