In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score

seed = 42

In [112]:
#Load data
df = pd.read_csv(os.path.abspath("../data/normalized_labeled_training_data.csv"))

#Select all columns except the last
X = df.iloc[:, :-1]

#X = X.drop(['snow', 'snowdepth', 'visibility', 'humidity', 'cloudcover', 'weekday','holiday', 'hour_of_day', 'temp'], axis=1)
X = X[['temp', 'hour_of_day', 'windspeed', 'day_of_week', 'precip']]

#Viktiga hour_of_day, windspeed, temp

X.info()

#Select label column
y = df['increase_stock']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=seed)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   temp         1600 non-null   float64
 1   hour_of_day  1600 non-null   float64
 2   windspeed    1600 non-null   float64
 3   day_of_week  1600 non-null   float64
 4   precip       1600 non-null   float64
dtypes: float64(5)
memory usage: 62.6 KB


In [113]:
model = RandomForestClassifier( 
    random_state=seed,
    # n_estimators=145,
    # max_depth=24,
    class_weight='balanced_subsample',
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='sqrt',
    bootstrap=False,
    criterion='gini',
    n_jobs=-1
)

#model.fit(X_train, y_train)

param_grid = {
    'n_estimators': list(np.linspace(135, 155, num=20, dtype=int)),      
    'max_depth': list(np.linspace(15, 35, num=20, dtype=int)),              
    #'min_samples_split': list(np.linspace(2, 20, num=18, dtype=int)),      
    #'min_samples_leaf': list(np.linspace(1, 8, num=8, dtype=int)),        
    #'max_features': ['sqrt', 'log2'],                             
    #'bootstrap': [True, False],
    #'criterion' : ['gini', 'entropy', 'log_loss'],
    #'class_weight': ['balanced', 'balanced_subsample']                                        
}

scoring = [
    'f1_weighted',
    'accuracy',
    'recall_weighted',
    'precision_weighted',
    'roc_auc'
]

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1_weighted', 
    cv=skf,
    n_jobs=-1,
    verbose=3,
    return_train_score=False
)

# grid_search = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=param_grid,
#     n_iter=6000,
#     refit='f1_weighted', 
#     cv=skf,
#     n_jobs=-1,
#     verbose=3,
#     return_train_score=False,
#     random_state=seed,
# )

grid_search.fit(X_train, y_train)

print("Best Parameters:")
print(grid_search.best_params_)
print(f"\nBest {grid_search.refit} Score: {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Best Parameters:
{'max_depth': 17, 'n_estimators': 135}

Best f1_weighted Score: 0.8964


In [114]:
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Test F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Recall-Weighted: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Precision-Weighted: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")


# Best Parameters:
# {'n_estimators': 145, 'max_depth': 24, 'criterion': 'gini', 'class_weight': 'balanced_subsample', 'bootstrap': False}

# Best f1_weighted Score: 0.9062


# Test Accuracy: 0.8812
# Test F1-Weighted: 0.8803
# Test Recall-Weighted: 0.8812
# Test Precision-Weighted: 0.8794
# Test ROC AUC: 0.8886

Test Accuracy: 0.8594
Test F1-Weighted: 0.8640
Test Recall-Weighted: 0.8594
Test Precision-Weighted: 0.8703
Test ROC AUC: 0.8831
