In [13]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score

seed = 42

In [14]:
#Load data
df = pd.read_csv(os.path.abspath("../data/normalized_labeled_training_data.csv"))

#Select all columns except the last
X = df.iloc[:, :-1]

#X = X.drop(['snow', 'snowdepth'], axis=1)

#, 'holiday', 'visibility', 'month'

#Create day or night variable
#X['day_or_night'] = X['hour_of_day'].apply(lambda x: 1 if 0.2608695652173913 <= x < 0.782608695652174 else 0)

#X = X.drop(['hour_of_day'], axis=1)

#Viktiga hour_of_day, windspeed, temp

X.info()

#Select label column
y = df['increase_stock']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=seed)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   hour_of_day  1600 non-null   float64
 1   day_of_week  1600 non-null   float64
 2   month        1600 non-null   float64
 3   holiday      1600 non-null   float64
 4   weekday      1600 non-null   float64
 5   summertime   1600 non-null   float64
 6   temp         1600 non-null   float64
 7   dew          1600 non-null   float64
 8   humidity     1600 non-null   float64
 9   precip       1600 non-null   float64
 10  snow         1600 non-null   float64
 11  snowdepth    1600 non-null   float64
 12  windspeed    1600 non-null   float64
 13  cloudcover   1600 non-null   float64
 14  visibility   1600 non-null   float64
dtypes: float64(15)
memory usage: 187.6 KB


In [None]:
model = RandomForestClassifier( 
    random_state=seed,
    # n_estimators=145,
    # max_depth=24,
    class_weight='balanced_subsample',
    # min_samples_leaf=1,
    # min_samples_split=2,
    max_features='sqrt',
    # bootstrap=False,
    criterion='gini',
    n_jobs=-1
)

#best_model.fit(X_train, y_train)

param_grid = {
    'n_estimators': list(np.linspace(50, 155, num=50, dtype=int)),      
    'max_depth': list(np.linspace(5, 35, num=20, dtype=int)),              
    'min_samples_split': list(np.linspace(2, 20, num=18, dtype=int)),      
    'min_samples_leaf': list(np.linspace(1, 8, num=8, dtype=int)),                                   
    'bootstrap': [True, False],                                       
}

# scoring = [
#     'f1_weighted',
#     'accuracy',
#     'recall_weighted',
#     'precision_weighted',
#     'roc_auc'
# ]

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=2000,
    scoring='f1_weighted',
    refit=True, 
    cv=skf,
    n_jobs=-1,
    verbose=3,
    return_train_score=False,
    random_state=seed,
)

random_search.fit(X_train, y_train)

print("Best Parameters:")
print(random_search.best_params_)
print(f"\nBest {random_search.refit} Score: {random_search.best_score_:.4f}")

best_model = random_search.best_estimator_

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
[CV 2/5] END bootstrap=True, max_depth=30, min_samples_leaf=8, min_samples_split=11, n_estimators=67;, score=0.859 total time=   0.4s
[CV 2/5] END bootstrap=False, max_depth=5, min_samples_leaf=4, min_samples_split=5, n_estimators=86;, score=0.801 total time=   0.3s
[CV 1/5] END bootstrap=False, max_depth=5, min_samples_leaf=4, min_samples_split=5, n_estimators=86;, score=0.844 total time=   0.3s
[CV 3/5] END bootstrap=True, max_depth=30, min_samples_leaf=8, min_samples_split=11, n_estimators=67;, score=0.871 total time=   0.5s
[CV 1/5] END bootstrap=True, max_depth=30, min_samples_leaf=8, min_samples_split=11, n_estimators=67;, score=0.871 total time=   0.4s
[CV 5/5] END bootstrap=False, max_depth=5, min_samples_leaf=4, min_samples_split=5, n_estimators=86;, score=0.766 total time=   0.4s
[CV 4/5] END bootstrap=True, max_depth=30, min_samples_leaf=8, min_samples_split=11, n_estimators=67;, score=0.848 total time=   0.5s

KeyboardInterrupt: 

In [None]:
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Test F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Recall-Weighted: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Precision-Weighted: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")


# Best Parameters: (all features)
# {'n_estimators': 145, 'max_depth': 24, 'criterion': 'gini', 'class_weight': 'balanced_subsample', 'bootstrap': False}
# Best f1_weighted Score: 0.9062

#Best test results
# Test Accuracy: 0.8812
# Test F1-Weighted: 0.8803
# Test Recall-Weighted: 0.8812
# Test Precision-Weighted: 0.8794
# Test ROC AUC: 0.8886

Test Accuracy: 0.8938
Test F1-Weighted: 0.8919
Test Recall-Weighted: 0.8938
Test Precision-Weighted: 0.8905
Test ROC AUC: 0.8899
