In [267]:
import pandas as pd
import numpy as np
import os
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [268]:
# Single seed reused for the train/test split, the CV folds, the model and the search.
seed = 42

# Load the labelled training data (path is relative to the current working directory).
df = pd.read_csv(os.path.abspath("../data/labeled_training_data.csv"))

# Features: every column except the last one.
# .copy() gives X its own data, so the engineered columns added below are not
# written to a slice of df (which raises pandas' SettingWithCopyWarning).
# NOTE(review): features are chosen by position while the target below is chosen
# by name -- this assumes 'increase_stock' is the last column; confirm.
X = df.iloc[:, :-1].copy()

# --- Engineered features (vectorised; a missing value compares False and maps to 0) ---

# 1 when 8 <= hour_of_day < 21, else 0.
X['day_or_night'] = ((X['hour_of_day'] >= 8) & (X['hour_of_day'] < 21)).astype(int)

# 1 only when the row is not summertime, not a holiday and weekday != 0.
X['normal_day'] = (~((X['summertime'] == 1) | (X['holiday'] == 1) | (X['weekday'] == 0))).astype(int)

# 1 when temp <= 8, else 0.
X['cold'] = (X['temp'] <= 8).astype(int)

# Remove the features that are excluded from modelling.
# 'holiday' is dropped only after 'normal_day' has been derived from it.
X = X.drop(columns=['snow', 'snowdepth', 'holiday', 'visibility', 'precip', 'dew'])

# Target column.
y = df['increase_stock']

# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)


In [269]:
# Base XGBoost classifier: log-loss as the evaluation metric, seeded with the
# notebook-wide `seed`. All other hyperparameters are left at their defaults here.
model = XGBClassifier(eval_metric='logloss', random_state=seed)


In [270]:
# Two-step estimator: standardise the features, then fit the XGBoost classifier.
# The step name 'xg' is what the "xg__" prefix in param_grid refers to.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xg', model)])


# Stratified, shuffled 5-fold cross-validation with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)


# Candidate values for each hyperparameter of the 'xg' step.
#
# np.linspace(..., dtype=int) truncates the evenly spaced floats, so a narrow
# integer range yields repeated values: 7..8 in 5 steps becomes [7, 7, 7, 7, 8].
# Every entry here is a list, so RandomizedSearchCV samples the Cartesian grid
# without replacement; repeated values would make it fit identical candidates
# several times while leaving others untried. np.unique removes the repeats.
#
# With the repeats removed the grid holds 5 * 2 * 5 * 5 * 4 * 3 = 3000 distinct
# combinations, so n_iter=3000 below evaluates each of them exactly once.
param_grid = {
    'xg__max_depth': list(np.unique(np.linspace(40, 50, num=5, dtype=int))),
    'xg__min_child_weight': list(np.unique(np.linspace(7, 8, num=5, dtype=int))),
    'xg__learning_rate': list(np.linspace(0.01, 0.2, num=5)),
    'xg__gamma': list(np.linspace(0, 10, num=5)),
    'xg__lambda': list(np.linspace(1, 30, num=4)),
    'xg__alpha': list(np.linspace(0, 5, num=3)),
}

# Metric names for multi-metric evaluation. Not passed to the randomized search
# below (which optimises 'f1_weighted' only); kept for the grid-search variant.
scoring = [
    'f1_weighted',
    'accuracy',
    'recall_weighted',
    'precision_weighted',
    'roc_auc'
]


# Randomized search over the parameter grid, optimising weighted F1.
# refit=True re-trains the best candidate on all of X_train afterwards, which is
# what makes best_estimator_ available.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=3000,
    scoring='f1_weighted',
    refit=True,
    cv=skf,
    n_jobs=-1,
    verbose=3,
    return_train_score=False,
    random_state=seed,
)


random_search.fit(X_train, y_train)

print("Best Parameters:")
print(random_search.best_params_)
print(f"\nBest Score: {random_search.best_score_:.4f}")

# Pipeline (scaler + classifier) refit with the best parameters found.
best_model = random_search.best_estimator_


# #Perform grid search over the parameter grid 
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring=scoring,           
#     refit='f1_weighted',       
#     cv=5,                      
#     n_jobs=-1,                
#     verbose=3,                 
#     return_train_score=False  
# )

# grid_search.fit(X_train, y_train)

# # Print best parameters and scores
# print("Best Parameters:", grid_search.best_params_)
# print(f"Best F1-Weighted Score: {grid_search.best_score_:.4f}")

# # Retrieve the best model
# best_model = grid_search.best_estimator_

Fitting 5 folds for each of 3000 candidates, totalling 15000 fits
Best Parameters:
{'xg__min_child_weight': np.int64(7), 'xg__max_depth': np.int64(42), 'xg__learning_rate': np.float64(0.2), 'xg__lambda': np.float64(1.0), 'xg__gamma': np.float64(2.5), 'xg__alpha': np.float64(2.5)}

Best Score: 0.8868


In [271]:
# Score the tuned pipeline on the held-out test split.
# y_pred holds the predicted class labels; y_proba holds the second column of
# predict_proba, used as the score for ROC AUC.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Compute each summary metric first, then report them in a fixed order.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
test_recall_weighted = recall_score(y_test, y_pred, average='weighted')
test_precision_weighted = precision_score(y_test, y_pred, average='weighted')
test_roc_auc = roc_auc_score(y_test, y_proba)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1-Weighted: {test_f1_weighted:.4f}")
print(f"Test Recall-Weighted: {test_recall_weighted:.4f}")
print(f"Test Precision-Weighted: {test_precision_weighted:.4f}")
print(f"Test ROC AUC: {test_roc_auc:.4f}")

# Per-class precision / recall / F1 breakdown.
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

Test Accuracy: 0.9250
Test F1-Weighted: 0.9215
Test Recall-Weighted: 0.9250
Test Precision-Weighted: 0.9226
Test ROC AUC: 0.9505
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       262
           1       0.87      0.69      0.77        58

    accuracy                           0.93       320
   macro avg       0.90      0.83      0.86       320
weighted avg       0.92      0.93      0.92       320

