In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score, classification_report

In [2]:
seed = 42

#Load data
df = pd.read_csv(os.path.abspath("../data/normalized_labeled_training_data.csv"))

#Select all columns except the last
X = df.iloc[:, :-1]

#Create new features
X['day_or_night'] = X['hour_of_day'].apply(lambda x: 1 if 8 <= x < 21 else 0)

X['normal_day'] = (~((X['summertime'] == 1) | (X['holiday'] == 1) | (X['weekday'] == 0))).astype(int)

X['cold'] = X['temp'].apply(lambda x: 1 if x <= 8 else 0)

#Remove bad features
X = X.drop(['snow', 'snowdepth', 'holiday', 'visibility', 'precip', 'dew'], axis=1)

#Select target column
y = df['increase_stock']

#stratify split, gör så att andelen negativt och positivt är densamma som hela datasettet 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=seed)


In [3]:
model = XGBClassifier(
    random_state=seed,
    #use_label_encoder=False,
    eval_metric='logloss'
)


In [4]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

param_grid = {
    'max_depth': list(np.linspace(3, 15, num=5, dtype=int)),        # Tree depth, (if too low -> overfitting)
    'min_child_weight': list(np.linspace(1, 10, num=5, dtype=int)), # Minimum child weight, (if too low -> overfitting)
    'subsample': list(np.linspace(0.5, 1.0, num=5)),               
    'colsample_bytree': list(np.linspace(0.5, 1.0, num=5)),        
    'learning_rate': list(np.linspace(0.01, 0.2, num=5)),       
    # 'gamma': list(np.linspace(0, 1, num=5)),                       
    # 'n_estimators': list(np.linspace(50, 200, num=4, dtype=int)),  
    # 'reg_alpha': [0, 0.01, 0.1, 1],                               
    # 'reg_lambda': [0, 0.01, 0.1, 1],                               
    # 'scale_pos_weight': [1, 2, 3]                                      
}

scoring = [
    'f1_weighted',
    'accuracy',
    'recall_weighted',
    'precision_weighted',
    'roc_auc'
]


# Perform Grid Search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scoring,           
    refit='f1_weighted',       
    cv=5,                      
    n_jobs=-1,                
    verbose=3,                 
    return_train_score=False  
)

grid_search.fit(X_train, y_train)

# Print best parameters and scores
print("Best Parameters:", grid_search.best_params_)
print(f"Best F1-Weighted Score: {grid_search.best_score_:.4f}")

# Retrieve the best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 3125 candidates, totalling 15625 fits
Best Parameters: {'colsample_bytree': np.float64(0.5), 'learning_rate': np.float64(0.105), 'max_depth': np.int64(9), 'min_child_weight': np.int64(3), 'subsample': np.float64(0.625)}
Best F1-Weighted Score: 0.8989


In [5]:
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Test F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Recall-Weighted: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Precision-Weighted: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

print(classification_report(y_test, y_pred))

Test Accuracy: 0.9208
Test F1-Weighted: 0.9189
Test Recall-Weighted: 0.9208
Test Precision-Weighted: 0.9182
Test ROC AUC: 0.9396
              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       394
         1.0       0.82      0.72      0.77        86

    accuracy                           0.92       480
   macro avg       0.88      0.84      0.86       480
weighted avg       0.92      0.92      0.92       480



In [6]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Plot feature importance
plot_importance(model) 
plot_importance(model, importance_type='total_gain') 
plt.title("Feature Importance")
plt.show()

NotFittedError: need to call fit or load_model beforehand

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   hour_of_day  1600 non-null   float64
 1   day_of_week  1600 non-null   float64
 2   month        1600 non-null   float64
 3   holiday      1600 non-null   float64
 4   weekday      1600 non-null   float64
 5   summertime   1600 non-null   float64
 6   temp         1600 non-null   float64
 7   dew          1600 non-null   float64
 8   humidity     1600 non-null   float64
 9   precip       1600 non-null   float64
 10  snow         1600 non-null   float64
 11  snowdepth    1600 non-null   float64
 12  windspeed    1600 non-null   float64
 13  cloudcover   1600 non-null   float64
 14  visibility   1600 non-null   float64
dtypes: float64(15)
memory usage: 187.6 KB


In [None]:
print(5**5)

3125
