# Introduction

This notebook performs a first Exploratory Data Analysis for a binary classification problem whose target is the 'Class' column.

In [None]:
# Import Standard Modules
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV

import plotly.express as ex

from xgboost import XGBClassifier

from hyperopt import hp

import mlflow

# Read Data

In [None]:
# First and last year to load (both inclusive)
start_year = 2014
end_year = 2016

# Every year from start_year to end_year; arange excludes its stop value, hence the +1
range_year = np.arange(start_year, end_year + 1, dtype=np.int32)

In [None]:
# Read one CSV per year and concatenate them in a single call.
# Growing a DataFrame with pd.concat inside the loop re-copies every row
# loaded so far on each iteration, and starts from an empty frame.
yearly_frames = [
    pd.read_csv(f'./../data/{year}_Financial_Data.csv',
                sep=',',
                encoding='latin1',
                index_col=0)
    for year in range_year
]

# pd.concat raises on an empty list, so keep the empty-frame result for an empty year range
data = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [None]:
# Print the column / dtype / non-null summary of the concatenated frame
data.info()

In [None]:
# Preview the first five rows (head() defaults to 5)
data.head()

# Data Pre-processing

In [None]:
# Define label
y = data['Class']

# Define features
# NOTE: every '<year + 1> PRICE VAR [%]' column is dropped because it is directly
# related to the Class and is only valid a posteriori. The names are derived from
# range_year (one per loaded year) so they follow start_year / end_year instead of
# being hard-coded; for 2014-2016 this yields the 2015, 2016 and 2017 columns.
price_var_columns = [f'{int(year) + 1} PRICE VAR [%]' for year in range_year]
X = data.drop(['Class', 'Sector', *price_var_columns], axis=1)

## Check Data Distribution

In [None]:
# Summary statistics of the features, one row per feature
X.describe().T

In [None]:
# Percentage histogram of the label, with one bin per distinct value of y
figure = ex.histogram(
    data_frame=y,
    x='Class',
    title='Class Distribution',
    histnorm='percent',
    nbins=np.unique(y.values).size,
)
figure.show()

The classes are reasonably well balanced; there is no significant class skew.

In [None]:
# Number of missing labels in y
y.isna().sum()

## Fill NaN Values

In [None]:
# Replace each missing entry with the mean of its own column
# NOTE(review): the means are computed over every row, including the rows that
# the later train/test split holds out as the test set.
column_means = X.mean()
X = X.fillna(column_means)

## Feature Normalization

In [None]:
# Scaler that maps every feature onto the [0, 1] range (the MinMaxScaler default)
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler and rescale the features, keeping the original index and column labels
X_scaled = pd.DataFrame(
    data=min_max_scaler.fit_transform(X.to_numpy()),
    index=X.index,
    columns=X.columns,
)

# Feature Selection

## Univariate Selection

In [None]:
# Define the number of desired features
# Used as `k` for SelectKBest and logged to MLflow as the 'n_features' param.
n_features = 180

In [None]:
# Build the chi-squared univariate selector, then fit it on the scaled features.
# fit() returns the selector itself, so the name stays bound to the fitted object.
feature_selector = SelectKBest(score_func=chi2, k=n_features)
feature_selector = feature_selector.fit(X_scaled, y)

In [None]:
# Keep only the selected columns, labelled with the names reported by the selector
selected_columns = feature_selector.get_feature_names_out()
X_feature_selected = pd.DataFrame(
    data=feature_selector.transform(X_scaled),
    index=X_scaled.index,
    columns=selected_columns,
)

# Exploratory Data Analysis

## Market Cap

In [None]:
# Plot the distribution of the market Cap
# TODO
#figure = ex.histogram(data_frame=X_feature_selected, 
#                        x='Market Cap', 
#                        title='Market Cap Distribution', 
#                        nbins=X_feature_selected['Market Cap'].nunique())
#figure.show()

# Split Data into Training and Test Sets

In [None]:
# Split data
# A fixed random_state makes the shuffle, and therefore every score computed
# from these sets, repeatable across kernel restarts.
test_size = 0.33
random_state = 0
X_train, X_test, y_train, y_test = train_test_split(X_feature_selected,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=random_state)

# Model Definition

## Hyperparameters Tuning

In [None]:
# Candidate values read by grid_search() and halving_grid_search() as their param_grid
parameters = dict(
    max_depth=np.arange(5, 10),
    min_child_weight=np.arange(5, 15),
    eta=np.linspace(0.01, 0.1, num=10),
)

# Search space expressed with hyperopt distributions
# (not referenced by any other cell visible in this notebook)
hyperopt_parameters_space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

# Base estimator handed to the search helpers
xgboost_classifier = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
)

### Grid Search

In [None]:
# Define GridSearch function
def grid_search():
    """Tune the XGBoost estimator with an exhaustive grid search.

    Fits a GridSearchCV on X_train / y_train using the notebook-level
    `xgboost_classifier` and `parameters` grid, scored with ROC AUC over
    2 folds, then prints the best score (multiplied by 100) and the best
    parameter combination.

    Returns:
        dict: the `best_params_` of the fitted search.
    """
    search = GridSearchCV(estimator=xgboost_classifier,
                          param_grid=parameters,
                          scoring='roc_auc',
                          n_jobs=2,
                          cv=2,
                          verbose=2)
    search.fit(X_train, y_train)

    best_score_percent = search.best_score_ * 100
    print(f"Grid Search best score: {best_score_percent}")
    print(f"Grid Search best parameters: {search.best_params_}")

    return search.best_params_

### Halving Grid Search

In [None]:
# Define HalvingGridSearchCV function
def halving_grid_search():
    """Tune the XGBoost estimator with a successive-halving grid search.

    Fits a HalvingGridSearchCV on X_train / y_train using the notebook-level
    `xgboost_classifier` and `parameters` grid, scored with ROC AUC over
    2 folds, then prints the best score (multiplied by 100) and the best
    parameter combination.

    Returns:
        dict: the `best_params_` of the fitted search.
    """
    # Named `search` so the local no longer shadows this function's own name
    search = HalvingGridSearchCV(estimator=xgboost_classifier,
                                 param_grid=parameters,
                                 scoring='roc_auc',
                                 n_jobs=2,
                                 cv=2,
                                 verbose=2)

    search.fit(X_train, y_train)

    # Both lines carry the same "Halving Grid Search" prefix
    print(f"Halving Grid Search best score: {search.best_score_ * 100}")
    print(f"Halving Grid Search best parameters: {search.best_params_}")

    return search.best_params_

### Bayesian Optimization with HYPEROPT

In [None]:
# Perform the Hyperparameters Tuning
# NOTE(review): this rebinds `parameters` from the search grid to the dict of best
# values returned by the search. The training cell reads the winning values from
# this name, but grid_search() / halving_grid_search() also read `parameters` as
# their param_grid, so re-running this cell without first re-running the grid
# definition hands them the best-value dict instead of the grid.
parameters = halving_grid_search()

## Training

In [None]:
# Set Experiment Name
# Name used to look up (or create) the MLflow experiment that the training run is logged to.
experiment_name = "US Stock Binary Classification - XGBoost"

In [None]:
# Look the experiment up by name; None is treated as "not created yet"
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:

    print('Creating MLFlow experiment')

    # Register the experiment, then fetch the object describing it
    _ = mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

In [None]:
# Start experiment
with mlflow.start_run(experiment_id=experiment.experiment_id):

    # Hyperparameters selected by the tuning step
    max_depth = parameters['max_depth']
    min_child_weight = parameters['min_child_weight']
    eta = parameters['eta']

    # Create and fit the model
    model = XGBClassifier(objective='binary:logistic',
                          eval_metric='logloss',
                          eta=eta,
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          use_label_encoder=False)

    # Model Training
    model.fit(X_train, y_train)

    # Model Evaluation
    # Scale to a percentage first and round afterwards: rounding the fraction
    # and then multiplying by 100 discards precision and can leave float
    # artefacts such as 56.99999999999999.
    # NOTE(review): these hold model.score() output, which for scikit-learn style
    # classifiers is mean accuracy rather than log loss. The 'logloss_*' names and
    # metric keys are kept because a later cell reads these variables — confirm
    # before renaming.
    logloss_training = round(model.score(X_train, y_train) * 100, 2)
    logloss_test = round(model.score(X_test, y_test) * 100, 2)

    # Log MLFlow
    mlflow.log_param('start_year', start_year)
    mlflow.log_param('end_year', end_year)
    mlflow.log_param('data_dimension', len(data))
    mlflow.log_param('max_depth', max_depth)
    mlflow.log_param('min_child_weight', min_child_weight)
    # eta is tuned and passed to the model like the two parameters above, so log it too
    mlflow.log_param('eta', eta)
    mlflow.log_param('n_features', n_features)
    mlflow.log_metric('logloss_training', logloss_training)
    mlflow.log_metric('logloss_test', logloss_test)

## Model Evaluation

In [None]:
# Report the training and test scores with two decimals
print(f'Model score on the training set: {logloss_training:.2f}')
print(f'Model score on the test set: {logloss_test:.2f}')

In [None]:
# Retrieve MLFlow Runs
# experiment_ids is documented as a list of experiment IDs, so wrap the single ID
mlflow.search_runs(experiment_ids=[experiment.experiment_id])

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f1efe245-29af-4be1-bb79-055f4abb0e16' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>