# Introduction

This notebook performs a first exploratory data analysis and trains a baseline model for a binary classification problem whose target is the 'Class' column.

In [None]:
# Import Standard Modules
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split, GridSearchCV

import plotly.express as ex

from xgboost import XGBClassifier

import mlflow

# Read Data

In [None]:
# First and last year (both inclusive) of the yearly data files to load
start_year = 2014
end_year = 2016

# Every year in [start_year, end_year]; the stop value is exclusive, hence the + 1
range_year = np.arange(start_year, end_year + 1, dtype=np.int32)

In [None]:
# Read one CSV file per year, using the first column as the index.
# The frames are collected first and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all previously loaded rows on
# every iteration.
yearly_frames = [
    pd.read_csv(f'./../data/{year}_Financial_Data.csv',
                sep=',',
                encoding='latin1',
                index_col=0)
    for year in range_year
]

# pd.concat rejects an empty list, so fall back to an empty DataFrame when no
# year is configured (this matches the result of the previous loop-based code)
data = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [None]:
# Summarize the loaded data: index range, number of columns, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12725 entries, PG to WTT
Columns: 226 entries, Revenue to 2017 PRICE VAR [%]
dtypes: float64(224), int64(1), object(1)
memory usage: 22.0+ MB


In [None]:
# Preview the first five rows of the concatenated data
data.head(5)

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,2015 PRICE VAR [%],Class,2016 PRICE VAR [%],2017 PRICE VAR [%]
PG,74401000000.0,-0.0713,39030000000.0,35371000000.0,0.0,21461000000.0,21461000000.0,13910000000.0,709000000.0,14494000000.0,...,0.0359,0.0316,0.1228,0.0,-0.1746,Consumer Defensive,-9.323276,0,,
VIPS,3734148000.0,1.1737,2805625000.0,928522600.0,108330300.0,344141400.0,793926700.0,134595900.0,12148690.0,175382300.0,...,,,,1.6484,1.7313,Consumer Defensive,-25.512193,0,,
KR,98375000000.0,0.0182,78138000000.0,20237000000.0,0.0,15196000000.0,17512000000.0,2725000000.0,443000000.0,2270000000.0,...,0.1886,0.3268,0.2738,0.0,0.0234,Consumer Defensive,33.118297,1,,
RAD,25526410000.0,0.0053,18202680000.0,7323734000.0,0.0,6561162000.0,6586482000.0,737252000.0,424591000.0,250218000.0,...,-0.0189,0.1963,-0.0458,0.0,-0.006,Consumer Defensive,2.752291,1,,
GIS,17909600000.0,0.0076,11539800000.0,6369800000.0,0.0,3474300000.0,3412400000.0,2957400000.0,302400000.0,2707700000.0,...,0.0215,0.0274,0.1025,0.0,-0.022,Consumer Defensive,12.897715,1,,


# Data Pre-processing

In [None]:
# Define label
y = data['Class']

# Define features
# NOTE: Every '<year> PRICE VAR [%]' column is directly related to the Class
# and is only known a posteriori. Because the yearly files are concatenated,
# `data` holds one such column per loaded year, so all of them are dropped
# (dropping only the 2015 one would leave the others in X as leaked targets).
price_var_columns = [column for column in data.columns if 'PRICE VAR' in str(column)]
X = data.drop(columns=['Class', 'Sector', *price_var_columns])

## Check Data Distribution

In [None]:
# Check X feature distribution: summary statistics (count, mean, std, min,
# quartiles, max), transposed so that each feature is shown as one row
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Revenue,12125.0,5.204594e+09,3.535010e+10,-6.276160e+08,5.581000e+07,4.150780e+08,2.226271e+09,1.886894e+12
Revenue Growth,11500.0,4.667105e+00,3.945140e+02,-1.276930e+01,-3.430000e-02,4.350000e-02,1.691000e-01,4.213866e+04
Cost of Revenue,11818.0,3.307229e+09,2.858291e+10,-2.665346e+09,2.865317e+06,1.422087e+08,1.155679e+09,1.581527e+12
Gross Profit,12116.0,1.957375e+09,9.354688e+09,-1.280800e+10,2.836850e+07,1.823530e+08,8.430000e+08,4.621600e+11
R&D Expenses,11611.0,9.706296e+07,6.864618e+08,-8.610000e+07,0.000000e+00,0.000000e+00,1.111300e+07,1.608500e+10
...,...,...,...,...,...,...,...,...
Debt Growth,11228.0,2.288833e+00,6.274580e+01,-1.051000e+00,-6.440000e-02,0.000000e+00,1.637000e-01,5.443000e+03
R&D Expense Growth,11355.0,3.377717e-01,1.501652e+01,-2.662200e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.542611e+03
SG&A Expenses Growth,11379.0,3.133546e-01,4.694965e+00,-2.099000e+00,-2.220000e-02,5.650000e-02,1.947000e-01,3.249268e+02
2016 PRICE VAR [%],4120.0,1.030776e+02,3.756531e+03,-9.994783e+01,-5.172844e+00,1.727893e+01,4.056798e+01,2.127000e+05


In [None]:
# Plot the percentage of samples per 'Class' value, using one bin per distinct label
n_classes = len(np.unique(y.values))

figure = ex.histogram(
    data_frame=y,
    x='Class',
    histnorm='percent',
    nbins=n_classes,
    title='Class Distribution',
)
figure.show()

The classes are fairly well balanced; there is no significant skew.

In [None]:
# Count the missing labels in y
y.isna().sum()

0

## Fill NaN Values

In [None]:
# Impute missing values column by column with the mean of each feature
feature_means = X.mean()
X = X.fillna(value=feature_means)

## Feature Normalization

In [None]:
# Instantiate MinMaxScaler with its default settings
min_max_scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the raw feature values and rescale them
# NOTE(review): the scaler is fitted on X as a whole - confirm whether it
# should be fitted on the training portion only
scaled_values = min_max_scaler.fit_transform(X.values)

# Wrap the result so the original row and column labels are preserved
X_scaled = pd.DataFrame(data=scaled_values, columns=X.columns, index=X.index)

# Feature Selection

## Univariate Selection

In [None]:
# Number of features to keep after feature selection
n_features = 50

In [None]:
# Build the univariate selector: keep the n_features columns with the highest chi-squared score
feature_selector = SelectKBest(k=n_features, score_func=chi2)

# Fit the feature selector against the label
feature_selector = feature_selector.fit(X_scaled, y)

In [None]:
# Keep only the columns retained by the fitted selector
selected_values = feature_selector.transform(X_scaled)
selected_columns = feature_selector.get_feature_names_out()

# Rebuild a labelled DataFrame: same rows as X_scaled, selected columns only
X_feature_selected = pd.DataFrame(data=selected_values,
                                  columns=selected_columns,
                                  index=X_scaled.index)

In [None]:
# Display the selected features
X_feature_selected

Unnamed: 0,Revenue,R&D Expenses,SG&A Expense,Dividend per Share,Cash and cash equivalents,Short-term investments,Cash and short-term investments,Total current assets,Goodwill and Intangible Assets,Long-term investments,...,R&D to Revenue,Intangibles to Total Assets,Stock-based compensation to Revenue,Tangible Asset Value,Average Receivables,Average Inventory,Inventory Turnover,Weighted Average Shares Diluted Growth,Asset Growth,Debt Growth
PG,0.039750,0.005324,0.115827,0.000242,0.008896,0.002501,0.010816,0.025350,0.218932,0.000005,...,0.000096,0.588767,0.003428,0.033042,0.026919,0.145384,0.000497,1.045833e-04,0.000189,0.000216
VIPS,0.002311,0.012023,0.002133,0.000000,0.000804,0.000715,0.001399,0.001711,0.000494,0.000068,...,0.000100,0.070029,0.003431,0.010535,0.003678,0.007476,0.001232,3.017763e-07,0.000376,0.000613
KR,0.052451,0.005324,0.082096,0.000030,0.000417,0.000000,0.000406,0.007080,0.007346,0.000005,...,0.000096,0.097358,0.003426,0.019943,0.004524,0.114846,0.000833,2.866209e-07,0.000217,0.000243
RAD,0.013856,0.005324,0.035606,0.000000,0.000152,0.000000,0.000148,0.003436,0.001117,0.000005,...,0.000096,0.062393,0.003425,0.012097,0.003922,0.065403,0.000379,1.045833e-04,0.000179,0.000185
GIS,0.009821,0.005324,0.018986,0.000153,0.000902,0.000000,0.000878,0.003523,0.035385,0.000005,...,0.000096,0.593188,0.003426,0.013265,0.006117,0.033026,0.000527,1.045833e-04,0.000187,0.000212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSRI,0.000365,0.005324,0.000329,0.000000,0.000005,0.000002,0.000006,0.000011,0.000000,0.000005,...,0.000096,0.000000,0.003425,0.009538,0.000034,0.000000,0.000000,2.960042e-07,0.000183,0.000193
TZOO,0.000393,0.005887,0.000717,0.000000,0.000029,0.000000,0.000028,0.000036,0.000000,0.000005,...,0.000107,0.000000,0.003430,0.009554,0.000064,0.000000,0.000000,2.814408e-07,0.000143,0.000009
USATP,0.000373,0.011327,0.000399,0.000156,0.000020,0.001690,0.003079,0.000042,0.000034,0.002733,...,0.000688,0.149954,0.004048,0.015953,0.003678,0.007476,0.001232,1.045833e-04,0.000376,0.000613
WSTG,0.000420,0.005324,0.000381,0.000067,0.000014,0.000000,0.000014,0.000081,0.000000,0.000005,...,0.000096,0.000000,0.003431,0.009577,0.000336,0.000046,0.003516,2.876273e-07,0.000221,0.000193


# Exploratory Data Analysis

## Market Cap

In [None]:
# Plot the distribution of the market Cap
# TODO
#figure = ex.histogram(data_frame=X_feature_selected, 
#                        x='Market Cap', 
#                        title='Market Cap Distribution', 
#                        nbins=X_feature_selected['Market Cap'].nunique())
#figure.show()

# Split Data into Training and Test Sets

In [None]:
# Split data
# A fixed seed makes the split (and every score computed from it) reproducible
# across kernel restarts; stratifying on the label keeps the class proportions
# the same in the training and test sets.
test_size = 0.33
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X_feature_selected,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=random_state,
                                                    stratify=y)

# Model Definition

## Hyperparameters Tuning

In [None]:
# Initial hyperparameter search space: max_depth in 1..9 and min_child_weight in 1..19
parameters = dict(
    max_depth=range(1, 10),
    min_child_weight=range(1, 20),
)

In [None]:
# Base estimator whose hyperparameters are tuned
base_estimator = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Exhaustive grid search over `parameters`, scored by accuracy with 2-fold
# cross-validation and using all available cores
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=parameters,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=3,
)

grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 171 candidates, totalling 342 fits


GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     m...nehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=na

In [None]:
# Best hyperparameter combination found by the grid search
grid_search.best_params_

{'max_depth': 2, 'min_child_weight': 18}

In [None]:
# Best score: cross-validated accuracy obtained with the best hyperparameter combination
grid_search.best_score_

0.6459823172622501

## Training

In [None]:
# Name of the MLFlow experiment under which the training runs are recorded
experiment_name = "US Stock Binary Classification - XGBoost"

In [None]:
# Look the experiment up by name; the lookup yields None when it does not exist yet
experiment = mlflow.get_experiment_by_name(experiment_name)

experiment_missing = experiment is None
if experiment_missing:

    print('Creating MLFlow experiment')

    # Register the experiment, then fetch it again by name to get the full
    # experiment object (the id returned by the creation call is not needed)
    mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

In [None]:
# Train the final model inside an MLFlow run so that its parameters and scores are tracked
with mlflow.start_run(experiment_id=experiment.experiment_id):

    # Hyperparameters selected by the grid search
    max_depth = grid_search.best_params_['max_depth']
    min_child_weight = grid_search.best_params_['min_child_weight']

    # Create the model
    model = XGBClassifier(eval_metric='logloss',
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          use_label_encoder=False)

    # Model Training
    model.fit(X_train, y_train)

    # Model Evaluation
    # NOTE: `model.score` returns the mean accuracy, not the log loss. The
    # 'logloss_*' names are kept because the evaluation cell and the runs
    # already logged in MLFlow refer to them.
    # The score is scaled to a percentage *before* rounding: rounding first and
    # multiplying afterwards yields float artefacts such as 56.99999999999999.
    logloss_training = float(round(model.score(X_train, y_train) * 100))
    logloss_test = float(round(model.score(X_test, y_test) * 100))

    # Log the run configuration to MLFlow
    run_parameters = {
        'start_year': start_year,
        'end_year': end_year,
        'data_dimension': len(data),
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'n_features': n_features,
    }
    for parameter_name, parameter_value in run_parameters.items():
        mlflow.log_param(parameter_name, parameter_value)

    # Log the scores (accuracy in percent, see the note above) to MLFlow
    mlflow.log_metric('logloss_training', logloss_training)
    mlflow.log_metric('logloss_test', logloss_test)

## Model Evaluation

In [None]:
# Report the training and test scores with two decimals
print(f'Model score on the training set: {logloss_training:.2f}')
print(f'Model score on the test set: {logloss_test:.2f}')

Model score on the training set: 71.00
Model score on the test set: 65.00


In [None]:
# Retrieve the MLFlow runs recorded for this experiment as a DataFrame
# `experiment_ids` is documented as a list of experiment ids, so the single id is wrapped in a list
mlflow.search_runs(experiment_ids=[experiment.experiment_id])

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.logloss_training,metrics.logloss_test,params.min_child_weight,params.max_depth,params.start_year,params.data_dimension,params.end_year,params.n_features,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.user
0,4e52328c4cda40279f219ef99975478a,1,FINISHED,file:///work/us_stocks/notebooks/mlruns/1/4e52...,2022-04-20 13:01:32.722000+00:00,2022-04-20 13:01:35.726000+00:00,71.0,65.0,18,2,2014,12725,2016,50,/shared-libs/python3.7/py-core/lib/python3.7/s...,LOCAL,root
1,88b51df6f0854f30adaedd2bd7ad3a12,1,FINISHED,file:///work/us_stocks/notebooks/mlruns/1/88b5...,2022-04-20 12:41:41.125000+00:00,2022-04-20 12:41:42.922000+00:00,66.0,64.0,1,1,2014,12725,2016,50,/shared-libs/python3.7/py-core/lib/python3.7/s...,LOCAL,root


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f1efe245-29af-4be1-bb79-055f4abb0e16' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>