# Introduction

This notebook performs a first exploratory data analysis (EDA) for a binary classification problem whose target is the 'Class' column.

In [1]:
# Import Standard Modules
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split, GridSearchCV

import plotly.express as ex

from xgboost import XGBClassifier

import mlflow

# Read Data

In [2]:
# Load the 2014 financial data CSV; the first CSV column is used as the row index
data = pd.read_csv(
    './../data/2014_Financial_Data.csv',
    index_col=0,
)

In [3]:
# Print a concise summary of the frame: index range, column count, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3808 entries, PG to WTT
Columns: 224 entries, Revenue to Class
dtypes: float64(222), int64(1), object(1)
memory usage: 6.5+ MB


In [4]:
# Preview the first five rows of the raw data
data.head(n=5)

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,2015 PRICE VAR [%],Class
PG,74401000000.0,-0.0713,39030000000.0,35371000000.0,0.0,21461000000.0,21461000000.0,13910000000.0,709000000.0,14494000000.0,...,-0.0187,-0.0217,0.0359,0.0316,0.1228,0.0,-0.1746,Consumer Defensive,-9.323276,0
VIPS,3734148000.0,1.1737,2805625000.0,928522600.0,108330300.0,344141400.0,793926700.0,134595900.0,12148690.0,175382300.0,...,,,,,,1.6484,1.7313,Consumer Defensive,-25.512193,0
KR,98375000000.0,0.0182,78138000000.0,20237000000.0,0.0,15196000000.0,17512000000.0,2725000000.0,443000000.0,2270000000.0,...,0.0618,0.0981,0.1886,0.3268,0.2738,0.0,0.0234,Consumer Defensive,33.118297,1
RAD,25526410000.0,0.0053,18202680000.0,7323734000.0,0.0,6561162000.0,6586482000.0,737252000.0,424591000.0,250218000.0,...,0.0211,-0.051,-0.0189,0.1963,-0.0458,0.0,-0.006,Consumer Defensive,2.752291,1
GIS,17909600000.0,0.0076,11539800000.0,6369800000.0,0.0,3474300000.0,3412400000.0,2957400000.0,302400000.0,2707700000.0,...,0.0257,0.009,0.0215,0.0274,0.1025,0.0,-0.022,Consumer Defensive,12.897715,1


# Data Pre-processing

In [5]:
# Target: the 'Class' column
y = data['Class']

# Features: every column except the ones listed below.
# - 'Class' is the target itself.
# - '2015 PRICE VAR [%]' is directly related to the Class and is only known a posteriori.
# - 'Sector' is dropped as well (presumably because it is the non-numeric column; verify).
non_feature_columns = ['Class', 'Sector', '2015 PRICE VAR [%]']
X = data.drop(columns=non_feature_columns)

## Check Data Distribution

In [6]:
# Summary statistics per feature, one feature per row
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Revenue,3764.0,5.879050e+09,3.901741e+10,-6.276160e+08,5.788880e+07,4.349010e+08,2.393625e+09,1.824698e+12
Revenue Growth,3572.0,1.295424e+01,7.056055e+02,-1.773200e+00,-2.350000e-03,6.185000e-02,1.888750e-01,4.213866e+04
Cost of Revenue,3734.0,3.700973e+09,3.040688e+10,-5.455740e+08,3.135714e+06,1.414420e+08,1.199844e+09,1.537249e+12
Gross Profit,3756.0,2.188214e+09,1.159028e+10,-1.105000e+09,3.092900e+07,1.908760e+08,8.922534e+08,4.621600e+11
R&D Expenses,3672.0,9.401830e+07,6.408912e+08,-1.500000e+05,0.000000e+00,0.000000e+00,9.911000e+06,1.153700e+10
...,...,...,...,...,...,...,...,...
Asset Growth,3518.0,2.279999e+00,9.242892e+01,-9.796000e-01,-1.540000e-02,6.220000e-02,2.171000e-01,5.468426e+03
Book Value per Share Growth,3439.0,8.139312e-01,2.563021e+01,-2.300000e+02,-8.455000e-02,3.710000e-02,1.419000e-01,1.360125e+03
Debt Growth,3506.0,1.361780e+00,1.873430e+01,-1.051000e+00,-4.820000e-02,0.000000e+00,2.103000e-01,7.295766e+02
R&D Expense Growth,3561.0,5.965290e-01,2.589405e+01,-1.043700e+00,0.000000e+00,0.000000e+00,4.700000e-03,1.542611e+03


In [7]:
# Plot the percentage of samples per 'Class' value, requesting one bin per distinct label
class_count = len(np.unique(y.values))
figure = ex.histogram(
    data_frame=y,
    x='Class',
    title='Class Distribution',
    histnorm='percent',
    nbins=class_count,
)
figure.show()

The classes are fairly well balanced; there is no significant class skew.

In [8]:
# Count the missing values in the label
y.isna().sum()

0

## Fill NaN Values

In [9]:
# Replace each missing value with the mean of its own column.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed later in the notebook.
feature_means = X.mean()
X = X.fillna(value=feature_means)

## Feature Normalization

In [10]:
# Instantiate a MinMaxScaler with its default settings; it is fitted in the next cell
min_max_scaler = MinMaxScaler()

In [11]:
# Fit the scaler on the raw feature values and transform them in one step
scaled_values = min_max_scaler.fit_transform(X.values)

# Wrap the resulting array in a DataFrame that keeps the original row and column labels
X_scaled = pd.DataFrame(data=scaled_values, index=X.index, columns=X.columns)

# Feature Selection

## Univariate Selection

In [12]:
# Number of features to keep; passed as `k` to SelectKBest in the next cell
n_features = 10

In [13]:
# Build a univariate selector that keeps the `n_features` best features by chi2 score
feature_selector = SelectKBest(score_func=chi2, k=n_features)

# Fit the selector on the scaled features and the label
feature_selector.fit(X_scaled, y)

In [14]:
# Keep only the columns chosen by the fitted selector
selected_values = feature_selector.transform(X_scaled)
selected_columns = feature_selector.get_feature_names_out()

# Rebuild a DataFrame with the original row index and the selected feature names
X_feature_selected = pd.DataFrame(
    selected_values,
    index=X_scaled.index,
    columns=selected_columns,
)

In [15]:
# Display the selected-feature frame (the rendered table is truncated by pandas)
X_feature_selected

Unnamed: 0,Short-term investments,Cash and short-term investments,"Property, Plant & Equipment Net",Deferred revenue,cashPerShare,Cash per Share,Tangible Book Value per Share,Interest Debt per Share,Market Cap,Intangibles to Total Assets
PG,0.046213,0.124678,2.442893e-02,0.000000,6.556929e-08,6.556929e-08,2.014900e-08,9.968516e-08,2.210802e-03,0.631874
VIPS,0.013212,0.016123,3.379979e-04,0.001249,1.086036e-06,1.086036e-06,4.102932e-09,2.179910e-07,6.500749e-05,0.075156
KR,0.000000,0.004679,1.850241e-02,0.000000,8.087833e-09,8.087833e-09,2.348602e-08,9.119011e-08,1.899815e-04,0.104486
RAD,0.000000,0.001708,2.143806e-03,0.000000,6.280977e-08,6.280977e-08,1.230587e-07,5.314085e-07,6.611504e-05,0.066961
GIS,0.000000,0.010119,4.317449e-03,0.000000,2.926216e-08,2.926216e-08,1.408668e-08,1.046081e-07,3.464951e-04,0.636619
...,...,...,...,...,...,...,...,...,...,...
TSRI,0.000033,0.000051,3.734327e-08,0.000059,3.003134e-08,3.003134e-08,6.310658e-09,4.639872e-08,6.261987e-08,0.000000
TZOO,0.000000,0.000681,1.040288e-05,0.000059,8.189216e-08,8.189216e-08,5.768336e-09,4.667434e-08,1.932576e-06,0.000000
USATP,0.005674,0.009913,2.300069e-05,0.000040,8.644791e-04,8.644791e-04,3.598638e-04,8.512586e-04,3.642339e-04,0.153366
WSTG,0.000000,0.000270,4.512517e-07,0.000000,1.028592e-07,1.028592e-07,1.860512e-08,4.639872e-08,8.769462e-07,0.000000


# Exploratory Data Analysis

## Market Cap

In [16]:
# Plot the distribution of the (scaled) Market Cap feature
# TODO: revisit the bin count. It currently requests one bin per distinct value,
#       which may be too fine-grained for a continuous feature.
market_cap_bins = X_feature_selected['Market Cap'].nunique()
figure = ex.histogram(
    data_frame=X_feature_selected,
    x='Market Cap',
    title='Market Cap Distribution',
    nbins=market_cap_bins,
)
figure.show()

# Split Data into Training and Test Sets

In [17]:
# Split the selected features and the label into training and test sets.
# `test_size` is the fraction of rows held out for testing.
test_size = 0.33

# Fixed seed so the split (and therefore the grid-search result and the
# reported scores) is identical on every Restart & Run All.
random_state = 42

X_train, X_test, y_train, y_test = train_test_split(
    X_feature_selected,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Model Definition

## Hyperparameters Tuning

In [18]:
# Initial hyperparameter search space for the grid search:
# tree depth 1..19 and minimum child weight 1..9 (range upper bounds are exclusive)
parameters = dict(
    max_depth=range(1, 20),
    min_child_weight=range(1, 10),
)

In [19]:
# Estimator that the grid search clones for every candidate
base_estimator = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Exhaustive search over `parameters`, ranked by accuracy with 2-fold cross-validation.
# With 19 x 9 = 171 candidates and 2 folds this is 342 fits, run sequentially (n_jobs=1).
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=parameters,
    scoring='accuracy',
    n_jobs=1,
    cv=2,
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
          

In [20]:
# Best hyperparameter combination found by the grid search
grid_search.best_params_

{'max_depth': 4, 'min_child_weight': 1}

In [21]:
# Best cross-validated score of the grid search (scoring='accuracy', cv=2)
grid_search.best_score_

0.5919202163624071

## Training

In [23]:
# Train the final model with the tuned hyperparameters and track the run in MLflow
with mlflow.start_run():

    # Hyperparameters selected by the grid search
    max_depth = grid_search.best_params_['max_depth']
    min_child_weight = grid_search.best_params_['min_child_weight']

    # Create the model
    model = XGBClassifier(eval_metric='logloss',
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          use_label_encoder=False)

    # Model Training
    model.fit(X_train, y_train)

    # Model Evaluation
    # `score` on a classifier returns the mean accuracy, not the log loss,
    # so the values are named and logged as accuracies.
    accuracy_training = model.score(X_train, y_train)
    accuracy_test = model.score(X_test, y_test)

    # Backward-compatible aliases: the evaluation cell that follows still reads
    # these names. They hold accuracies, exactly as before.
    logloss_training = accuracy_training
    logloss_test = accuracy_test

    # Log hyperparameters and metrics to MLflow
    mlflow.log_params({
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
    })
    mlflow.log_metrics({
        'accuracy_training': accuracy_training,
        'accuracy_test': accuracy_test,
    })

## Model Evaluation

In [24]:
# Model Score
print('Model score on the training set: {:.2f}'.format(logloss_training))
print('Model score on the test set: {:.2f}'.format(logloss_test))

Model score on the training set: 0.90
Model score on the test set: 0.63


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f1efe245-29af-4be1-bb79-055f4abb0e16' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>