# Appendix B: Variable selection




In [None]:
# Packages import and installation

import pandas as pd
import numpy as np 

%pip install matplotlib
import matplotlib.pyplot as plt

%pip install seaborn
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.preprocessing import MinMaxScaler
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

import scipy.stats as stats

pd.set_option('display.max_rows', 25)

In [None]:
# Reading the cleaned and merged (Target + Non-target) dataset into a DataFrame.
# This dataset was pickled and named 'Merged numerical financial dataset' in Appendix A.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that come from a trusted source.

num_df = pd.read_pickle("Merged numerical financial dataset")

# Visualizing the DataFrame. Only the last expression of a cell is rendered,
# so the shape is printed explicitly and the preview is left as the cell output.
print(num_df.shape)
num_df.head()

In [None]:

# Removing the identifier and date columns that the filter methods do not need

filter_excluded_columns = ['Index', 'Instrument', 'AD', 'AD-30']
num_df_filter = num_df.drop(columns=filter_excluded_columns)
num_df_filter.shape

# APPLYING FILTER METHODS FOR VARIABLE SELECTION



In [None]:
# Getting the dataset ready by splitting it into train and test sets.
# The label column is removed from the feature matrix so that the variance
# filters only assess predictor variables, not the target itself.

target_column = 'Target/Non-Target'
x = num_df_filter.drop(columns=[target_column])
y = num_df_filter[target_column]

trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.2, random_state=45)

# VarianceThreshold with threshold=0 flags constant features, i.e. features
# with zero variance in the training split.

constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(trainX)

# The constant features are the columns that the fitted filter does not retain.

retained_columns = set(trainX.columns[constant_filter.get_support()])
constant_columns = [column for column in trainX.columns
                    if column not in retained_columns]

print(len(constant_columns))

In [None]:
# Counting the features retained by the constant filter (the features we want to keep):

retained_mask = constant_filter.get_support()
len(trainX.columns[retained_mask])  # All columns appear to be non constant

In [None]:
# Fitting a second variance filter, with a 0.01 threshold, on the training
# split to detect quasi-constant (very low variance) features

qconstant_filter = VarianceThreshold(threshold=0.01).fit(trainX)
qconstant_filter

In [None]:
# Listing the quasi-constant features: every training column that the
# quasi-constant filter does not retain

retained_by_qconstant = trainX.columns[qconstant_filter.get_support()]
qconstant_columns = [name for name in trainX.columns
                     if name not in retained_by_qconstant]

print(len(qconstant_columns)) # 0

**RESULTS**: No constant features nor quasi-constant features were identified in the dataset.

# VARIABLE SELECTION WITH A TRIANGULATION METHOD (PEARSON, SPEARMAN & t-test)

In [None]:
# Keeping only the numerical predictors: the identifier, date and label
# columns are removed before the correlation analysis

non_predictor_columns = ['Index', 'Target/Non-Target', 'Instrument', 'AD', 'AD-30']
num_df_noindex = num_df.drop(columns=non_predictor_columns)

In [None]:
# Using scikit-learn's MinMaxScaler to rescale every variable to the [0, 1]
# range (min-max scaling), for better visualization

scaler = MinMaxScaler()
scaled = scaler.fit_transform(num_df_noindex)

# The scaler returns a plain array, so the column labels and the original row
# index are re-attached to keep the rows traceable to `num_df_noindex`.
scaled_df = pd.DataFrame(scaled, columns=num_df_noindex.columns, index=num_df_noindex.index)

# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and the preview is left as the cell output.
print(scaled_df.shape)
scaled_df.head()

In [None]:
# Creating a scatter-matrix plot of the scaled dataset

axes = pd.plotting.scatter_matrix(scaled_df, figsize=[20, 20], alpha=0.2)

# Rotating the axis labels of every subplot so the variable names do not overlap
for subplot in axes.flatten():
    x_label = subplot.xaxis.label
    y_label = subplot.yaxis.label
    x_label.set_rotation(90)
    y_label.set_rotation(0)
    y_label.set_ha('right')

# Adding the figure-level title
plt.suptitle('Scatter Matrix representation of the Variables', y=1, fontsize=25)

# Setting the layout, then the spacing between the subplots
plt.tight_layout()
plt.gcf().subplots_adjust(wspace=0.2, hspace=0.2)

# Rendering the figure
plt.show()

In [None]:
# Displaying the pairwise Pearson correlation coefficients of all predictors

num_df_noindex.corr(method="pearson")

In [None]:
# Plotting the Pearson coefficients as a heatmap

# Pearson correlation matrix, rounded to 2 decimals so the cell annotations stay readable
matrix = num_df_noindex.corr().round(2)

# Creating the figure with its size, and setting the title and its font size
fig, ax = plt.subplots(figsize=(22, 20))
ax.set_title("Pearson Correlation Coefficient Heatmap", y=1, fontsize=25)

# Drawing the annotated heatmap on a diverging palette centred on 0
sns.heatmap(matrix, annot=True, vmax=1, vmin=-1, center=0, cmap='vlag', ax=ax)

In [None]:
# Selecting only the strong correlations of the Pearson matrix, positive or
# negative (absolute coefficient of at least 0.7)

# Flattening the matrix into a Series indexed by (variable, variable) pairs.
# Both orderings of a pair, and each variable paired with itself, are part of it.
Matrix_strong = num_df_noindex.corr().unstack()
Matrix_strong = Matrix_strong[Matrix_strong.abs() >= 0.7]

Most_correlated = Matrix_strong.sort_values(ascending=False)

# Raising the row limit so the printed Series is not truncated
pd.set_option('display.max_rows', 10000)
print(Most_correlated)

In [None]:
# Most correlated variables: absolute Pearson coefficient of at least 0.7.
# The coefficients are recomputed here at full precision. The heatmap's
# `matrix` is rounded to 2 decimals, so a value such as 0.697 would be read as
# 0.70 and pass the threshold, disagreeing with the unrounded `Matrix_strong`
# selection. Recomputing also removes the dependency on the plotting cell.

pearson_corr = num_df_noindex.corr()
threshold = 0.70

# Only the lower triangle (j < i) is scanned, so every pair is checked once and
# the later column of each strongly correlated pair is recorded.
correlated_features = {
    pearson_corr.columns[i]
    for i in range(len(pearson_corr.columns))
    for j in range(i)
    if abs(pearson_corr.iloc[i, j]) >= threshold
}
        

In [None]:
# Printing the set of column names collected by the Pearson threshold check
print(correlated_features)

**RESULTS**: Most correlated variables according to the Pearson results, listed with the number of variables each one is strongly correlated with:

9	Total Capital /
9	Total Shareholders' Equity incl Minority Intr & Hybrid Debt /
8	Revenue from Business Activities - Total.2 /
8	Revenue from Business Activities - Total.1 /
8	Enterprise Value (Daily Time Series) /
8	Cash & Cash Equivalents - Total /
8	Revenue from Business Activities - Total.3 /
8	Revenue from Business Activities - Total / 
8	Debt - Total /
4	Earnings before Interest Taxes Depreciation & Amortization /
2	Free Cash Flow /
1	Price To Sales Per Share (Daily Time Series Ratio) /
1	Operating Margin - %, TTM


In [None]:
# Calculating Spearman's rank correlation matrix of the predictors.
# The result is reused by the heatmap and by the threshold check in the cells below.

Matrix_Spearman = num_df_noindex.corr(method="spearman")



In [None]:
# Showing Spearman's coefficients on a heatmap

# Creating the figure with its size, and setting the title and its font size
fig, ax = plt.subplots(figsize=(22, 20))
ax.set_title("Spearman Correlation Coefficient Heatmap", y=1, fontsize=25)

# Drawing the annotated heatmap on a diverging palette centred on 0
sns.heatmap(Matrix_Spearman, annot=True, vmax=1, vmin=-1, center=0, cmap='vlag', ax=ax)
plt.show()




In [None]:
# Selecting only the strong Spearman correlations, negative or positive
# (absolute coefficient of at least 0.7)

# Flattening the matrix into a Series indexed by (variable, variable) pairs.
# Both orderings of a pair, and each variable paired with itself, are part of it.
Matrix_P_strong = num_df_noindex.corr(method="spearman").unstack()
Matrix_P_strong = Matrix_P_strong[Matrix_P_strong.abs() >= 0.7]

# Ordering from the strongest positive to the strongest negative coefficient
Most_correlated_P = Matrix_P_strong.sort_values(ascending=False)

print(Most_correlated_P)

In [None]:
# Repeating the threshold check with the Spearman matrix: collecting the
# variables whose absolute coefficient with an earlier column is at least 0.7

threshold = 0.70

# Only the lower triangle (col < row) is scanned, so every pair is checked once
# and the later column of each strongly correlated pair is recorded.
Spearman_correlated_features = {
    Matrix_Spearman.columns[row]
    for row in range(len(Matrix_Spearman.columns))
    for col in range(row)
    if abs(Matrix_Spearman.iloc[row, col]) >= threshold
}

In [None]:
# Printing the set of column names collected by the Spearman threshold check
print(Spearman_correlated_features)

**RESULTS**: Most correlated variables according to the Spearman results, listed with the number of variables each one is strongly correlated with:

9 Total Capital /
9 Revenue from Business Activities - Total /
9 Revenue from Business Activities - Total.3 /
9 Revenue from Business Activities - Total.1 /
9 Enterprise Value (Daily Time Series) /
8 Earnings before Interest Taxes Depreciation & Amortization /
8 Total Shareholders' Equity incl Minority Intr & Hybrid Debt /
8 Debt - Total /
6 Cash & Cash Equivalents - Total




# Calculating the t-test, to be used on the variables flagged by the Pearson and Spearman coefficients

In [None]:
# Building the dataset with the Target column included, so that a t-test can
# be applied to each column, comparing target and non-target values

t_test_excluded_columns = ['Index', 'Instrument', 'AD-30', 'AD']
num_df_target = num_df.drop(columns=t_test_excluded_columns)
num_df_target.head()

In [None]:
# Getting the dataset with only the highly correlated variables.
# NOTE(review): the columns to discard are selected by hard-coded position, so
# this depends on the column order of `num_df_target` - confirm after any
# change to the upstream dataset.

discarded_positions = [1, 2, 4, 5, 10, 11, 12, 17]
Correlated_T_test = num_df_target.drop(columns=num_df_target.columns[discarded_positions]) # With Target and no index

In [None]:
# Previewing the first rows of the subset that is passed to the t-test
Correlated_T_test.head()

In [None]:
# Running an independent two-sample t-test on every column, comparing the
# target rows (label 1) with the non-target rows (label 0), to help decide
# which variables to remove.
# NOTE(review): the label column itself is part of both samples, so a statistic
# is computed for it as well.

is_target = Correlated_T_test['Target/Non-Target'] == 1
is_non_target = Correlated_T_test['Target/Non-Target'] == 0

T_test = stats.ttest_ind(Correlated_T_test.loc[is_target], Correlated_T_test.loc[is_non_target])

In [None]:
# Collecting the calculated results in a DataFrame, so we can visualize them:

T_test_data = pd.DataFrame({
    "Variable": Correlated_T_test.columns,  # variable names
    "T_test": T_test[0],                    # t statistic of each column
    "p-value": T_test[1],                   # p-value of each column
})

# Reshaping into the actual table: one column per variable, one row per
# statistic. After transposing, the first row holds the variable names; it is
# used as the column labels and then dropped.
T_test_data = T_test_data.T
T_test_data = T_test_data.rename(columns=T_test_data.iloc[0]).iloc[1:]

# Displaying the outcome:
T_test_data

**RESULTS**: The only highly correlated variable that was kept in the original dataset, and not dropped, was ‘Debt-Total’. Its p-value of .7425 means that, if target and non-target companies did not truly differ on this variable, a difference at least as large as the one observed would still be expected about 74.25% of the time by chance alone. Since this p-value was not less than .05, we failed to reject the null hypothesis: with a high p-value and a low t-statistic, the difference was deemed not statistically significant, as the data did not allow us to reject the null hypothesis and did not provide support for the alternative hypothesis. The decision to keep the variable was then made.
The threshold for the p-value was set at .7425, so all the other variables were deemed statistically significant and dropped.


In [None]:
# Dropping the unnecessary columns from the full dataset.
# NOTE(review): the columns are selected by hard-coded position, so this
# depends on the column order of `num_df` - confirm after any upstream change.

dropped_positions = [3, 4, 7, 10, 11, 12, 13, 17, 18, 19, 22]
num_df_features = num_df.drop(columns=num_df.columns[dropped_positions]) # With Target
num_df_features.head()

In [None]:
# Checking which columns remain after the selection

num_df_features.columns

In [None]:
# Checking the DataFrame shape (rows, columns) after the selection

num_df_features.shape


In [None]:
# Exporting the final, variable-selected DataFrame with pickle, so this
# dataset can be loaded later for further processing

selected_dataset_path = "Variable selected financial dataset"
num_df_features.to_pickle(selected_dataset_path)