In [2]:
import numpy as np
import pandas as pd

## Checking the number of features in the FS data sets

In [3]:
# Read the three complete feature tables (MD&A, ECT and baseline) from CSV
mda, ect, ff = (
    pd.read_csv(csv_path)
    for csv_path in ('mda_model_data.csv', 'ect_model_data.csv', 'baseline_data.csv')
)

In [4]:
# Read the PSO and SAPSO feature-selection data sets, one per model (RLR, KNN, LGBM)
pso_rlr, pso_knn, pso_lgbm = map(
    pd.read_csv,
    ('rlr_pso_data.csv', 'knn_pso_data.csv', 'lgbm_pso_data.csv'),
)
sapso_rlr, sapso_knn, sapso_lgbm = map(
    pd.read_csv,
    ('rlr_sapso_data.csv', 'knn_sapso_data.csv', 'lgbm_sapso_data.csv'),
)

### Extracting column names

In [5]:
# Extracting the baseline columns (including CIK, and Target etc.).
# The MD&A/ECT filtering further down compares against this list, so at this
# point it deliberately still contains the non-feature columns of `ff`.
ff_cols = ff.columns

In [6]:
# Extracting the columns that are unique to the MD&A and ECT data sets,
# i.e. every column that is not already part of the baseline data.
# The non-feature columns are excluded explicitly as well: `ff_cols` is
# re-assigned further down WITHOUT them, so relying on `ff_cols` alone would let
# 'Target', 'cik', ... slip into these lists whenever this cell is re-run later.
non_feature_cols = {'Target', 'fraud', 'res', 'cik'}
mda_cols = [col for col in mda.columns
            if col not in ff_cols and col not in non_feature_cols]
ect_cols = [col for col in ect.columns
            if col not in ff_cols and col not in non_feature_cols]

In [7]:
# Removing the non-feature columns from the baseline data and refreshing the
# baseline column list so that it only holds features.
# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise a KeyError.
ff = ff.drop(columns=['Target', 'fraud', 'res', 'cik'], errors='ignore')
ff_cols = ff.columns

In [8]:
# The FS features: remove the 'Target' column from every feature-selection
# data set and keep the remaining column names.
# errors='ignore' keeps the cell re-runnable: once 'Target' has been dropped,
# executing the cell again would otherwise raise a KeyError.
pso_rlr = pso_rlr.drop(columns='Target', errors='ignore')
pso_rlr_cols = pso_rlr.columns
pso_knn = pso_knn.drop(columns='Target', errors='ignore')
pso_knn_cols = pso_knn.columns
pso_lgbm = pso_lgbm.drop(columns='Target', errors='ignore')
pso_lgbm_cols = pso_lgbm.columns
sapso_rlr = sapso_rlr.drop(columns='Target', errors='ignore')
sapso_rlr_cols = sapso_rlr.columns
sapso_knn = sapso_knn.drop(columns='Target', errors='ignore')
sapso_knn_cols = sapso_knn.columns
sapso_lgbm = sapso_lgbm.drop(columns='Target', errors='ignore')
sapso_lgbm_cols = sapso_lgbm.columns

In [9]:
# Map every FS method label to the column index of its data set
# (insertion order determines the order of the reports below)
fs_cols = dict(zip(
    ['PSO-RLR', 'PSO-KNN', 'PSO-LGBM', 'SAPSO-RLR', 'SAPSO-KNN', 'SAPSO-LGBM'],
    [pso_rlr_cols, pso_knn_cols, pso_lgbm_cols,
     sapso_rlr_cols, sapso_knn_cols, sapso_lgbm_cols],
))

### Checking the distribution of FS methods

In [10]:
# For every FS method: which share of the baseline / MD&A / ECT columns is
# present in its data set (printed as a fraction rounded to two decimals)
for name, cols in fs_cols.items():
    fs_ff_cols = set(ff_cols) & set(cols)
    fs_mda_cols = set(mda_cols) & set(cols)
    fs_ect_cols = set(ect_cols) & set(cols)

    ff_share = round(len(fs_ff_cols) / len(ff_cols), 2)
    mda_share = round(len(fs_mda_cols) / len(mda_cols), 2)
    ect_share = round(len(fs_ect_cols) / len(ect_cols), 2)

    print('The percentage of baseline features included in', name, ':', ff_share)
    print('The percentage of MD&A features included in', name, ':', mda_share)
    print('The percentage of ECT features included in', name, ':', ect_share)

The percentage of baseline features included in PSO-RLR : 0.43
The percentage of MD&A features included in PSO-RLR : 0.39
The percentage of ECT features included in PSO-RLR : 0.4
The percentage of baseline features included in PSO-KNN : 0.4
The percentage of MD&A features included in PSO-KNN : 0.3
The percentage of ECT features included in PSO-KNN : 0.37
The percentage of baseline features included in PSO-LGBM : 0.44
The percentage of MD&A features included in PSO-LGBM : 0.41
The percentage of ECT features included in PSO-LGBM : 0.43
The percentage of baseline features included in SAPSO-RLR : 0.45
The percentage of MD&A features included in SAPSO-RLR : 0.46
The percentage of ECT features included in SAPSO-RLR : 0.4
The percentage of baseline features included in SAPSO-KNN : 0.4
The percentage of MD&A features included in SAPSO-KNN : 0.43
The percentage of ECT features included in SAPSO-KNN : 0.43
The percentage of baseline features included in SAPSO-LGBM : 0.38
The percentage of MD&A f

### Checking whether cosine similarity is included

In [11]:
# Report per FS method whether a column named exactly 'Cos_Sim' is present
for name, cols in fs_cols.items():
    message = 'Cos_Sim is included in:' if 'Cos_Sim' in cols else 'Cos_Sim is not included in:'
    print(message, name)

Cos_Sim is not included in: PSO-RLR
Cos_Sim is not included in: PSO-KNN
Cos_Sim is not included in: PSO-LGBM
Cos_Sim is not included in: SAPSO-RLR
Cos_Sim is not included in: SAPSO-KNN
Cos_Sim is not included in: SAPSO-LGBM


### Checking the distribution of ECT quarters

In [12]:
# Quarter tags that are looked up inside the ECT column names
Qs = list(('Q1', 'Q2', 'Q3', 'Q4'))

In [13]:
# For every FS method: how the ECT columns present in its data set are
# distributed over the four quarters (fraction rounded to two decimals).
for name, cols in fs_cols.items():
    fs_ect_cols = set(ect_cols).intersection(cols)
    # Guard: a method without any ECT column would otherwise make the share
    # computation below fail with a ZeroDivisionError.
    if not fs_ect_cols:
        print('No ECT features included in', name)
        continue
    for i in Qs:
        # NOTE(review): plain substring test on the quarter tag; later cells
        # match '_Q1' etc. instead - confirm both select the same columns.
        fs_Q_cols = [col for col in fs_ect_cols if i in col]
        print('The percentage of ECT features included in',name,'that orginate from',i,':',round(len(fs_Q_cols)/len(fs_ect_cols),2))

The percentage of ECT features included in PSO-RLR that orginate from Q1 : 0.25
The percentage of ECT features included in PSO-RLR that orginate from Q2 : 0.23
The percentage of ECT features included in PSO-RLR that orginate from Q3 : 0.28
The percentage of ECT features included in PSO-RLR that orginate from Q4 : 0.23
The percentage of ECT features included in PSO-KNN that orginate from Q1 : 0.25
The percentage of ECT features included in PSO-KNN that orginate from Q2 : 0.3
The percentage of ECT features included in PSO-KNN that orginate from Q3 : 0.22
The percentage of ECT features included in PSO-KNN that orginate from Q4 : 0.2
The percentage of ECT features included in PSO-LGBM that orginate from Q1 : 0.24
The percentage of ECT features included in PSO-LGBM that orginate from Q2 : 0.21
The percentage of ECT features included in PSO-LGBM that orginate from Q3 : 0.23
The percentage of ECT features included in PSO-LGBM that orginate from Q4 : 0.3
The percentage of ECT features included

## Variable Importance

In [18]:
# Read the SAPSO coefficient table and the SAPSO feature-importance table
sapso_coef, sapso_fi = map(
    pd.read_csv,
    ('sapso_coef_data.csv', 'sapso_fi_data.csv'),
)

In [19]:
# Give the coefficient columns SAPSO-specific names
sapso_coef = sapso_coef.rename(
    columns={
        'coef': 'coef_sapso',
        'abs_coef': 'abs_coef_sapso',
    }
)

# Ten highest-ranked rows per measure: sort descending, keep the first ten
sapso_coef_top_10 = sapso_coef.sort_values(by='abs_coef_sapso', ascending=False).iloc[:10]
gain_sapso_top_10 = sapso_fi.sort_values(by='fi_sapso_gain', ascending=False).iloc[:10]
split_sapso_top_10 = sapso_fi.sort_values(by='fi_sapso_split', ascending=False).iloc[:10]

In [20]:
# Display the ten rows of `sapso_coef` with the largest 'abs_coef_sapso'
sapso_coef_top_10

Unnamed: 0,feature,coef_sapso,abs_coef_sapso
4,Zscore,-0.088686,0.088686
3,AFDAtNS,0.079095,0.079095
0,ARtS,0.07582,0.07582
30,Assets,-0.070891,0.070891
9,PPEtTA,-0.062194,0.062194
45,Length_MDA,-0.056464,0.056464
53,Pol_Q2,-0.053135,0.053135
16,Sales,-0.051309,0.051309
28,PCiAPtI,-0.050218,0.050218
37,PCiE,0.046197,0.046197


In [21]:
# Display the ten rows of `sapso_fi` with the largest 'fi_sapso_split'
split_sapso_top_10

Unnamed: 0,feature,fi_sapso_split,fi_sapso_gain
12,UEP,240,6059.74023
44,Length_Q2,209,2995.517839
43,Length_MDA,205,1370.453212
5,DCS,200,5148.545844
7,cROE,187,2375.734635
18,DTaxE,180,2267.344279
4,DtE,168,1227.18681
14,IoAR,158,7609.31803
45,Length_Q4,155,1256.851979
41,FOG_Q1,148,626.65937


In [22]:
# Display the ten rows of `sapso_fi` with the largest 'fi_sapso_gain'
gain_sapso_top_10

Unnamed: 0,feature,fi_sapso_split,fi_sapso_gain
13,IStMV,130,14841.227073
34,Assets,91,8094.565235
14,IoAR,158,7609.31803
12,UEP,240,6059.74023
5,DCS,200,5148.545844
26,ROA,97,3974.833496
15,IoGMP,137,3846.281862
44,Length_Q2,209,2995.517839
7,cROE,187,2375.734635
18,DTaxE,180,2267.344279


In [23]:
# Split the coefficient table by feature origin, judged by the feature name:
# '_MDA' -> MD&A rows, '_Q1'..'_Q4' -> ECT rows, every remaining row -> other
rlr_feature_names = sapso_coef['feature']
rlr_mda_rows = sapso_coef[rlr_feature_names.str.contains('_MDA')]
rlr_ect_rows = sapso_coef[rlr_feature_names.str.contains('_Q1|_Q2|_Q3|_Q4')]
rlr_other_rows = sapso_coef[
    ~(sapso_coef.index.isin(rlr_ect_rows.index) | sapso_coef.index.isin(rlr_mda_rows.index))
]

# Same split for the LGBM feature-importance table
lgbm_feature_names = sapso_fi['feature']
lgbm_mda_rows = sapso_fi[lgbm_feature_names.str.contains('_MDA')]
lgbm_ect_rows = sapso_fi[lgbm_feature_names.str.contains('_Q1|_Q2|_Q3|_Q4')]
lgbm_other_rows = sapso_fi[
    ~(sapso_fi.index.isin(lgbm_ect_rows.index) | sapso_fi.index.isin(lgbm_mda_rows.index))
]

## RLR Coefficients

In [39]:
# Sum of the absolute coefficients per feature origin (MD&A, ECT, other/FF)
# and over all features
mda_sum_coef = rlr_mda_rows['abs_coef_sapso'].sum()
ect_sum_coef = rlr_ect_rows['abs_coef_sapso'].sum()
other_sum_coef = rlr_other_rows['abs_coef_sapso'].sum()
total_sum_coef = sapso_coef['abs_coef_sapso'].sum()

# Print each sum together with its share of the total.
# The share is scaled to percent BEFORE rounding: multiplying an already
# rounded ratio by 100 re-introduces float noise (e.g. 11.559999999999999),
# and rounding the operands first only loses precision.
for label, group_sum in (
    ("Sum of coef MD&A features:", mda_sum_coef),
    ("Sum of coef ECT features:", ect_sum_coef),
    ("Sum of coef FF features:", other_sum_coef),
):
    print(label, round(group_sum, 2),
          "percentage of total:", round(group_sum / total_sum_coef * 100, 2))
print("Total sum of coef:", round(total_sum_coef,2))

Sum of coef MD&A features: 0.16 percentage of total: 7.53
Sum of coef ECT features: 0.39 percentage of total: 18.52
Sum of coef FF features: 1.55 percentage of total: 73.95
Total sum of coef: 2.1


In [48]:
# Readability - Length & FOG
rlr_read_rows = sapso_coef[sapso_coef['feature'].str.contains('Length_|FOG_')]
# Sentiment - Pos, Neg, Pol & Sub
rlr_sen_rows = sapso_coef[sapso_coef['feature'].str.contains('Pos_|Neg_|Pol_|Sub_')]
# Topics - Topic
rlr_topic_rows = sapso_coef[sapso_coef['feature'].str.contains('Topic_')]

read_sum_coef = rlr_read_rows['abs_coef_sapso'].sum()
sen_sum_coef = rlr_sen_rows['abs_coef_sapso'].sum()
topic_sum_coef = rlr_topic_rows['abs_coef_sapso'].sum()

# Print each sum together with its share of `total_sum_coef` (computed in the
# RLR origin cell). The share is scaled to percent BEFORE rounding, otherwise
# the multiplication re-introduces float noise such as 11.559999999999999.
for label, group_sum in (
    ("Sum of coef Readability features:", read_sum_coef),
    ("Sum of coef Sentiment features:", sen_sum_coef),
    ("Sum of coef Topic features:", topic_sum_coef),
):
    print(label, round(group_sum, 2),
          "percentage of total:", round(group_sum / total_sum_coef * 100, 2))

Sum of coef Readability features: 0.11 percentage of total: 5.37
Sum of coef Sentiment features: 0.19 percentage of total: 9.13
Sum of coef Topic features: 0.24 percentage of total: 11.559999999999999


In [54]:
# All quarters: rows whose feature name carries the respective quarter suffix
rlr_Q1_rows = sapso_coef[sapso_coef['feature'].str.contains('_Q1')]
rlr_Q2_rows = sapso_coef[sapso_coef['feature'].str.contains('_Q2')]
rlr_Q3_rows = sapso_coef[sapso_coef['feature'].str.contains('_Q3')]
rlr_Q4_rows = sapso_coef[sapso_coef['feature'].str.contains('_Q4')]

Q1_sum_coef = rlr_Q1_rows['abs_coef_sapso'].sum()
Q2_sum_coef = rlr_Q2_rows['abs_coef_sapso'].sum()
Q3_sum_coef = rlr_Q3_rows['abs_coef_sapso'].sum()
Q4_sum_coef = rlr_Q4_rows['abs_coef_sapso'].sum()

# Print each sum together with its share of `total_sum_coef` (computed in the
# RLR origin cell). The share is scaled to percent BEFORE rounding so that no
# float noise (e.g. 5.390000000000001) shows up in the output.
for label, quarter_sum in (
    ("Sum of coef Q1 features:", Q1_sum_coef),
    ("Sum of coef Q2 features:", Q2_sum_coef),
    ("Sum of coef Q3 features:", Q3_sum_coef),
    ("Sum of coef Q4 features:", Q4_sum_coef),
):
    print(label, round(quarter_sum, 2),
          "percentage of total:", round(quarter_sum / total_sum_coef * 100, 2))

Sum of coef Q1 features: 0.09 percentage of total: 4.21
Sum of coef Q2 features: 0.17 percentage of total: 7.9
Sum of coef Q3 features: 0.04 percentage of total: 1.68
Sum of coef Q4 features: 0.1 percentage of total: 4.72


## LGBM Split

In [30]:
# Sum of the split importance per feature origin (MD&A, ECT, other/FF)
# and over all features
mda_sum_split = lgbm_mda_rows['fi_sapso_split'].sum()
ect_sum_split = lgbm_ect_rows['fi_sapso_split'].sum()
other_sum_split = lgbm_other_rows['fi_sapso_split'].sum()
total_sum_split = sapso_fi['fi_sapso_split'].sum()

# Print each sum together with its share of the total.
# The share is scaled to percent BEFORE rounding: multiplying an already
# rounded ratio by 100 re-introduces float noise (e.g. 14.549999999999999).
for label, group_sum in (
    ("Sum of split MD&A features:", mda_sum_split),
    ("Sum of split ECT features:", ect_sum_split),
    ("Sum of split FF features:", other_sum_split),
):
    print(label, round(group_sum, 2),
          "percentage of total:", round(group_sum / total_sum_split * 100, 2))
print("Total sum of split:", total_sum_split)

Sum of split MD&A features: 560 percentage of total: 9.64
Sum of split ECT features: 1225 percentage of total: 21.09
Sum of split FF features: 4024 percentage of total: 69.27
Total sum of split: 5809


In [38]:
# Readability - Length & FOG
lgbm_read_rows = sapso_fi[sapso_fi['feature'].str.contains('Length_|FOG_')]
# Sentiment - Pos, Neg, Pol & Sub
lgbm_sen_rows = sapso_fi[sapso_fi['feature'].str.contains('Pos_|Neg_|Pol_|Sub_')]
# Topics - Topic
lgbm_topic_rows = sapso_fi[sapso_fi['feature'].str.contains('Topic_')]

read_sum_split = lgbm_read_rows['fi_sapso_split'].sum()
sen_sum_split = lgbm_sen_rows['fi_sapso_split'].sum()
topic_sum_split = lgbm_topic_rows['fi_sapso_split'].sum()

# Print each sum together with its share of `total_sum_split` (computed in the
# LGBM split origin cell). The share is scaled to percent BEFORE rounding,
# otherwise the output shows float noise such as 13.600000000000001.
for label, group_sum in (
    ("Sum of split Readability features:", read_sum_split),
    ("Sum of split Sentiment features:", sen_sum_split),
    ("Sum of split Topic features:", topic_sum_split),
):
    print(label, round(group_sum, 2),
          "percentage of total:", round(group_sum / total_sum_split * 100, 2))

Sum of split Readability features: 845 percentage of total: 14.549999999999999
Sum of split Sentiment features: 790 percentage of total: 13.600000000000001
Sum of split Topic features: 150 percentage of total: 2.58


In [51]:
# All quarters: rows whose feature name carries the respective quarter suffix
lgbm_Q1_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q1')]
lgbm_Q2_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q2')]
lgbm_Q3_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q3')]
lgbm_Q4_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q4')]

Q1_sum_split = lgbm_Q1_rows['fi_sapso_split'].sum()
Q2_sum_split = lgbm_Q2_rows['fi_sapso_split'].sum()
Q3_sum_split = lgbm_Q3_rows['fi_sapso_split'].sum()
Q4_sum_split = lgbm_Q4_rows['fi_sapso_split'].sum()

# Print each sum together with its share of `total_sum_split` (computed in the
# LGBM split origin cell). The share is scaled to percent BEFORE rounding so
# that the multiplication cannot re-introduce float noise.
for label, quarter_sum in (
    ("Sum of split Q1 features:", Q1_sum_split),
    ("Sum of split Q2 features:", Q2_sum_split),
    ("Sum of split Q3 features:", Q3_sum_split),
    ("Sum of split Q4 features:", Q4_sum_split),
):
    print(label, round(quarter_sum, 2),
          "percentage of total:", round(quarter_sum / total_sum_split * 100, 2))

Sum of split Q1 features: 453 percentage of total: 7.8
Sum of split Q2 features: 519 percentage of total: 8.93
Sum of split Q3 features: 0 percentage of total: 0.0
Sum of split Q4 features: 253 percentage of total: 4.36


## LGBM Gain

In [26]:
# Sum of the gain importance per feature origin (MD&A, ECT, other/FF)
# and over all features
mda_sum_gain = lgbm_mda_rows['fi_sapso_gain'].sum()
ect_sum_gain = lgbm_ect_rows['fi_sapso_gain'].sum()
other_sum_gain = lgbm_other_rows['fi_sapso_gain'].sum()
total_sum_gain = sapso_fi['fi_sapso_gain'].sum()

# Print each sum together with its share of the total.
# The share is scaled to percent BEFORE rounding: multiplying an already
# rounded ratio by 100 re-introduces float noise (e.g. 7.720000000000001).
for label, group_sum in (
    ("Sum of gain MD&A features:", mda_sum_gain),
    ("Sum of gain ECT features:", ect_sum_gain),
    ("Sum of gain FF features:", other_sum_gain),
):
    print(label, round(group_sum, 2),
          "percentage of total:", round(group_sum / total_sum_gain * 100, 2))
print("Total sum of gain:", round(total_sum_gain,2))

Sum of gain MD&A features: 4514.67 percentage of total: 4.97
Sum of gain ECT features: 12264.93 percentage of total: 13.5
Sum of gain FF features: 74080.92 percentage of total: 81.53
Total sum of gain: 90860.52


In [36]:
# Readability - Length & FOG
lgbm_read_rows = sapso_fi[sapso_fi['feature'].str.contains('Length_|FOG_')]
# Sentiment - Pos, Neg, Pol & Sub
lgbm_sen_rows = sapso_fi[sapso_fi['feature'].str.contains('Pos_|Neg_|Pol_|Sub_')]
# Topics - Topic
lgbm_topic_rows = sapso_fi[sapso_fi['feature'].str.contains('Topic_')]

read_sum_gain = lgbm_read_rows['fi_sapso_gain'].sum()
sen_sum_gain = lgbm_sen_rows['fi_sapso_gain'].sum()
topic_sum_gain = lgbm_topic_rows['fi_sapso_gain'].sum()

# Print each sum together with its share of `total_sum_gain` (computed in the
# LGBM gain origin cell). The share is scaled to percent BEFORE rounding,
# otherwise the output shows float noise such as 6.460000000000001.
for label, group_sum in (
    ("Sum of gain Readability features:", read_sum_gain),
    ("Sum of gain Sentiment features:", sen_sum_gain),
    ("Sum of gain Topic features:", topic_sum_gain),
):
    print(label, round(group_sum, 2),
          "percentage of total:", round(group_sum / total_sum_gain * 100, 2))

Sum of gain Readability features: 7015.85 percentage of total: 7.720000000000001
Sum of gain Sentiment features: 5868.18 percentage of total: 6.460000000000001
Sum of gain Topic features: 3895.57 percentage of total: 4.29


In [52]:
# All quarters: rows whose feature name carries the respective quarter suffix
lgbm_Q1_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q1')]
lgbm_Q2_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q2')]
lgbm_Q3_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q3')]
lgbm_Q4_rows = sapso_fi[sapso_fi['feature'].str.contains('_Q4')]

Q1_sum_gain = lgbm_Q1_rows['fi_sapso_gain'].sum()
Q2_sum_gain = lgbm_Q2_rows['fi_sapso_gain'].sum()
Q3_sum_gain = lgbm_Q3_rows['fi_sapso_gain'].sum()
Q4_sum_gain = lgbm_Q4_rows['fi_sapso_gain'].sum()

# Print each sum together with its share of `total_sum_gain` (computed in the
# LGBM gain origin cell). The share is scaled to percent BEFORE rounding so
# that no float noise (e.g. 2.6100000000000003) shows up in the output.
for label, quarter_sum in (
    ("Sum of gain Q1 features:", Q1_sum_gain),
    ("Sum of gain Q2 features:", Q2_sum_gain),
    ("Sum of gain Q3 features:", Q3_sum_gain),
    ("Sum of gain Q4 features:", Q4_sum_gain),
):
    print(label, round(quarter_sum, 2),
          "percentage of total:", round(quarter_sum / total_sum_gain * 100, 2))

Sum of gain Q1 features: 4896.62 percentage of total: 5.390000000000001
Sum of gain Q2 features: 4997.24 percentage of total: 5.5
Sum of gain Q3 features: 0.0 percentage of total: 0.0
Sum of gain Q4 features: 2371.07 percentage of total: 2.6100000000000003
