In [1]:
import numpy as np
import pandas as pd

## Checking the number of features in the FS data sets

In [2]:
# Load the three complete feature tables (MD&A, ECT and baseline), in that order
mda, ect, ff = (
    pd.read_csv(csv_path)
    for csv_path in ('mda_model_data.csv', 'ect_model_data.csv', 'baseline_data.csv')
)

In [3]:
# Load the feature tables produced by each PSO / SAPSO feature-selection run
pso_rlr, pso_knn, pso_lgbm, sapso_rlr, sapso_knn = (
    pd.read_csv(csv_path)
    for csv_path in (
        'rlr_pso_data.csv',
        'knn_pso_data.csv',
        'lgbm_pso_data.csv',
        'rlr_sapso_data.csv',
        'knn_sapso_data.csv',
    )
)

### Extracting column names

In [4]:
# Extracting the baseline columns (including CIK, and Target etc.).
# This full column list is what the next cell filters against to find the
# columns that are unique to the MD&A and ECT tables.
ff_cols = ff.columns

In [5]:
# Keep only the column names that the MD&A / ECT tables add on top of the
# baseline table (anything already listed in ff_cols is filtered out).
mda_cols = [name for name in mda.columns if name not in ff_cols]
ect_cols = [name for name in ect.columns if name not in ff_cols]

In [6]:
# Removing the non-features (label / identifier columns) from the baseline table.
# Run this after the MD&A / ECT column extraction above, which filters against
# the full baseline column list.
# errors='ignore' makes the cell safe to re-run: `ff` is overwritten with the
# reduced frame, so a second execution would otherwise raise KeyError because
# the columns are already gone.
ff = ff.drop(columns=['Target', 'fraud', 'res', 'cik'], errors='ignore')
ff_cols = ff.columns

In [7]:
# The FS features: strip the 'Target' column from every feature-selection
# table, then record the column names that remain for each of them.
pso_rlr, pso_knn, pso_lgbm, sapso_rlr, sapso_knn = [
    frame.drop(columns='Target')
    for frame in (pso_rlr, pso_knn, pso_lgbm, sapso_rlr, sapso_knn)
]

pso_rlr_cols = pso_rlr.columns
pso_knn_cols = pso_knn.columns
pso_lgbm_cols = pso_lgbm.columns
sapso_rlr_cols = sapso_rlr.columns
sapso_knn_cols = sapso_knn.columns

In [8]:
# Map each feature-selection method label to the column names it retained
fs_cols = {
    'PSO-RLR': pso_rlr_cols,
    'PSO-KNN': pso_knn_cols,
    'PSO-LGBM': pso_lgbm_cols,
    'SAPSO-RLR': sapso_rlr_cols,
    'SAPSO-KNN': sapso_knn_cols,
}

### Checking the distribution of FS methods

In [9]:
# For every FS method, report which share of each feature family (baseline,
# MD&A, ECT) it retained. Note: the printed value is a fraction rounded to two
# decimals (e.g. 0.43), not a number multiplied by 100.
for name, cols in fs_cols.items():
    fs_ff_cols = set(ff_cols) & set(cols)
    fs_mda_cols = set(mda_cols) & set(cols)
    fs_ect_cols = set(ect_cols) & set(cols)

    ff_share = round(len(fs_ff_cols) / len(ff_cols), 2)
    mda_share = round(len(fs_mda_cols) / len(mda_cols), 2)
    ect_share = round(len(fs_ect_cols) / len(ect_cols), 2)

    print('The percentage of baseline features included in', name, ':', ff_share)
    print('The percentage of MD&A features included in', name, ':', mda_share)
    print('The percentage of ECT features included in', name, ':', ect_share)

The percentage of baseline features included in PSO-RLR : 0.43
The percentage of MD&A features included in PSO-RLR : 0.39
The percentage of ECT features included in PSO-RLR : 0.4
The percentage of baseline features included in PSO-KNN : 0.4
The percentage of MD&A features included in PSO-KNN : 0.3
The percentage of ECT features included in PSO-KNN : 0.37
The percentage of baseline features included in PSO-LGBM : 0.44
The percentage of MD&A features included in PSO-LGBM : 0.41
The percentage of ECT features included in PSO-LGBM : 0.43
The percentage of baseline features included in SAPSO-RLR : 0.45
The percentage of MD&A features included in SAPSO-RLR : 0.46
The percentage of ECT features included in SAPSO-RLR : 0.4
The percentage of baseline features included in SAPSO-KNN : 0.4
The percentage of MD&A features included in SAPSO-KNN : 0.43
The percentage of ECT features included in SAPSO-KNN : 0.43


### Checking whether cosine similarity is included

In [10]:
# Report, per FS method, whether the 'Cos_Sim' column is among its retained columns
for name, cols in fs_cols.items():
    print(
        'Cos_Sim is included in:' if 'Cos_Sim' in cols else 'Cos_Sim is not included in:',
        name,
    )

Cos_Sim is not included in: PSO-RLR
Cos_Sim is not included in: PSO-KNN
Cos_Sim is not included in: PSO-LGBM
Cos_Sim is not included in: SAPSO-RLR
Cos_Sim is not included in: SAPSO-KNN


### Checking the distribution of ECT quarters

In [11]:
# Quarter tags searched for (as substrings) in the ECT column names
Qs = [f'Q{quarter}' for quarter in range(1, 5)]

In [12]:
# For every FS method, report how its retained ECT features split across the
# quarters in Qs. The printed value is a fraction rounded to two decimals.
for name, cols in fs_cols.items():
    fs_ect_cols = set(ect_cols).intersection(cols)
    if not fs_ect_cols:
        # len(fs_ect_cols) is the divisor below; without this guard a method
        # that retained no ECT feature would raise ZeroDivisionError.
        print('No ECT features included in', name)
        continue
    for i in Qs:
        # A column is attributed to a quarter when its name contains the tag
        # (plain substring match).
        fs_Q_cols = [col for col in fs_ect_cols if i in col]
        print('The percentage of ECT features included in',name,'that orginate from',i,':',round(len(fs_Q_cols)/len(fs_ect_cols),2))

The percentage of ECT features included in PSO-RLR that orginate from Q1 : 0.25
The percentage of ECT features included in PSO-RLR that orginate from Q2 : 0.23
The percentage of ECT features included in PSO-RLR that orginate from Q3 : 0.28
The percentage of ECT features included in PSO-RLR that orginate from Q4 : 0.23
The percentage of ECT features included in PSO-KNN that orginate from Q1 : 0.25
The percentage of ECT features included in PSO-KNN that orginate from Q2 : 0.3
The percentage of ECT features included in PSO-KNN that orginate from Q3 : 0.22
The percentage of ECT features included in PSO-KNN that orginate from Q4 : 0.2
The percentage of ECT features included in PSO-LGBM that orginate from Q1 : 0.24
The percentage of ECT features included in PSO-LGBM that orginate from Q2 : 0.21
The percentage of ECT features included in PSO-LGBM that orginate from Q3 : 0.23
The percentage of ECT features included in PSO-LGBM that orginate from Q4 : 0.3
The percentage of ECT features included