## Importación de librerías

In [19]:
import os
import glob
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.impute import SimpleImputer
from collections import OrderedDict
import warnings
warnings.filterwarnings('ignore')

## Lectura de datasets

In [3]:
# Locate the two input CSV files inside the sibling folder '01. Dataset'
# (one level above the notebook's working directory) and keep their
# absolute paths in f_labels / f_data.
orig_dir = os.getcwd()
f_labels = None
f_data = None
try:
    # Work inside the dataset folder so glob returns bare file names.
    os.chdir(os.path.join(os.pardir, '01. Dataset'))
    dsets_files = glob.glob('*.csv')
    for fil in dsets_files:
        if fil == 'labels_bankruptcy.csv':
            f_labels = os.path.abspath(fil)
        elif fil == 'dataset_normalizado.csv':
            f_data = os.path.abspath(fil)
finally:
    # Always restore the working directory, even if the folder is missing
    # or the listing fails, so later cells keep resolving relative paths.
    os.chdir(orig_dir)

# Fail here with a clear message instead of a NameError in a later cell.
if f_labels is None or f_data is None:
    raise FileNotFoundError(
        "Expected 'labels_bankruptcy.csv' and 'dataset_normalizado.csv' "
        "in the '01. Dataset' folder next to the notebook directory"
    )

## Dataset and labels: reading and adaptation

In [5]:
# The labels file is space-separated and is read as two columns:
# a short code ('codes') and a descriptive name ('Names').
df_col = pd.read_csv(f_labels, sep=' ', names=['codes', 'Names'])

# Lookup tables in both directions; names_to_codes is used later for brevity.
names_to_codes = {name: code for name, code in zip(df_col['Names'], df_col['codes'])}
codes_to_names = {code: name for code, name in zip(df_col['codes'], df_col['Names'])}

# Column layout of the dataset: the 'flag' column first, then the descriptive names.
cols = ['flag', *df_col['Names']]

# The first line of the data file is skipped and the names above are used instead.
df = pd.read_csv(f_data, encoding="cp1252", names=cols, skiprows=1)
df.head(5)

Unnamed: 0,flag,Cost_of_Interest-bearing_Debt,Cash_Reinvestment_Ratio,Current_Ratio,Acid_Test,Interest_Expenses/Total_Revenue,Total_Liability/Equity_Ratio,Liability/Total_Assets,Interest-bearing_Debt/Equity,Contingent_Liability/Equity,...,CFO_to_Assets,Cash_Flow_to_Equity,Realized_Gross_Profit_Growth_Rate,Operating_Income_Growth,Net_Income_Growth,Continuing_Operating_Income_after_Tax_Growth,Net_Income-Excluding_Disposal_Gain_or_Loss_Growth,Total_Asset_Growth,Total_Equity_Growth,Return_on_Total_Asset_Growth
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


## Preprocesamiento

In [6]:
# Rename the columns to their short codes for convenience:
# the attributes become X1, X2, ... and 'flag' remains the bankrupt / not-bankrupt label.
# rename() returns a new frame, so df keeps the descriptive names.
df_bankruptcy = df.rename(columns=names_to_codes)
df_bankruptcy.head(5)

Unnamed: 0,flag,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X86,X87,X88,X89,X90,X91,X92,X93,X94,X95
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [7]:
df_bankruptcy.dtypes # X94 is kept, although its values do not look important (constant)

flag      int64
X1      float64
X2      float64
X3      float64
X4      float64
         ...   
X91     float64
X92     float64
X93     float64
X94       int64
X95     float64
Length: 96, dtype: object

In [8]:
# Convert every attribute column to float as a precaution.
def convert_datatype(df):
    """Cast every column after the first one to ``float``, in place.

    The first column (position 0) is left untouched. Columns are looked up
    with ``df[colname]`` so that names which collide with DataFrame
    attributes or methods (e.g. ``count``) are handled correctly, and the
    number of columns is taken from the frame instead of being hard-coded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.

    Raises
    ------
    ValueError
        If a column holds values that cannot be converted to float.
    """
    for colname in df.columns[1:]:
        df[colname] = df[colname].astype(float)


# Cast the non-label columns of df_bankruptcy to float in place (the call returns nothing).
convert_datatype(df_bankruptcy)

In [9]:
# Cast the labels to int (also as a precaution).
def labels_to_binary(df):
    """Cast the ``flag`` column of ``df`` to ``int``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``flag`` column. It is mutated; nothing is returned.
    """
    int_labels = df.flag.astype(int)
    df['flag'] = int_labels

# Cast the 'flag' column of df_bankruptcy to int in place.
labels_to_binary(df_bankruptcy)

In [10]:
# Remove the rows that contain NaN values.
def drop_nans(df_bankruptcy, verbose=False):
    """Return a new frame without the rows that contain at least one NaN.

    Parameters
    ----------
    df_bankruptcy : pandas.DataFrame
        Input frame; it is not modified.
    verbose : bool, default False
        When True, print how many rows were removed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_bankruptcy`` that have no missing values, with
        their original index labels.
    """
    clean_dataframes = df_bankruptcy.dropna(axis=0, how='any')
    if verbose:
        total_rows = len(df_bankruptcy)
        dropped_rows = total_rows - len(clean_dataframes)
        print('Dropped {} of {} rows containing NaN values'.format(dropped_rows, total_rows))
    return clean_dataframes

# Copy of the data restricted to rows with no missing values; df_bankruptcy itself is not modified.
nan_dropped_df = drop_nans(df_bankruptcy, verbose=True)

In [15]:
# Impute missing values with the median by default; pass strategy='mean' to use the mean.
def imputation(df, strategy='median'):
    """Return a copy of ``df`` with NaN values filled in by ``SimpleImputer``.

    Every column of ``df`` is passed through the imputer, so all columns
    must be numeric for the 'median' / 'mean' strategies.
    NOTE(review): the result appears to come back as float for every
    column (the label column prints as 1.0 after imputation) - cast it
    back downstream if an integer label is required.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain ``np.nan`` entries; it is not modified.
    strategy : str, default 'median'
        Strategy name forwarded to ``SimpleImputer``.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column labels and the same index as
        ``df`` (the index is preserved so rows can still be aligned with
        the input frame).
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    imputed_values = imputer.fit_transform(df)
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

# Impute df_bankruptcy with the function's default strategy ('median').
median_imputed_df = imputation(df_bankruptcy)

In [18]:
# Several imputed frames could be kept together in an ordered dict
# (keyed by strategy name); to be revisited in the future.
imputed_dict = OrderedDict()
imputed_dict['Median'] = median_imputed_df
# Preview the frame produced above. The previous name, mean_imputed_df, only
# exists as a local inside imputation() and is undefined on a fresh kernel.
print(median_imputed_df.head(5))

   flag        X1        X2        X3        X4        X5        X6        X7  \
0   1.0  0.370594  0.424389  0.405750  0.601457  0.601457  0.998969  0.796887   
1   1.0  0.464291  0.538214  0.516730  0.610235  0.610235  0.998946  0.797380   
2   1.0  0.426071  0.499019  0.472295  0.601450  0.601364  0.998857  0.796403   
3   1.0  0.399844  0.451265  0.457733  0.583541  0.583541  0.998700  0.796967   
4   1.0  0.465022  0.538432  0.522298  0.598783  0.598783  0.998973  0.797366   

         X8        X9  ...       X86       X87       X88       X89       X90  \
0  0.808809  0.302646  ...  0.716845  0.009219  0.622879  0.601453  0.827890   
1  0.809301  0.303556  ...  0.795297  0.008323  0.623652  0.610237  0.839969   
2  0.808388  0.302035  ...  0.774670  0.040003  0.623841  0.601449  0.836774   
3  0.808966  0.303350  ...  0.739555  0.003252  0.622929  0.583538  0.834697   
4  0.809304  0.303475  ...  0.795016  0.003878  0.623521  0.598782  0.839973   

        X91       X92       X93 

In [26]:
# Persist the dict of imputed frames under results/ (relative to the working directory).
# Create the folder if needed and close the file handle deterministically.
os.makedirs('results', exist_ok=True)
with open(os.path.join('results','imputed_data.pkl'),'wb') as pkl_file:
    pickle.dump(imputed_dict, pkl_file)

In [25]:
# Save the median-imputed frame on its own; make sure the output folder exists first.
os.makedirs('results', exist_ok=True)
median_imputed_df.to_pickle(os.path.join('results','median_imputed_data.pkl'))