## Pré-processamento dos dados

- Carregamento das bibliotecas e dos dados.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

# Load the raw dataset; the path is relative to the notebook's working directory.
raw_csv_path = "../data/raw/CC GENERAL.csv"
df = pd.read_csv(raw_csv_path)

# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


- Removendo a coluna identificadora CUST_ID e usando a mediana para substituir os valores nulos da coluna MINIMUM_PAYMENTS, pois eles representam cerca de 3,5% dos registros.

In [3]:
# Drop the customer identifier column. `errors='ignore'` keeps this cell
# re-runnable: on a second execution CUST_ID is already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=['CUST_ID'], errors='ignore')

# Replace missing MINIMUM_PAYMENTS values with the column median
# (Series.median() skips NaN by default).
# NOTE(review): only this column is imputed — confirm that no other column
# contains nulls, since StandardScaler keeps NaN values in its output.
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(df['MINIMUM_PAYMENTS'].median())

- Reduzir a assimetria das variáveis aplicando a transformação logarítmica `log1p` (log(1 + x)), que trata corretamente os valores iguais a zero.

In [None]:
# Columns that receive the log transform.
log_cols = [
    'BALANCE',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'CREDIT_LIMIT',
]

# log1p(x) = log(1 + x), so a value of 0 maps to 0 instead of -inf.
# NOTE: this overwrites the columns of df, so running the cell a second time
# applies the transform twice; re-run the notebook from the top instead.
df[log_cols] = np.log1p(df[log_cols])

- Definindo grupos temáticos de colunas para gerar diferentes datasets.

In [10]:
# Thematic column groups; each one becomes its own scaled dataset.
# BALANCE is listed in both the risk and the credit-behavior groups.
risk_cols = ['PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'BALANCE']

credit_behavior_cols = [
    'CREDIT_LIMIT', 'BALANCE',
    'CASH_ADVANCE', 'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX',
]

consumption_cols = [
    'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
    'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY', 'PURCHASES_TRX',
]

# Every column currently present in df, in frame order.
full_cols = list(df.columns)


- Corrigindo a diferença de escala entre as variáveis com a padronização Z-score (`StandardScaler`).

In [11]:
def scale_dataset(data, columns):
    """Standardize the selected columns of a DataFrame with a z-score.

    A fresh StandardScaler is fitted on ``data[columns]`` on every call and
    the transformed values are returned as a new DataFrame; ``data`` itself
    is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named in ``columns``.
    columns : list of str
        Names of the columns to scale; also used as the output column names.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column names and the same row index
        as ``data``.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(data[columns])
    # fit_transform returns a bare array; reattach the input's row index so the
    # result stays aligned with `data` even when its index is not 0..n-1.
    return pd.DataFrame(scaled, columns=columns, index=data.index)

# Build one independently scaled frame per column group, in the order
# risk, credit behavior, consumption, full. Each call fits its own scaler.
df_risk, df_credit, df_consumption, df_full = (
    scale_dataset(df, group_cols)
    for group_cols in (risk_cols, credit_behavior_cols, consumption_cols, full_cols)
)

- Salvar os dados pré-processados para consumo.

In [12]:
# Output directory for the processed CSVs; created if it does not exist yet.
processed_path = '../data/processed/'
os.makedirs(processed_path, exist_ok=True)

# File name -> scaled frame. Files are written in this order, without the index column.
outputs = {
    'creditcard_risk_zscore.csv': df_risk,
    'creditcard_credit_behavior_zscore.csv': df_credit,
    'creditcard_consumption_zscore.csv': df_consumption,
    'creditcard_full_zscore.csv': df_full,
}

for filename, frame in outputs.items():
    frame.to_csv(processed_path + filename, index=False)