# Libraries

In [1]:
import pandas               as pd
import numpy                as np
import seaborn              as sns
import matplotlib.pyplot    as plt
import yaml

from IPython.core.display   import HTML
from IPython.display        import Image

from sklearn                import model_selection as ms

# Notebook-wide display defaults.
plt.rcParams["figure.figsize"] = (12, 8)   # default figure size for every plot
pd.options.display.max_columns = 30        # render up to 30 columns of a DataFrame

# Auxiliary functions

In [2]:
def import_config(path_yaml):
    """
    Load a YAML configuration file and return its parsed contents.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``.

    Parameters:
    path_yaml (str): Path to the YAML configuration file.

    Returns:
    dict: The configuration data loaded from the YAML file. The value is
        whatever ``yaml.safe_load`` produces: a dictionary when the document
        is a mapping (the expected case), and ``None`` when the file is empty.

    Raises:
    FileNotFoundError: If the specified YAML file does not exist. The original
        OS-level error is attached as ``__cause__``.
    yaml.YAMLError: If there is an error while parsing the YAML file. The
        original parser error is attached as ``__cause__``.

    Example:
    If 'config.yaml' contains:
    ```
    key1: value1
    key2: value2
    ```
    Calling import_config('config.yaml') will return:
    {'key1': 'value1', 'key2': 'value2'}
    """
    try:
        # Explicit encoding: without it the platform default is used, which
        # misreads non-ASCII characters in a UTF-8 config on some systems.
        with open(path_yaml, 'r', encoding='utf-8') as config_file:
            config = yaml.safe_load(config_file)
    except FileNotFoundError as e:
        # `from e` keeps the original error (errno, filename) as the explicit cause.
        raise FileNotFoundError(f"YAML file not found at path: {path_yaml}") from e
    except yaml.YAMLError as e:
        # `from e` keeps the parser's own exception, including its position
        # information, reachable through __cause__.
        raise yaml.YAMLError(f"Error parsing YAML file: {e}") from e

    return config


# 1.0. IMPORT DATA AND CONFIG

## 1.1. CONFIG

In [3]:
# Load the project settings from the YAML file that sits next to this notebook.
config = import_config(path_yaml='config.yaml')

## 1.2. DATA

In [4]:
# Read the raw dataset from the location stored under data -> raw_path in the config.
df_raw = pd.read_csv(filepath_or_buffer=config['data']['raw_path'])
df_raw.head(n=5)

Unnamed: 0,escolaridade,renda_mensal_informal,dependentes,estado_civil,idade,conta_poupanca,conta_salario,qtd_fonte_renda,cheque_sem_fundo,conta_conjunta,valor_conta_corrente,valor_conta_poupanca,valor_emprestimo,multa,juros,valor_emprestimo_atualizado,pago,id,genero,data,estado
0,,,S,solteiro,36,S,N,,S,N,,,8174.32,263.25,5844.98,14282.55,sim,309652396,feminino,2020-01-01,minas gerais
1,nivel medio,1894.5078,N,solteiro,39,S,N,1.0,N,N,,,41775.63,1607.29,7652.26,51035.18,sim,498764591,feminino,2020-01-01,espirito santo
2,nivel medio,,S,solteiro,26,S,N,1.0,N,N,,,522940.27,24860.4,319315.62,867116.29,sim,504868288,masculino,2020-01-01,sao paulo
3,,,N,casado(a) com comunhao de bens,23,N,N,,S,N,,,1418.0,21472.36,2967.48,25857.84,sim,541365315,feminino,2020-01-01,sao paulo
4,,,S,solteiro,33,S,N,,S,N,,,2078.97,19.44,46.35,2144.76,sim,563622907,masculino,2020-01-01,espirito santo


### 1.2.1. Split Data

#### 1.2.1.1. Split data: base and test

In [5]:
# Separate the feature columns from the target column 'pago'.
X = df_raw.drop('pago', axis = 1)
y = df_raw['pago'].copy()

# Hold out 15% of the rows as the test set, stratified on the target.
# random_state is fixed so that Restart & Run All always yields the same
# test rows; without it every run shuffles differently and downstream
# results cannot be reproduced or compared.
X_base, X_test, y_base, y_test = ms.train_test_split(
    X, y, test_size=0.15, stratify=y, shuffle=True, random_state=42
)

# Re-attach the target to the held-out features.
df_test = pd.concat([X_test, y_test], axis = 1)

print("Dimension of split data")
print(f'X_base:{X_base.shape}\ny_base:{y_base.shape}\nX_test:{X_test.shape}\ny_test:{y_test.shape}\n')

Dimension of split data
X_base:(42831, 20)
y_base:(42831,)
X_test:(7559, 20)
y_test:(7559,)



#### 1.2.1.2. Split data: train and validation sets

In [6]:
# Rebuild the non-test portion as a single frame (features + target).
base = pd.concat([X_base, y_base], axis = 1)

# Separate features and target again; X and y now refer to the base portion only.
X = base.drop('pago', axis = 1)
y = base['pago'].copy()

# Carve 15% of the base portion out as the validation set, stratified on the target.
# random_state is fixed so the train/validation partition is identical on every
# run; without it the validation rows change each time the cell executes.
X_train, X_val, y_train, y_val = ms.train_test_split(
    X, y, test_size=0.15, stratify=y, shuffle=True, random_state=42
)

print("Dimension of split base into train and validation sets")
print(f'X_train:{X_train.shape}\ny_train:{y_train.shape}\nX_val:{X_val.shape}\ny_val:{y_val.shape}\n')

# Re-attach the target to each partition.
df1_train = pd.concat([X_train, y_train], axis = 1)
df1_val = pd.concat([X_val, y_val], axis = 1)

Dimension of split base into train and validation sets
X_train:(36406, 20)
y_train:(36406,)
X_val:(6425, 20)
y_val:(6425,)

