# 0.0 Imports

In [35]:
import pandera
import joblib
import pandas as pd
from pandera import Check, Column, DataFrameSchema
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import train_test_split

from feature_engine.discretisation import EqualFrequencyDiscretiser
#from sklearn_engine.imputation import MeanMedianImputer
#from sklearn_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# 0.1 Utils

In [36]:
# Columns kept from the raw CSV: the binary label first, then the ten features.
columns_to_use = [
    'target',
    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
    'Idade',
    'NumeroDeVezes30-59DiasAtrasoNaoPior',
    'TaxaDeEndividamento',
    'RendaMensal',
    'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
    'NumeroDeVezes90DiasAtraso',
    'NumeroDeEmprestimosOuLinhasImobiliarias',
    'NumeroDeVezes60-89DiasAtrasoNaoPior',
    'NumeroDeDependentes',
]

# 1.0 Data Load

In [37]:
class DataLoad:
    """Load the raw training data from a CSV file.

    Parameters
    ----------
    data_path : str, optional
        Location of the CSV file to read. Defaults to the path that was
        previously hard-coded (``"../data/raw/train.csv"``), so existing
        ``DataLoad()`` calls behave as before.
    """

    def __init__(self, data_path: str = "../data/raw/train.csv") -> None:
        self.data_path = data_path

    def load_data(self) -> pd.DataFrame:
        """Read ``self.data_path`` with ``pd.read_csv`` and return the frame.

        Returns
        -------
        pd.DataFrame
            The CSV contents exactly as parsed by pandas (no column
            selection or cleaning is applied here).
        """
        loaded_data = pd.read_csv(self.data_path)
        return loaded_data
            

In [38]:
# Read the raw CSV, then keep only the label and feature columns of interest.
dl = DataLoad()
df = dl.load_data().loc[:, columns_to_use]
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# 2.0 Data validation

In [39]:
class DataValidation:
    """Validate that a dataframe has the expected columns and column schema.

    Parameters
    ----------
    columns_to_use : list
        Column labels the dataframe is expected to contain.
    """

    def __init__(self, columns_to_use) -> None:
        self.columns_to_use = columns_to_use

    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        """Check that the dataframe has exactly the expected columns.

        The frame is only inspected, never modified. (Assigning the expected
        labels to ``dataframe.columns`` would silently relabel a frame whose
        columns merely have the right count.) Column order is not enforced.

        Returns
        -------
        bool
            ``True`` when the column labels match ``self.columns_to_use``,
            ``False`` otherwise or when the input cannot be inspected.
        """
        print("Validacao started")
        try:
            expected = list(self.columns_to_use)
            actual = list(dataframe.columns)
        except Exception as e:
            # Keep the boolean contract even for inputs that are not frames.
            print(f"Validacao errou: {e}")
            return False

        missing = [col for col in expected if col not in actual]
        unexpected = [col for col in actual if col not in expected]
        if missing or unexpected or len(actual) != len(expected):
            print(
                f"Validacao errou: expected {len(expected)} columns, got {len(actual)} "
                f"(missing={missing}, unexpected={unexpected})"
            )
            return False
        return True

    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        """Validate column dtypes and the target's value set with pandera.

        Returns
        -------
        bool
            ``True`` when the schema validates, ``False`` when pandera
            reports failures (the failure cases are printed).
        """
        schema = DataFrameSchema(
                {
                    # The label must be 0 or 1. No "> 0" check: it would
                    # reject every negative-class row.
                    "target": Column(int, checks=Check.isin([0, 1]), coerce=True),
                    "TaxaDeUtilizacaoDeLinhasNaoGarantidas": Column(float, nullable=True),
                    "Idade": Column(int, nullable=True),
                    "NumeroDeVezes30-59DiasAtrasoNaoPior": Column(int, nullable=True),
                    "TaxaDeEndividamento": Column(float, nullable=True),
                    "RendaMensal": Column(float, nullable=True),
                    "NumeroDeLinhasDeCreditoEEmprestimosAbertos": Column(int, nullable=True),
                    "NumeroDeVezes90DiasAtraso": Column(int, nullable=True),
                    "NumeroDeEmprestimosOuLinhasImobiliarias": Column(int, nullable=True),
                    "NumeroDeVezes60-89DiasAtrasoNaoPior": Column(int, nullable=True),
                    "NumeroDeDependentes": Column(float, nullable=True)
                }
        )
        try:
            # lazy=True collects every failure and raises SchemaErrors, which
            # is what the handler below catches. Eager validation raises
            # SchemaError instead and would escape it.
            schema.validate(dataframe, lazy=True)
        except pandera.errors.SchemaErrors as exc:
            print("Validation columns failed...")
            print(exc.failure_cases)
            return False
        print("Validation columns passed...")
        return True

    def run(self, dataframe: pd.DataFrame) -> bool:
        """Run the column-set check, then the schema check.

        The schema check is skipped when the column-set check fails.

        Returns
        -------
        bool
            ``True`` only when both checks pass.
        """
        if self.check_shape_data(dataframe) and self.check_columns(dataframe):
            print('Validation with success')
            return True
        else:
            print('Validation failed')
            return False

In [40]:
# Run the column-set and schema checks; the cell shows the boolean result.
dv = DataValidation(columns_to_use=columns_to_use)
dv.run(dataframe=df)

Validacao started
Validation columns passed...
Validation with success


True

# 3.0 Data Transformation

In [41]:
class DataTransformation:
    """Split a labelled dataframe into training and validation sets.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing the feature columns and the target column.
    target_name : str
        Name of the target column inside ``dataframe``.
    """

    def __init__(self, dataframe: pd.DataFrame,
                       target_name: str):
        self.dataframe = dataframe
        self.target_name = target_name

    def train_test_splitting(self, test_size: float = 0.25, random_state: int = 42):
        """Return a stratified train/validation split of features and target.

        Parameters
        ----------
        test_size : float, optional
            Fraction of rows held out for validation. 0.25 matches the
            scikit-learn default that applied when no size was given.
        random_state : int, optional
            Seed forwarded to ``train_test_split`` so that repeated runs
            produce the same split.

        Returns
        -------
        tuple
            ``(X_train, X_valid, y_train, y_valid)``.
        """
        X = self.dataframe.drop(self.target_name, axis=1)
        y = self.dataframe[self.target_name]

        # Stratify on the target so both splits keep the class proportions.
        X_train, X_valid, y_train, y_valid = train_test_split(
            X,
            y,
            test_size=test_size,
            stratify=y,
            random_state=random_state,
        )

        return X_train, X_valid, y_train, y_valid

In [42]:
# Split features/target into training and validation sets.
dt = DataTransformation(df, 'target')
# The method is named train_test_splitting; the misspelled train_test_spliting
# raises AttributeError on a fresh kernel.
X_train, X_valid, y_train, y_valid = dt.train_test_splitting()
X_train.shape

(112500, 10)

In [43]:
# (rows, columns) of the validation feature matrix returned by the split above.
X_valid.shape

(37500, 10)

# 4.0 Data preprocessing

In [None]:
class DataPreprocess:
    def __init__