In [None]:
# imports

In [2]:
import joblib
import pandas as pd
import pandera
from sklearn.model_selection import train_test_split
from pandera import Check, Column, DataFrameSchema
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [None]:
# utils

In [None]:
# Columns selected from the raw CSV: the label column first, then the features.
columns_to_use = [
    'target',
    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
    'Idade',
    'NumeroDeVezes30-59DiasAtrasoNaoPior',
    'TaxaDeEndividamento',
    'RendaMensal',
    'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
    'NumeroDeVezes90DiasAtraso',
    'NumeroDeEmprestimosOuLinhasImobiliarias',
    'NumeroDeVezes60-89DiasAtrasoNaoPior',
    'NumeroDeDependentes',
]

In [None]:
# data load

In [None]:
class DataLoad:
    """Load a dataset from a CSV file into a pandas DataFrame."""

    def __init__(self) -> None:
        pass

    def load_data(self, path: str = '../data/raw/train.csv') -> pd.DataFrame:
        """Read a CSV file and return its contents.

        Args:
            path: Location of the CSV file. Defaults to the raw training file,
                resolved relative to the current working directory.

        Returns:
            The file contents as a pandas DataFrame.

        Raises:
            FileNotFoundError: If ``path`` does not exist (raised by pandas).
        """
        return pd.read_csv(path)

In [None]:
# Create the loader; nothing is read from disk until load_data() is called.
dl = DataLoad()

In [None]:
# Read the raw CSV and keep only the columns listed in columns_to_use.
df = dl.load_data()[columns_to_use]
# Preview the first rows as the cell output.
df.head()

In [None]:
# data validation

In [None]:
class DataValidation:
    """Validate the column layout and the per-column schema of a dataframe."""

    def __init__(self, columns_to_use) -> None:
        """Store the expected column names.

        Args:
            columns_to_use: Ordered sequence of column names the dataframe
                is expected to have.
        """
        self.columns_to_use = columns_to_use

    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        """Check that the dataframe has exactly the expected columns, in order.

        The dataframe is only inspected; its column labels are never changed.

        Args:
            dataframe: Frame to inspect.

        Returns:
            True when the column labels equal ``columns_to_use``; False
            otherwise, or when the columns cannot be read at all.
        """
        try:
            print('Initiating validation...')
            expected = list(self.columns_to_use)
            found = list(dataframe.columns)
        except Exception as e:
            # Keeps the original contract: any failure is reported, not raised.
            print(f'Error on validation: {e}')
            return False

        if found != expected:
            print(f'Error on validation: expected columns {expected}, got {found}')
            return False
        return True

    def chek_columns(self, dataframe: pd.DataFrame) -> bool:
        """Validate column dtypes and allowed values with a pandera schema.

        Args:
            dataframe: Frame to validate.

        Returns:
            True when every column satisfies the schema, False otherwise.
            On failure the collected failure cases are printed.
        """
        schema = DataFrameSchema(
            {
                # Binary label: only 0 and 1 are valid values.
                "target": Column(int, Check.isin([0, 1]), coerce=True),
                "TaxaDeUtilizacaoDeLinhasNaoGarantidas": Column(float, nullable=True),
                "Idade": Column(int, nullable=True),
                "NumeroDeVezes30-59DiasAtrasoNaoPior": Column(int, nullable=True),
                "TaxaDeEndividamento": Column(float, nullable=True),
                "RendaMensal": Column(float, nullable=True),
                "NumeroDeLinhasDeCreditoEEmprestimosAbertos": Column(int, nullable=True),
                "NumeroDeVezes90DiasAtraso": Column(int, nullable=True),
                "NumeroDeEmprestimosOuLinhasImobiliarias": Column(int, nullable=True),
                "NumeroDeVezes60-89DiasAtrasoNaoPior": Column(int, nullable=True),
                "NumeroDeDependentes": Column(float, nullable=True)
            }
        )
        try:
            # lazy=True makes pandera collect every failure and raise
            # SchemaErrors (plural), which is the exception handled below.
            schema.validate(dataframe, lazy=True)
            print('Validation columns passed...')
            return True
        except pandera.errors.SchemaErrors as exc:
            print('Validation columns failed...')
            print(exc.failure_cases)
            return False

    def run(self, dataframe : pd.DataFrame) -> bool:
        """Run the column-layout check, then the schema check.

        The schema check is skipped when the layout check fails.

        Args:
            dataframe: Frame to validate.

        Returns:
            True only when both checks pass.
        """
        if self.check_shape_data(dataframe) and self.chek_columns(dataframe):
            print('Success on validate data')
            return True
        print('Failed on validation')
        return False



In [None]:
# DataValidation.__init__ requires the expected column list; calling it with
# no argument raises TypeError.
dv = DataValidation(columns_to_use)

In [None]:
# Run both validation steps on df; the returned bool is the cell output.
dv.run(df)

In [None]:
# data transformation

In [None]:
class DataTransformation:
    """Split a labelled dataframe into training and validation parts."""

    def __init__(self, dataframe : pd.DataFrame, target_name : str):
        """Store the data and the name of its label column.

        Args:
            dataframe: Frame holding the features and the label column.
            target_name: Name of the label column inside ``dataframe``.
        """
        self.dataframe = dataframe
        self.target_name = target_name

    def train_test_spliting(self, test_size: float = 0.25, random_state: int = 42):
        """Make a stratified train/validation split.

        Args:
            test_size: Share of rows assigned to the validation part.
            random_state: Seed passed to ``train_test_split`` so the split is
                reproducible.

        Returns:
            Tuple ``(X_train, X_val, y_train, y_val)``.
        """
        features = self.dataframe.drop(columns=self.target_name)
        target = self.dataframe[self.target_name]

        # Stratifying on the label keeps its class proportions in both parts.
        X_train, X_val, y_train, y_val = train_test_split(
            features,
            target,
            test_size=test_size,
            stratify=target,
            random_state=random_state,
        )
        return X_train, X_val, y_train, y_val

In [None]:
# Wrap df, naming 'target' as the label column to separate from the features.
dt = DataTransformation(df, 'target')

In [None]:
# Split features and label into training and validation parts.
X_train, X_val, y_train, y_val = dt.train_test_spliting()

In [None]:
# Report the (rows, columns) shape of each split.
print('Train shape: ',X_train.shape)
# NOTE(review): the label says "Test" but the value printed is the validation split X_val.
print('Test shape: ',X_val.shape)

In [None]:
# data preprocessing

In [None]:
class DataPreprocess:
    """Fit a preprocessing pipeline on a dataframe and apply it to that same data."""

    def __init__(self, dataframe: pd.DataFrame, pipe: Pipeline):
        """Keep references to the data and to the pipeline to be fitted.

        Args:
            dataframe: Data the pipeline is fitted on and then transformed.
            pipe: Pipeline object; it is fitted in place, not copied.
        """
        self.pipe = pipe
        self.dataframe = dataframe

    def pipeline(self):
        """Fit the stored pipeline on the stored dataframe and return it.

        Every call fits again; the returned object is the same instance that
        was passed to the constructor.
        """
        self.pipe.fit(self.dataframe)
        return self.pipe

    def run(self):
        """Fit the pipeline, then return the stored dataframe transformed by it."""
        print('Initiating preprocessing...')
        fitted = self.pipeline()
        return fitted.transform(self.dataframe)

In [None]:

# Baseline preprocessing pipeline: impute -> discretise -> scale.
pipe = Pipeline(
    steps=[
        (
            'imputer',
            MeanMedianImputer(variables=['RendaMensal', 'NumeroDeDependentes']),
        ),
        (
            'discretizer',
            EqualFrequencyDiscretiser(
                variables=[
                    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                    'TaxaDeEndividamento',
                    'RendaMensal',
                ]
            ),
        ),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ]
)

In [None]:

# Pair the training features with the baseline pipeline defined above.
dp = DataPreprocess(X_train, pipe)

In [None]:
# Fit the pipeline on X_train and transform X_train with it.
X_train_processed = dp.run()

In [None]:
# First rows of the raw training features, for comparison with the processed frame.
X_train.head()

In [None]:
# First rows of the preprocessed training features.
X_train_processed.head()

In [None]:
# Persist the pipeline that dp.run() already fitted. dp.pipeline() would fit it
# a second time on the same data only to return this same object.
joblib.dump(dp.pipe, 'preprocessor.pkl')

In [None]:
# train models

In [None]:
class TrainModels:
    """Fit a model, persist it to disk, and predict with the persisted copy."""

    def __init__(self, dados_X: pd.DataFrame, dados_y: pd.Series, model_path: str = 'modelo.pkl'):
        """Store the training data and the location of the model artifact.

        Args:
            dados_X: Training features.
            dados_y: Training labels.
            model_path: File the fitted model is written to and read back
                from. Give each experiment its own path to keep earlier
                models from being overwritten.
        """
        self.dados_X = dados_X
        self.dados_y = dados_y
        self.model_path = model_path

    def train(self, model):
        """Fit ``model`` on the stored data and save it to ``model_path``.

        Args:
            model: Estimator exposing ``fit(X, y)``.

        Returns:
            The same ``model`` instance, now fitted.
        """
        model.fit(self.dados_X, self.dados_y)
        joblib.dump(model, self.model_path)
        return model

    def predict(self, dados_para_prever: pd.DataFrame):
        """Return ``predict_proba`` output from the model stored on disk.

        The model is re-loaded from ``model_path`` on every call, so
        predictions always come from the persisted artifact.

        Args:
            dados_para_prever: Features to score, preprocessed the same way
                as the training features.
        """
        return self._load_model().predict_proba(dados_para_prever)

    def _load_model(self):
        """Load the model written by ``train``."""
        # NOTE(review): joblib.load unpickles the file; only load artifacts
        # this notebook produced itself.
        return joblib.load(self.model_path)

In [None]:
# Trainer bound to the preprocessed training features and the training labels.
tm = TrainModels(dados_X=X_train_processed,dados_y = y_train)

In [None]:
# Fit a LogisticRegression with default settings; train() also saves the fitted model to disk.
tm.train(model=LogisticRegression())

In [None]:
# predict_proba output for the training rows, from the model re-loaded from disk.
y_train_pred = tm.predict(X_train_processed)

In [None]:
# Inspect the raw predict_proba output.
y_train_pred

In [None]:
# model evaluation

In [None]:
# Reuse the pipeline already fitted on X_train by dp.run(). dp.pipeline() would
# re-fit it on the same data only to return this same object.
preprocessor = dp.pipe

In [None]:
# Display the pipeline object as the cell output.
preprocessor

In [None]:
# Transform the validation features with the preprocessor; transform() does not re-fit it.
X_val_processed = preprocessor.transform(X_val)

In [None]:
# predict_proba output for the validation rows.
y_val_pred = tm.predict(X_val_processed)

In [None]:
class ModelEvaluation:
    """Score predictions against the true labels."""

    def __init__(self):
        """Nothing to configure; the evaluator holds no state."""

    def eval_metrics(self, dados_reais, dados_preditos):
        """Return the ROC AUC computed by ``roc_auc_score``.

        Args:
            dados_reais: True labels.
            dados_preditos: Predicted scores for the same rows.
        """
        return roc_auc_score(dados_reais, dados_preditos)

In [None]:
# Evaluator used for the ROC AUC computations below.
me = ModelEvaluation()

In [None]:
# Second column of the predict_proba output; presumably the probability of class 1 (confirm against the model's classes_).
y_train_pred[:, 1]

In [None]:
# ROC AUC on the training split.
me.eval_metrics(y_train, y_train_pred[:, 1])

In [None]:
# ROC AUC on the validation split.
me.eval_metrics(y_val, y_val_pred[:, 1])

In [None]:
# experiments

In [None]:
## experiment 1

In [None]:
# Step 1: preprocessing (RobustScaler instead of the baseline StandardScaler)
pipe = Pipeline(
    [
        ('imputer', MeanMedianImputer(variables=['RendaMensal','NumeroDeDependentes'])),
        ('discretizer', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'TaxaDeEndividamento','RendaMensal'])),
        ('scaler', SklearnTransformerWrapper(RobustScaler())),
    ]
)
dp = DataPreprocess(X_train, pipe)
X_train_processed = dp.run()
# Transform the validation features with THIS experiment's fitted pipeline.
# Without this line X_val_processed would still hold the output of the earlier
# pipeline, so the model would be scored on differently scaled data than it
# was trained on.
X_val_processed = pipe.transform(X_val)

#---------------------#
# Step 2: train the model and predict on the validation split
tm = TrainModels(dados_X=X_train_processed,dados_y = y_train)
tm.train(model=LogisticRegression(penalty='l2', max_iter=1500, solver='newton-cholesky'))
y_val_pred = tm.predict(X_val_processed)

#---------------------#
# Step 3: evaluate (ROC AUC on the validation split)
me = ModelEvaluation()
me.eval_metrics(y_val, y_val_pred[:, 1])