## Modify

In [1]:
import pandas as pd
import numpy as np

from feature_engine.imputation import EndTailImputer, AddMissingIndicator, CategoricalImputer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.discretisation import DecisionTreeDiscretiser
from feature_engine.encoding import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline


In [2]:
# Folder that holds the train / validation / test CSV splits.
# It is the only machine-specific value in this cell: edit this one line to
# run the notebook from another checkout.
DATA_DIR = '/home/tomas/Documentos/gh-repos/Trabalho-APC-ASN'

# Training split (features, then target).
file_path_1 = DATA_DIR + '/X_train.csv'
X_train = pd.read_csv(file_path_1)

file_path_2 = DATA_DIR + '/y_train.csv'
y_train = pd.read_csv(file_path_2)

# Validation split.
file_path_3 = DATA_DIR + '/X_val.csv'
X_val = pd.read_csv(file_path_3)

file_path_4 = DATA_DIR + '/y_val.csv'
y_val = pd.read_csv(file_path_4)

# Test split.
file_path_5 = DATA_DIR + '/X_test.csv'
X_test = pd.read_csv(file_path_5)

file_path_6 = DATA_DIR + '/y_test.csv'
y_test = pd.read_csv(file_path_6)

In [3]:
class DataTypeTransformer(BaseEstimator, TransformerMixin):
    """Cast selected DataFrame columns to a single dtype.

    Parameters
    ----------
    columns_to_convert : list of column labels or str, optional
        Columns to cast. ``None`` (the default) casts every column of the
        frame given to ``transform``. A bare string is treated as one label.
    dtype : str or type, default 'object'
        Target dtype, forwarded unchanged to ``DataFrame.astype``.
    """

    def __init__(self, columns_to_convert=None, dtype='object'):
        self.columns_to_convert = columns_to_convert
        self.dtype = dtype

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new frame with the selected columns cast to ``self.dtype``.

        The frame passed in is not modified: the cast is done with
        ``DataFrame.astype``, which returns a new object, instead of
        assigning the converted columns back into ``X``.

        Raises
        ------
        KeyError
            If a requested column is not present in ``X``.
        """
        columns = self.columns_to_convert
        if columns is None:
            # No selection given: cast the whole frame.
            return X.astype(self.dtype)

        if isinstance(columns, str):
            # A lone string would otherwise be iterated character by character.
            columns = [columns]

        return X.astype({col: self.dtype for col in columns})


In [4]:
# Numeric columns, detected from the dtypes of the training frame.
num_var = X_train.select_dtypes(include=np.number).columns.tolist()

# --- Step 1: drop the REASON and JOB columns ---------------------------------
cat = ['REASON', 'JOB']
Features_Droper = DropFeatures(features_to_drop=cat)

# --- Step 2: end-of-tail imputation, applied to VALUE and DEBTINC only -------
End_Tail_Imputer = EndTailImputer(
    imputation_method='gaussian',
    tail='right',
    fold=3,
    variables=['VALUE', 'DEBTINC'],
)

# Columns handed to the iterative imputer in step 4
# (the numeric columns not listed in step 2).
variables = [
    'LOAN', 'MORTDUE', 'YOJ', 'DEROG',
    'DELINQ', 'CLAGE', 'NINQ', 'CLNO',
]

# --- Step 3: missing-value indicators over the numeric columns ---------------
Add_Missing_Indicator = AddMissingIndicator(
    missing_only=True,
    variables=num_var,
)

# --- Step 4: iterative imputation driven by a decision-tree regressor --------
# Both random_state values are pinned to 59 so repeated runs use the same seed.
_nonLin = IterativeImputer(
    estimator=DecisionTreeRegressor(max_features='sqrt', random_state=59),
    max_iter=1000,
    random_state=59,
)
imputer_nonLin = SklearnTransformerWrapper(
    transformer=_nonLin,
    variables=variables,
)

# --- Step 5: decision-tree discretisation of every numeric column ------------
# regression=False with 'accuracy' scoring; min_samples_leaf is searched
# over [100, 200] with 5-fold cross-validation.
treeDisc = DecisionTreeDiscretiser(
    cv=5,
    scoring='accuracy',
    variables=num_var,
    regression=False,
    param_grid={'min_samples_leaf': [100, 200]},
)

# --- Step 6: cast the numeric columns to object ------------------------------
# NOTE(review): presumably needed so the one-hot encoder below accepts these
# columns as categorical - confirm against the feature_engine version in use.
dtype_transformer = DataTypeTransformer(
    columns_to_convert=num_var,
    dtype='object',
)

# --- Step 7: one-hot encode the listed columns, dropping the last dummy ------
cols_one_hot = [
    'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG',
    'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC',
]
One_Hot_Encoder = OneHotEncoder(
    drop_last=True,
    variables=cols_one_hot,
)

# Chain the steps in the order they were defined above.
pipeline = Pipeline([
    ('FeaturesDroper', Features_Droper),
    ('EndTailImputer', End_Tail_Imputer),
    ('AddMissingIndicator', Add_Missing_Indicator),
    ('ImputerNonLin', imputer_nonLin),
    ('TreeDiscretiser', treeDisc),
    ('DTypeTransformer', dtype_transformer),
    ('OneHotEncoder', One_Hot_Encoder),
])

# Fit every step on the training split only.
pipeline.fit(X_train, y_train)



In [5]:
# Run the fitted pipeline over each split, in train / validation / test order.
train_t, val_t, test_t = (
    pipeline.transform(split) for split in (X_train, X_val, X_test)
)

In [6]:
# Sanity check: (rows, columns) of the transformed training split.
train_t.shape

(3576, 174)

In [7]:
# Total number of missing cells left in each transformed split,
# one count per line in train / validation / test order.
print(
    *(split.isna().sum().sum() for split in (train_t, val_t, test_t)),
    sep='\n',
)

0
0
0


In [None]:
# Folder that receives the transformed splits.
# It is the only machine-specific value in this cell: edit this one line to
# write the files somewhere else.
OUTPUT_DIR = '/home/tomas/Documentos/gh-repos/Trabalho-APC-ASN'

# NOTE(review): per the pandas docs, index_label=False still writes the index
# values but leaves their header field out - confirm this layout is intended.
file_path_1 = OUTPUT_DIR + '/X_train_M.csv'
train_t.to_csv(file_path_1, index_label=False)

file_path_2 = OUTPUT_DIR + '/X_val_M.csv'
val_t.to_csv(file_path_2, index_label=False)

file_path_3 = OUTPUT_DIR + '/X_test_M.csv'
test_t.to_csv(file_path_3, index_label=False)