In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Folder holding the Home Credit CSV files. Set the CREDIT_RISK_DATA_DIR
# environment variable to point at another location; when it is unset the
# original machine-specific folder is used, so existing runs are unchanged.
path_data_folder = Path(
    os.environ.get(
        "CREDIT_RISK_DATA_DIR",
        r"D:\coding\Jupyter_notebooks\ML\credit_risk_data",
    )
)

# Individual input files, all resolved relative to the data folder.
path_train = path_data_folder / "application_train.csv"
path_test = path_data_folder / "application_test.csv"
path_col_description = path_data_folder / "HomeCredit_columns_description.csv"

In [3]:
# Custom scikit-learn transformers for the data pipeline, derived from the EDA results.

from sklearn.base import TransformerMixin, BaseEstimator

class PrimaryTransformer(TransformerMixin, BaseEstimator):
    """Drop the columns listed below and cast categorical features.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a new DataFrame, leaving the frame passed in untouched.
    """

    # Columns removed outright by ``transform``.
    unnecessary_columns = [
        "SK_ID_CURR",
        "NAME_TYPE_SUITE",
        "DAYS_BIRTH",
        "DAYS_EMPLOYED",
        "DAYS_REGISTRATION",
        "DAYS_ID_PUBLISH",
        "WEEKDAY_APPR_PROCESS_START",
        "HOUR_APPR_PROCESS_START",
        "REGION_RATING_CLIENT",
        "DAYS_LAST_PHONE_CHANGE",
        "OBS_30_CNT_SOCIAL_CIRCLE",
        "DEF_30_CNT_SOCIAL_CIRCLE",
        "OBS_60_CNT_SOCIAL_CIRCLE",
        "FLAG_MOBIL",
        "FLAG_EMP_PHONE",
        "FLAG_WORK_PHONE",
        "FLAG_PHONE",
        "FLAG_EMAIL",
        "FLAG_CONT_MOBILE",
    ]
    # Features cast to the pandas ``category`` dtype when present in the frame.
    feat_categorical = {
        "TARGET",
        "NAME_CONTRACT_TYPE",
        "CODE_GENDER",
        "FLAG_OWN_CAR",
        "FLAG_OWN_REALTY",
        "NAME_INCOME_TYPE",
        "NAME_EDUCATION_TYPE",
        "NAME_FAMILY_STATUS",
        "NAME_HOUSING_TYPE",
        "OCCUPATION_TYPE",
        "REGION_RATING_CLIENT_W_CITY",
        "REG_REGION_NOT_LIVE_REGION",
        "REG_REGION_NOT_WORK_REGION",
        "LIVE_REGION_NOT_WORK_REGION",
        "REG_CITY_NOT_LIVE_CITY",
        "REG_CITY_NOT_WORK_CITY",
        "LIVE_CITY_NOT_WORK_CITY",
        "ORGANIZATION_TYPE",
    }
    # Also dropped by ``transform``; presumably flagged as highly correlated
    # with retained features during EDA -- TODO confirm against the EDA notes.
    feat_high_correlating = ["CNT_CHILDREN", "AMT_GOODS_PRICE", "AMT_ANNUITY"]

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``.

        Present so that ``fit_transform`` and ``Pipeline.fit`` work; the
        transformer has no parameters to learn.
        """
        return self

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` with columns dropped and categoricals cast.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. It is not modified.

        Returns
        -------
        pandas.DataFrame
            New frame without ``unnecessary_columns``, ``feat_high_correlating``
            and every column whose name starts with ``"FLAG_DOCUMENT_"``; the
            ``feat_categorical`` columns that exist are cast to ``category``.

        Raises
        ------
        KeyError
            If a column from ``unnecessary_columns`` or
            ``feat_high_correlating`` is missing from ``X``.
        """
        document_flags = [col for col in X.columns if col.startswith("FLAG_DOCUMENT_")]
        trimmed = X.drop(
            columns=[
                *self.unnecessary_columns,
                *self.feat_high_correlating,
                *document_flags,
            ]
        )

        # Cast only the categorical features that are actually present, so a
        # frame without e.g. "TARGET" (presumably the test set -- verify) does
        # not raise KeyError.
        present = [name for name in sorted(self.feat_categorical) if name in trimmed.columns]
        return trimmed.astype({name: "category" for name in present})
    
class RequestsColumnTransformer(TransformerMixin, BaseEstimator):
    """Collapse the per-period ``AMT_REQ_CREDIT_BUREAU_*`` columns into one total.

    Stateless: ``fit`` learns nothing and ``transform`` returns a new
    DataFrame without modifying its input.
    """

    # Source columns that are summed row-wise and then dropped.
    requests_columns = [
        "AMT_REQ_CREDIT_BUREAU_HOUR",
        "AMT_REQ_CREDIT_BUREAU_DAY",
        "AMT_REQ_CREDIT_BUREAU_WEEK",
        "AMT_REQ_CREDIT_BUREAU_MON",
        "AMT_REQ_CREDIT_BUREAU_QRT",
        "AMT_REQ_CREDIT_BUREAU_YEAR",
    ]
    # Name of the column that receives the row-wise total.
    new_requests_col = "AMT_REQ_CREDIT_BUREAU"

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` (required by ``fit_transform``/``Pipeline.fit``)."""
        return self

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` with the request columns replaced by their sum.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame containing every column in ``requests_columns``. It is
            not modified.

        Returns
        -------
        pandas.DataFrame
            New frame without ``requests_columns`` and with the total stored
            under ``new_requests_col``.

        Raises
        ------
        KeyError
            If any of ``requests_columns`` is missing from ``X``.
        """
        # NOTE(review): DataFrame.sum skips NaN by default, so a row where all
        # request columns are missing gets a total of 0 -- confirm this is the
        # intended encoding for "no bureau information".
        total_requests = X[self.requests_columns].sum(axis=1)
        remaining = X.drop(columns=self.requests_columns)
        return remaining.assign(**{self.new_requests_col: total_requests})

class BuildingColumnsTransformer(TransformerMixin, BaseEstimator):
    """Replace the building-related columns with one availability flag.

    Stateless: ``fit`` learns nothing and ``transform`` returns a new
    DataFrame without modifying its input.
    """

    # Columns checked for missing values and then dropped.
    building_columns = [
        "APARTMENTS_AVG",
        "BASEMENTAREA_AVG",
        "YEARS_BEGINEXPLUATATION_AVG",
        "YEARS_BUILD_AVG",
        "COMMONAREA_AVG",
        "ELEVATORS_AVG",
        "ENTRANCES_AVG",
        "FLOORSMAX_AVG",
        "FLOORSMIN_AVG",
        "LANDAREA_AVG",
        "LIVINGAPARTMENTS_AVG",
        "LIVINGAREA_AVG",
        "NONLIVINGAPARTMENTS_AVG",
        "NONLIVINGAREA_AVG",
        "APARTMENTS_MODE",
        "BASEMENTAREA_MODE",
        "YEARS_BEGINEXPLUATATION_MODE",
        "YEARS_BUILD_MODE",
        "COMMONAREA_MODE",
        "ELEVATORS_MODE",
        "ENTRANCES_MODE",
        "FLOORSMAX_MODE",
        "FLOORSMIN_MODE",
        "LANDAREA_MODE",
        "LIVINGAPARTMENTS_MODE",
        "LIVINGAREA_MODE",
        "NONLIVINGAPARTMENTS_MODE",
        "NONLIVINGAREA_MODE",
        "APARTMENTS_MEDI",
        "BASEMENTAREA_MEDI",
        "YEARS_BEGINEXPLUATATION_MEDI",
        "YEARS_BUILD_MEDI",
        "COMMONAREA_MEDI",
        "ELEVATORS_MEDI",
        "ENTRANCES_MEDI",
        "FLOORSMAX_MEDI",
        "FLOORSMIN_MEDI",
        "LANDAREA_MEDI",
        "LIVINGAPARTMENTS_MEDI",
        "LIVINGAREA_MEDI",
        "NONLIVINGAPARTMENTS_MEDI",
        "NONLIVINGAREA_MEDI",
        "FONDKAPREMONT_MODE",
        "HOUSETYPE_MODE",
        "TOTALAREA_MODE",
        "WALLSMATERIAL_MODE",
        "EMERGENCYSTATE_MODE",
    ]

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` (required by ``fit_transform``/``Pipeline.fit``)."""
        return self

    def transform(self, X, **params):
        """Return a copy of ``X`` with the building columns replaced by a flag.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame containing every column in ``building_columns``. It is
            not modified.

        Returns
        -------
        pandas.DataFrame
            New frame without ``building_columns`` and with a boolean
            ``BUILDING_INFO_AVAILABLE`` column that is True only for rows in
            which none of the building columns is null.

        Raises
        ------
        KeyError
            If any of ``building_columns`` is missing from ``X``.
        """
        # A row counts as "available" only when every building column is filled.
        fully_populated = X[self.building_columns].notna().all(axis=1)
        remaining = X.drop(columns=self.building_columns)
        return remaining.assign(BUILDING_INFO_AVAILABLE=fully_populated)

class CarOwnAgeTransformer(TransformerMixin, BaseEstimator):
    """Derive ``FLAG_OWN_CAR`` from ``OWN_CAR_AGE`` and drop the age column.

    Stateless: ``fit`` learns nothing and ``transform`` returns a new
    DataFrame without modifying its input.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` (required by ``fit_transform``/``Pipeline.fit``)."""
        return self

    def transform(self, X, **params):
        """Return a copy of ``X`` with ``FLAG_OWN_CAR`` rebuilt from ``OWN_CAR_AGE``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame containing an ``OWN_CAR_AGE`` column. It is not
            modified.

        Returns
        -------
        pandas.DataFrame
            New frame without ``OWN_CAR_AGE``; ``FLAG_OWN_CAR`` is a boolean
            column that is True where ``OWN_CAR_AGE`` was not null. An existing
            ``FLAG_OWN_CAR`` column is overwritten in place, otherwise the
            column is appended.

        Raises
        ------
        KeyError
            If ``OWN_CAR_AGE`` is missing from ``X``.
        """
        has_car_age = X["OWN_CAR_AGE"].notna()
        remaining = X.drop(columns=["OWN_CAR_AGE"])
        return remaining.assign(FLAG_OWN_CAR=has_car_age)
    
class DropExternalSourcesColumns(TransformerMixin, BaseEstimator):
    """Remove every column whose name starts with ``"EXT_SOURCE_"``.

    Stateless: ``fit`` learns nothing and ``transform`` returns a new
    DataFrame without modifying its input.
    """

    def fit(self, X, y=None, **params):
        """Do nothing and return ``self``.

        Returning the estimator (not ``X``) is what lets
        ``fit(X).transform(X)`` and ``fit_transform`` call this class's
        ``transform`` rather than a method of the DataFrame.
        """
        return self

    def transform(self, X, **params):
        """Return a copy of ``X`` without the ``EXT_SOURCE_*`` columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. It is not modified. Frames with no matching columns
            are returned as an unchanged copy.

        Returns
        -------
        pandas.DataFrame
            New frame without the columns prefixed ``"EXT_SOURCE_"``.
        """
        external_sources = [col for col in X.columns if col.startswith("EXT_SOURCE_")]
        return X.drop(columns=external_sources)

In [4]:
from sklearn.pipeline import Pipeline, make_pipeline

# Preprocessing pipeline: every transformer class is instantiated with its
# defaults and the instances are chained in the order listed here.
data_pipeline = make_pipeline(
    *(
        step_cls()
        for step_cls in (
            PrimaryTransformer,
            CarOwnAgeTransformer,
            RequestsColumnTransformer,
            BuildingColumnsTransformer,
            DropExternalSourcesColumns,
        )
    )
)

In [None]:
# Load the training CSV into memory; this raw frame is kept as-is and later
# cells work on copies of it.
data_original = pd.read_csv(path_train)

In [7]:
# Run the preprocessing steps on a copy so the raw frame in data_original
# stays available unchanged.
# NOTE(review): transform is called without a prior fit; no fit call is visible
# in this part of the notebook -- confirm that is intentional.
data = data_pipeline.transform(data_original.copy())

In [None]:
# Summary statistics of the transformed frame, shown as the cell's output.
data.describe()