In [None]:
!pip install category_encoders --quiet

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV
import lightgbm as lgb
from sklearn.impute import SimpleImputer


In [None]:
# Load the train and test application tables in one pass.
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/home-credit-default-risk/application_train.csv",
        "../input/home-credit-default-risk/application_test.csv",
    )
)

In [None]:
# Preview the first five rows of the training table.
app_train.head(n=5)

In [None]:
# Replace DAYS_BIRTH by its magnitude in both tables.
app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'].abs()
app_test['DAYS_BIRTH'] = app_test['DAYS_BIRTH'].abs()

In [None]:
# Ratio features for the training table (element-wise divisions).
app_train['CREDIT_INCOME_PERCENT'] = app_train['AMT_CREDIT'].div(app_train['AMT_INCOME_TOTAL'])
app_train['ANNUITY_INCOME_PERCENT'] = app_train['AMT_ANNUITY'].div(app_train['AMT_INCOME_TOTAL'])
app_train['CREDIT_TERM'] = app_train['AMT_ANNUITY'].div(app_train['AMT_CREDIT'])
app_train['DAYS_EMPLOYED_PERCENT'] = app_train['DAYS_EMPLOYED'].div(app_train['DAYS_BIRTH'])

In [None]:
# Same ratio features as the training table, computed on the test table.
app_test['CREDIT_INCOME_PERCENT'] = app_test['AMT_CREDIT'].div(app_test['AMT_INCOME_TOTAL'])
app_test['ANNUITY_INCOME_PERCENT'] = app_test['AMT_ANNUITY'].div(app_test['AMT_INCOME_TOTAL'])
app_test['CREDIT_TERM'] = app_test['AMT_ANNUITY'].div(app_test['AMT_CREDIT'])
app_test['DAYS_EMPLOYED_PERCENT'] = app_test['DAYS_EMPLOYED'].div(app_test['DAYS_BIRTH'])

**BUREAU**

In [None]:
# Load the bureau table.
bureau = pd.read_csv(
    filepath_or_buffer='../input/home-credit-default-risk/bureau.csv'
)

**Bureau Balance**

In [None]:
# Load the bureau_balance table.
bureau_balance = pd.read_csv(
    filepath_or_buffer='../input/home-credit-default-risk/bureau_balance.csv'
)

In [None]:
# Sum MONTHS_BALANCE per SK_ID_BUREAU.
# as_index=False keeps SK_ID_BUREAU as a regular column (the previous
# reset_index(drop=True) threw the key away), and selecting MONTHS_BALANCE
# explicitly keeps non-numeric columns out of the sum. The result has exactly
# two columns, [SK_ID_BUREAU, MONTHS_BALANCE], so the positional rename and the
# merge on SK_ID_BUREAU further down operate on the real key.
bureau_balance_agg = bureau_balance.groupby('SK_ID_BUREAU', as_index=False)['MONTHS_BALANCE'].sum()

In [None]:
# Preview the first five rows of the aggregated balance table.
bureau_balance_agg.head(n=5)

In [None]:
# Rename the columns positionally: the first becomes SK_ID_BUREAU and the second
# bureau_balance_months_balance.
# NOTE(review): this assignment is purely positional, so it is only correct if the
# aggregated frame has exactly two columns with the bureau id first — confirm
# against the aggregation that builds bureau_balance_agg.
bureau_balance_agg.columns = ['SK_ID_BUREAU',"bureau_balance_months_balance"]

**MERGE WITH BUREAU**

In [None]:
# Left-join the aggregated balance columns onto bureau by SK_ID_BUREAU,
# keeping every bureau row.
bureau = pd.merge(
    bureau,
    bureau_balance_agg,
    how='left',
    on='SK_ID_BUREAU',
)

**MERGE WITH app_train AND app_test**

In [None]:
# Count the non-null SK_ID_BUREAU entries per SK_ID_CURR and expose the count
# under the name previous_loan_counts.
previous_loan_counts = (
    bureau
    .groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU']
    .count()
    .rename(columns={'SK_ID_BUREAU': 'previous_loan_counts'})
)

In [None]:
# Preview the first five rows of the per-client loan counts.
previous_loan_counts.head(n=5)

In [None]:
# Attach the loan counts to both application tables; the left join keeps
# every application row.
app_train = pd.merge(app_train, previous_loan_counts, how='left', on='SK_ID_CURR')
app_test = pd.merge(app_test, previous_loan_counts, how='left', on='SK_ID_CURR')

In [None]:
# Preview the training table after the merge.
app_train.head(n=5)

**Pipelines**

In [None]:
# Object-typed columns with at most two distinct values (a missing value
# counts as a value).
label_encoder_vars = [
    col
    for col in app_train.select_dtypes(include="object").columns
    if app_train[col].nunique(dropna=False) <= 2
]

In [None]:
# Object-typed columns with more than two distinct values (a missing value
# counts as a value).
dummies_vars = [
    col
    for col in app_train.select_dtypes(include="object").columns
    if app_train[col].nunique(dropna=False) > 2
]

In [None]:
# Numeric columns, leaving out the row identifier and the target.
numerical_vars = [
    col
    for col in app_train.select_dtypes(include="number").columns
    if col not in {"SK_ID_CURR", "TARGET"}
]

In [None]:
# Feature matrix: every column except the identifier and the target.
X = app_train.drop(columns=["SK_ID_CURR", "TARGET"])
# Target vector.
y = app_train.loc[:, "TARGET"]

In [None]:
# One-hot encoding step (OneHotEncoder is the category_encoders implementation
# imported at the top of the notebook).
dummies_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
])

# Ordinal (integer) encoding step.
ordinal_encoder_pipe = Pipeline([
    ("label_encoder", OrdinalEncoder()),
])

# Standardisation step for numeric columns.
numerical_pipe = Pipeline([
    ("standard_scaler", StandardScaler()),
])


# Median imputation followed by a degree-3 polynomial expansion.
polynomial_pipe = Pipeline([
    ("imputer_median", SimpleImputer(strategy='median')),
    ("polynomial_pipe", PolynomialFeatures(degree=3)),
])

In [None]:
# Columns that go through the polynomial pipeline.
poly_colums = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

# Route each group of columns through its own preprocessing pipeline.
column_transformer = ColumnTransformer([
    ("cat_label_encoder", ordinal_encoder_pipe, label_encoder_vars),
    ("cat_dummies", dummies_pipe, dummies_vars),
    ("numerical", numerical_pipe, numerical_vars),
    ("polynomial", polynomial_pipe, poly_colums),
])

# LightGBM binary classifier with balanced class weights and a fixed seed.
model = lgb.LGBMClassifier(
    objective='binary',
    class_weight='balanced',
    n_estimators=2000,
    learning_rate=0.05,
    reg_alpha=0.1,
    reg_lambda=0.1,
    subsample=0.8,
    n_jobs=-1,
    random_state=50,
)

In [None]:
# Small demonstration of dict.get with a fallback value.
info = dict(nome="Matheus", sobrenome="Almeida", idade=24)
info.get("sobrenome", "Não Tem")

In [None]:
class CreditRiskPipeline:
    """Pair a preprocessing transformer with an estimator.

    The preprocessor is fitted on the training features, and the same fitted
    preprocessor is applied to every evaluation set and to any data passed to
    ``predict``.

    Parameters
    ----------
    modelo
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    preprocessamento
        Transformer exposing ``fit_transform`` and ``transform``.
    """

    def __init__(self, modelo, preprocessamento):
        self.modelo = modelo
        self.preprocessamento = preprocessamento

    def fit(self, X, y, args_treino):
        """Fit the preprocessor on ``X`` and then train the estimator.

        Parameters
        ----------
        X, y
            Training features and target.
        args_treino : dict
            Training options. Recognised keys (with defaults):
            ``eval_metric`` ("auc"), ``early_stopping_rounds`` (100),
            ``eval_set`` (None), ``eval_names`` (None), ``verbose`` (200).
            Each ``eval_set`` entry is a sequence whose first element is a raw
            feature matrix; that element is run through the fitted
            preprocessor before being handed to the estimator.

        Returns
        -------
        The fitted ``modelo``.

        Notes
        -----
        ``args_treino`` and the ``eval_set`` entries it holds are never
        modified, so the same dict can be reused for repeated fits, and
        entries may be tuples as well as lists.
        """
        self.eval_metric = args_treino.get("eval_metric", "auc")
        self.early_stopping_rounds = args_treino.get("early_stopping_rounds", 100)
        X_ajustado = self.preprocessamento.fit_transform(X)

        eval_set = args_treino.get("eval_set", None)
        if eval_set is not None:
            # Build a new list instead of overwriting the caller's entries:
            # in-place assignment fails on tuples and would make a second fit
            # re-transform data that has already been transformed.
            eval_set = [
                (self.preprocessamento.transform(entrada[0]), *entrada[1:])
                for entrada in eval_set
            ]

        self.modelo.fit(
            X_ajustado, y,
            eval_metric = self.eval_metric,
            eval_set = eval_set,
            eval_names = args_treino.get("eval_names", None),
            early_stopping_rounds = self.early_stopping_rounds,
            verbose = args_treino.get("verbose", 200)
        )
        return self.modelo

    def predict(self, X, y=None, probs=False):
        """Preprocess ``X`` and return the estimator's predictions.

        Parameters
        ----------
        X
            Raw features; transformed with the already-fitted preprocessor.
        y
            Ignored; not used by this method.
        probs : bool
            When true, return ``modelo.predict_proba`` output; otherwise
            return ``modelo.predict`` output.
        """
        X_ajustado = self.preprocessamento.transform(X)
        if probs:
            return self.modelo.predict_proba(X_ajustado)
        return self.modelo.predict(X_ajustado)

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. train_test_split is already imported in the notebook's
# import cell, so it is not re-imported here.
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Bundle the column transformer and the LightGBM model into one object.
classe_pipeline = CreditRiskPipeline(
    preprocessamento=column_transformer,
    modelo=model,
)

In [None]:
# Options read by CreditRiskPipeline.fit.
train_args = {
    "eval_metric": 'auc',
    "eval_set": [[X_treino, y_treino], [X_teste, y_teste]],
    # Names are matched to eval_set by position: the first entry is the
    # training split and the second is the hold-out split.
    "eval_names": ['train', 'valid'],
    "early_stopping_rounds": 100,
    "verbose": 200
}

In [None]:
# Fit the preprocessing and the model on the training split.
classe_pipeline.fit(X=X_treino, y=y_treino, args_treino=train_args)

In [None]:
# Second column of the predict_proba output for the hold-out split.
classe_pipeline.predict(X=X_teste, probs=True)[:, 1]

In [None]:
# pipeline_modelo = Pipeline(steps=[
#     ("preprocessamento", classe_pipeline.preprocessamento),
#     ("modelo", classe_pipeline.modelo)
# ])
# pipeline_modelo.predict(X_teste)

In [None]:
# Start from an independent copy of the id column so that adding TARGET
# modifies a standalone frame rather than a slice of app_test
# (avoids pandas' SettingWithCopyWarning).
submission = app_test[["SK_ID_CURR"]].copy()
# Second column of the predict_proba output for every test row.
submission["TARGET"] = classe_pipeline.predict(app_test.drop("SK_ID_CURR", axis=1), probs=True)[:, 1]

In [None]:
# Preview the first five rows of the submission.
submission.head(n=5)

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)