In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.outliers import Winsorizer, OutlierTrimmer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV
import lightgbm as lgb
from sklearn.impute import SimpleImputer


In [2]:
app_train = pd.read_csv("../CreditRisk/home-credit-default-risk/application_train.csv")
app_test = pd.read_csv("../CreditRisk/home-credit-default-risk/application_test.csv")

In [3]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])

In [5]:
app_train['CREDIT_INCOME_PERCENT'] = app_train['AMT_CREDIT'] / app_train['AMT_INCOME_TOTAL']
app_train['ANNUITY_INCOME_PERCENT'] = app_train['AMT_ANNUITY'] / app_train['AMT_INCOME_TOTAL']
app_train['CREDIT_TERM'] = app_train['AMT_ANNUITY'] / app_train['AMT_CREDIT']
app_train['DAYS_EMPLOYED_PERCENT'] = app_train['DAYS_EMPLOYED'] / app_train['DAYS_BIRTH']

In [6]:
app_test['CREDIT_INCOME_PERCENT'] = app_test['AMT_CREDIT'] / app_test['AMT_INCOME_TOTAL']
app_test['ANNUITY_INCOME_PERCENT'] = app_test['AMT_ANNUITY'] / app_test['AMT_INCOME_TOTAL']
app_test['CREDIT_TERM'] = app_test['AMT_ANNUITY'] / app_test['AMT_CREDIT']
app_test['DAYS_EMPLOYED_PERCENT'] = app_test['DAYS_EMPLOYED'] / app_test['DAYS_BIRTH']

**BUREAU**

In [7]:
# Read in bureau
bureau = pd.read_csv('../CreditRisk/home-credit-default-risk/bureau.csv')

**Bureau Balance**

In [8]:
bureau_balance = pd.read_csv('../CreditRisk/home-credit-default-risk/bureau_balance.csv')

In [9]:
months_balance = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].agg(['sum'])

In [10]:
bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [11]:
bureau_balance_dummies_vars = [col for col in bureau_balance.select_dtypes("object").columns if len(bureau_balance[col].unique()) > 2]

import numpy as np
# dummies
dummies_pipe = Pipeline(steps=[    
  ('one_hot_encoder', OneHotEncoder())
])
bureau_balance_transformer = ColumnTransformer(transformers=[    
    ("dummies_pipe", dummies_pipe, bureau_balance_dummies_vars),
   
])

bureau_balance_transf = bureau_balance_transformer.fit_transform(bureau_balance)

  elif pd.api.types.is_categorical(cols):


In [12]:
feature_names = bureau_balance_transformer.named_transformers_['dummies_pipe']['one_hot_encoder'].get_feature_names()
bureau_balance_transf = pd.DataFrame(bureau_balance_transf,columns=[feature_names],index=bureau_balance["SK_ID_BUREAU"])

In [13]:
bureau_balance_agg = bureau_balance_transf.groupby('SK_ID_BUREAU').agg(['sum'])

In [14]:
columns=['SK_ID_BUREAU']
# Iterate through the variables names
for var in bureau_balance_agg.columns.levels[0]:
    columns.append('bureau_balance_%s' % (var))
     
        
bureau_balance_agg= bureau_balance_agg.reset_index()
bureau_balance_agg.columns = columns

In [15]:
months_balance = months_balance.reset_index()
months_balance.columns = ['SK_ID_BUREAU',"bureau_balance_months_balance"]

**MERGE WITH BUREAU**

In [16]:
bureau = bureau.merge(months_balance, on = 'SK_ID_BUREAU', how = 'left')

In [17]:
bureau = bureau.merge(bureau_balance_agg, on = 'SK_ID_BUREAU', how = 'left')

In [18]:
bureau[columns] = bureau[columns].fillna(0)

In [19]:
bureau['bureau_balance_months_balance'] =  bureau['bureau_balance_months_balance'].fillna(0)

**MERGE WITH app_train**

In [20]:
previous_loan_counts = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})

In [21]:
app_train = app_train.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')
app_train['previous_loan_counts'] = app_train['previous_loan_counts'].fillna(0)
app_test = app_test.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')
app_test['previous_loan_counts'] = app_test['previous_loan_counts'].fillna(0)

In [22]:
bureau_dummies_vars = [col for col in bureau.select_dtypes("object").columns if len(bureau[col].unique()) > 2]

In [23]:
import numpy as np
# dummies
dummies_pipe = Pipeline(steps=[    
  ('one_hot_encoder', OneHotEncoder())
])
bureau_column_transformer = ColumnTransformer(transformers=[    
    ("dummies_pipe", dummies_pipe, bureau_dummies_vars),
   
])

In [24]:
bureau_transf = bureau_column_transformer.fit_transform(bureau)

  elif pd.api.types.is_categorical(cols):


In [25]:
feature_names = bureau_column_transformer.named_transformers_['dummies_pipe']['one_hot_encoder'].get_feature_names()
bureau_transf = pd.DataFrame(bureau_transf,columns=[feature_names],index=bureau["SK_ID_CURR"])
categorical_grouped = bureau_transf.groupby('SK_ID_CURR').agg(['sum'])
group_var = 'SK_ID_CURR'

# Need to create new column names
columns = []

# Iterate through the variables names
for var in categorical_grouped.columns.levels[0]:
    # Skip the grouping variable
    if var != group_var:
        columns.append('%s_%s' % (var, 'count'))

#  Rename the columns
categorical_grouped.columns = columns

In [26]:
app_train = app_train.merge(categorical_grouped, left_on = 'SK_ID_CURR', right_index = True, how = 'left')
app_train[columns] = app_train[columns].fillna(0)

In [27]:
app_test = app_test.merge(categorical_grouped, left_on = 'SK_ID_CURR', right_index = True, how = 'left')
app_test[columns] = app_test[columns].fillna(0)

**Pipelines**

In [28]:
label_encoder_vars = [col for col in app_train.select_dtypes("object").columns if len(app_train[col].unique()) <= 2]

In [29]:
dummies_vars = [col for col in app_train.select_dtypes("object").columns if len(app_train[col].unique()) > 2]

In [30]:
numerical_vars = [col for col in app_train.select_dtypes("number").columns if col not in ["SK_ID_CURR", "TARGET"]]

In [31]:
X_treino = app_train.drop(["SK_ID_CURR", "TARGET"], axis=1)
y_treino = app_train["TARGET"] 

In [32]:

# dummies
dummies_pipe = Pipeline(steps=[    
  ('one_hot_encoder', OneHotEncoder())
])

# ordinal encoder
ordinal_encoder_pipe = Pipeline(steps=[
  ("label_encoder", OrdinalEncoder())
])

# standard scaler
numerical_pipe = Pipeline(steps=[
  ("standard_scaler", StandardScaler())
])


# Polynomial Features
polynomial_pipe = Pipeline(steps=[ 
    ("imputer_median", SimpleImputer(strategy = 'median')),
    ("polynomial_pipe", PolynomialFeatures(degree = 3))
])

In [33]:
poly_colums = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
# column transformer
column_transformer = ColumnTransformer(transformers=[    
    ("cat_label_encoder", ordinal_encoder_pipe, label_encoder_vars),
    ("cat_dummies", dummies_pipe, dummies_vars),
    ("numerical", numerical_pipe, numerical_vars),    
    ("polynomial",polynomial_pipe,poly_colums),
])

X_treino_transf = column_transformer.fit_transform(X_treino)

  elif pd.api.types.is_categorical(cols):


In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_treino_transf, y_treino, test_size=0.33, random_state=42)
# Create the model
model = lgb.LGBMClassifier(n_estimators=2000, objective = 'binary', 
                           class_weight = 'balanced', learning_rate = 0.05, 
                           reg_alpha = 0.1, reg_lambda = 0.1, 
                           subsample = 0.8, n_jobs = -1, random_state = 50)

# Train the model
model.fit(X_treino_transf, y_treino, eval_metric = 'auc',
          eval_set = [(X_train, y_train), (X_test, y_test)],
          eval_names = ['valid', 'train'],
          early_stopping_rounds = 100, verbose = 200)

Training until validation scores don't improve for 100 rounds
[200]	valid's auc: 0.800872	valid's binary_logloss: 0.549281	train's auc: 0.802938	train's binary_logloss: 0.549434
[400]	valid's auc: 0.828213	valid's binary_logloss: 0.524992	train's auc: 0.830124	train's binary_logloss: 0.525388
[600]	valid's auc: 0.850329	valid's binary_logloss: 0.503735	train's auc: 0.851691	train's binary_logloss: 0.50424
[800]	valid's auc: 0.86928	valid's binary_logloss: 0.48388	train's auc: 0.870171	train's binary_logloss: 0.484492
[1000]	valid's auc: 0.885242	valid's binary_logloss: 0.465977	train's auc: 0.885573	train's binary_logloss: 0.466584
[1200]	valid's auc: 0.898941	valid's binary_logloss: 0.449174	train's auc: 0.899209	train's binary_logloss: 0.449771
[1400]	valid's auc: 0.910789	valid's binary_logloss: 0.433496	train's auc: 0.910824	train's binary_logloss: 0.434044
[1600]	valid's auc: 0.921285	valid's binary_logloss: 0.418578	train's auc: 0.921049	train's binary_logloss: 0.419215
[1800]	va

LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=3000,
               objective='binary', random_state=50, reg_alpha=0.1,
               reg_lambda=0.1, subsample=0.8)

In [63]:
X_text = column_transformer.transform(app_test.drop(["SK_ID_CURR","SK_ID_CURR","TARGET"],axis=1))

In [71]:
prob = model.predict_proba(X_treino_transf)[:,1]


In [73]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(prob,y_treino)

0.24973983269439248

In [None]:
model.predict_proba(X_text)[:,1]

In [65]:
submission = app_test[["SK_ID_CURR","TARGET"]]

In [66]:
submission.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.071937
1,100005,0.360459
2,100013,0.010823
3,100028,0.151261
4,100038,0.665709


In [67]:
submission.to_csv("submission_bureau_25_03_bureau_balance.csv_3000", index=False)