In [156]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

#### Utilizaremos SMOTE para tratar las clases desbalanceadas

In [157]:
train = pd.read_csv('./data/train_mod.csv')
test = pd.read_csv('./data/test_mod.csv')

In [158]:
train.drop(['id'], axis=1, inplace=True)

In [159]:
test_id = test['id']
test.drop(['id'], axis=1, inplace=True)

In [160]:
X = train.drop(['loan_status'], axis=1)
y = train['loan_status']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [161]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
x_smote, y_smote = smote.fit_resample(X_train, y_train)

In [162]:
# from scipy import stats
# param_dist = {'learning_rate': stats.uniform(0.01, 0.99),
#               'max_depth': stats.randint(2, 12),
#               'subsample': stats.uniform(0.1, 1),
#               'colsample_bytree': stats.uniform(0.1, 1),
#               'n_estimators': stats.randint(50, 1000),
#               'reg_lambda': stats.uniform(0.0001, 1),
#               'reg_alpha': stats.uniform(0.0001, 1)
#               }

# classifier = XGBClassifier(use_label_encoder=False,random_state=42)
# cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
# xgb_cv = RandomizedSearchCV(classifier,param_dist,cv=cv, verbose=1,scoring='roc_auc',n_iter=10,random_state=42)

# xgb_cv.fit(X_train,y_train)
# print(f'AUC CV: {round(xgb_cv.best_score_,2)}')

In [163]:
from skopt.space import Integer, Real,Categorical
search_space = {'learning_rate': Real(0.01, 1.0, 'uniform'),
                 'max_depth': Integer(2, 12),
                 'subsample': Real(0.1, 1.0, 'uniform'),
                 'colsample_bytree': Real(0.1, 1.0, 'uniform'), # subsample ratio of columns by tree
                 'reg_lambda': Real(1e-9, 100., 'uniform'), # L2 regularization
                 'reg_alpha': Real(1e-9, 100., 'uniform'), # L1 regularization
                 'n_estimators': Integer(50, 1000)
   }

In [164]:
from skopt import BayesSearchCV

#Instanciamos el objeto BayesSearchCV
opt = BayesSearchCV(
    XGBClassifier(random_state=42, use_label_encoder=False),
    search_spaces=search_space,
    scoring='roc_auc',
    n_iter=10,
    cv=5,
    random_state=42
)

#Ajustamos el objeto
opt.fit(X, y)

print("val. score: %s" % round(opt.best_score_,2))

val. score: 0.95


In [165]:
# y_pred = opt.predict(X_val)

# print(classification_report(y_val, y_pred))

In [166]:
# print('ROC-AUC score WITHOUT SMOTE - optimizacion bayesiana:',round(roc_auc_score(y_val, y_pred),2))

In [167]:
test = scaler.fit_transform(test)
y_pred = opt.predict(test)
predictions = pd.DataFrame(y_pred)
y_pred

predictions = predictions.reset_index().rename(columns={
    'index':'id',
    0:'loan_status'
})

predictions['id'] = test_id

predictions.head()

Unnamed: 0,id,loan_status
0,58645,1
1,58646,0
2,58647,0
3,58648,0
4,58649,0


In [168]:
predictions.to_csv("xgboost_no_validation.csv", index=False)