In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this also hides warnings raised by the modelling libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Read the feature-selected dataset from disk and preview its first rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='Feature_Selection.csv')
df.head()

Unnamed: 0,PriorDefault_t,Employed_t,Income,EducationLevel,YearsEmployed,Age,Approved
0,1,1,560.0,0.653846,3.04,58.67,1
1,1,0,824.0,0.653846,1.5,24.5,1
2,1,1,3.0,0.507937,3.75,27.83,1
3,1,0,322.159652,0.507937,1.71,20.17,1
4,1,0,322.159652,0.421053,2.5,32.08,1


In [4]:
#### Independent And Dependent features
# 'Unnamed: 0' is the row index that was written into the CSV, not a real
# feature. The first rows of the file are all approved cases, which suggests the
# file is ordered by label; keeping the index would let the model infer the
# target from the row position (target leakage), so it is dropped together with
# the target column. errors='ignore' keeps the cell working if the CSV is
# re-exported without the index column.
X = df.drop(columns=['Approved', 'Unnamed: 0'], errors='ignore')
y = df['Approved']

In [5]:
#### Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=0,
)

In [6]:
import xgboost as xgb

# Baseline model: XGBoost classifier with the library's default hyperparameters.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier = xgb_classifier.fit(X_train, y_train)
prediction = xgb_classifier.predict(X_test)

XGBoostLibraryNotFound: Cannot find XGBoost Library in the candidate path, did you install compilers and run build.sh in root path?
List of candidates:
C:\Users\sreeh\AppData\Roaming\Python\Python37\site-packages\xgboost\xgboost.dll
C:\Users\sreeh\AppData\Roaming\Python\Python37\site-packages\xgboost\../../lib/xgboost.dll
C:\Users\sreeh\AppData\Roaming\Python\Python37\site-packages\xgboost\./lib/xgboost.dll
C:\Users\sreeh\anaconda3\xgboost\xgboost.dll
C:\Users\sreeh\AppData\Roaming\Python\Python37\site-packages\xgboost\../../windows/x64/Release/xgboost.dll
C:\Users\sreeh\AppData\Roaming\Python\Python37\site-packages\xgboost\./windows/x64/Release/xgboost.dll

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# Report how the baseline predictions compare with the held-out labels.
print('confusion matrix : {}'.format(confusion_matrix(y_test, prediction)))
print('Accuracy Score : {}'.format(accuracy_score(y_test, prediction)))
print(classification_report(y_test, prediction))

In [None]:
### Manual Hyperparameter Tuning
# Hand-picked configuration, fitted on the training split and scored on the
# held-out split with the same three reports as the baseline.
model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.7,
    min_child_weight=3,
)
model = model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

### Randomized Search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of boosting rounds (trees) to try: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Candidate learning rates. Stored as floats: the learning rate is a numeric
# hyperparameter, and string values only work where the library happens to
# coerce them.
learning_rate = [0.05, 0.1, 0.2, 0.3, 0.5, 0.6]
# Maximum depth of each tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Fraction of training rows sampled for each tree
subsample = [0.7, 0.6, 0.8]
# Minimum child weight values
min_child_weight = [3, 4, 5, 6, 7]


# Candidate values per hyperparameter, keyed by the XGBClassifier argument name
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'subsample': subsample,
               'min_child_weight': min_child_weight}

In [None]:
# Base estimator for the hyperparameter search: an XGBoost classifier created
# with default settings.
Classifier=xgb.XGBClassifier()

In [None]:
# Randomized search over random_grid: 100 sampled parameter combinations, each
# scored with 5-fold cross-validation.
# Accuracy is the selection metric because this is a classification task.
# neg_mean_squared_error is a regression metric; on 0/1 labels it ranks the
# candidates identically (it equals accuracy - 1) but reports a misleading score.
xg_random = RandomizedSearchCV(
    estimator=Classifier,
    param_distributions=random_grid,
    scoring='accuracy',
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized search on the training split.
xg_random.fit(X_train,y_train)

In [None]:
# Show the hyperparameter combination the search selected as best.
xg_random.best_params_

In [None]:
# Keep a handle on the estimator built with the best parameters found.
best_random_grid=xg_random.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score

# Score the estimator chosen by the randomized search on the held-out split.
y_pred = best_random_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score {accuracy_score(y_test, y_pred)}")
print(f"Classification report: {classification_report(y_test, y_pred)}")

### Bayesian Optimization

In [None]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [None]:
# Search space for the hyperopt run.
# hp.choice selects one entry from a discrete set of options; hp.quniform draws
# a value between the two bounds, quantised to the step given as last argument.
space = dict(
    max_depth=hp.choice('max_depth', range(5, 30)),
    learning_rate=hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    n_estimators=hp.choice('n_estimators', range(20, 205, 5)),
    gamma=hp.quniform('gamma', 0, 0.50, 0.01),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.quniform('subsample', 0.1, 1, 0.01),
    colsample_bytree=hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
)

In [None]:

def objective(space):
    """Hyperopt objective: cross-validated score of one XGBoost configuration.

    Args:
        space: dict of sampled hyperparameters with the keys 'n_estimators',
            'max_depth', 'learning_rate', 'gamma', 'min_child_weight',
            'subsample' and 'colsample_bytree'.

    Returns:
        dict with 'loss' (the negated mean 5-fold cross-validation score on
        X_train / y_train) and 'status' (STATUS_OK).
    """
    # Imported locally so the function does not depend on an import that lives
    # in a different notebook cell.
    import xgboost as xgb
    from sklearn.model_selection import cross_val_score

    model = xgb.XGBClassifier(n_estimators = space['n_estimators'],
                            max_depth = int(space['max_depth']),
                            learning_rate = space['learning_rate'],
                            gamma = space['gamma'],
                            min_child_weight = space['min_child_weight'],
                            subsample = space['subsample'],
                            colsample_bytree = space['colsample_bytree']
                                 )

    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # The optimiser minimises the loss, so the score is returned negated.
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
from sklearn.model_selection import cross_val_score
# Run 80 evaluations of the objective using the TPE suggestion algorithm; each
# evaluation is recorded in `trials`.
# NOTE(review): for hp.choice parameters fmin reports the index of the chosen
# option rather than its value — confirm before using `best` directly.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=80,
    trials=trials,
)
best

In [None]:
import xgboost as xgb
from hyperopt import space_eval

# fmin reports hp.choice parameters ('max_depth', 'n_estimators') as the INDEX
# of the selected option, not its value. space_eval maps the result back onto
# the search space so the final model is trained with the hyperparameters that
# were actually evaluated during the search.
best_params = space_eval(space, best)

trainedforest = xgb.XGBClassifier(n_estimators = int(best_params['n_estimators']),
                            max_depth = int(best_params['max_depth']),
                            learning_rate = best_params['learning_rate'],
                            gamma = best_params['gamma'],
                            min_child_weight = best_params['min_child_weight'],
                            subsample = best_params['subsample'],
                            colsample_bytree = best_params['colsample_bytree']).fit(X_train,y_train)

# Evaluate the tuned model on the held-out split.
prediction = trainedforest.predict(X_test)
acc5 = accuracy_score(y_test,prediction)
print(confusion_matrix(y_test,prediction))
print(acc5)
print(classification_report(y_test,prediction))

### TPOT

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of boosting rounds (trees) to try: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Candidate learning rates, stored as floats because the learning rate is a
# numeric hyperparameter.
learning_rate = [0.05, 0.1, 0.2, 0.3, 0.5, 0.6]
# Maximum depth of each tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Fraction of training rows sampled for each tree
subsample = [0.7, 0.6, 0.8]
# Minimum child weight values
min_child_weight = [3, 4, 5, 6, 7]


# Candidate values per hyperparameter, keyed by the XGBClassifier argument name
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'subsample': subsample,
               'min_child_weight': min_child_weight}

In [None]:
from tpot import TPOTClassifier
from xgboost import XGBClassifier

# Evolutionary pipeline search: 5 generations, a population of 24 pipelines and
# 12 offspring per generation, each pipeline scored by 4-fold CV accuracy.
# random_state pins the otherwise stochastic search so that re-running the
# notebook yields the same pipeline.
# NOTE(review): early_stop (12) is larger than generations (5), so it looks
# like early stopping can never trigger — confirm the intended values.
tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                # config_dict={'xgboost.XGBClassifier': param}, 
                                 cv = 4, scoring = 'accuracy',
                                 random_state = 42)
tpot_classifier.fit(X_train,y_train)

In [None]:
# Score the fitted TPOT classifier on the held-out split and print the result.
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)