In [49]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from functions import create_probs_series
import random

In [3]:
#boostowane drzewa decyzyjne

df=pd.read_excel("credit_card_data.xlsx",index_col="ID")

features=df.loc[:,:"PAY_AMT6"].columns.tolist()

In [6]:
#tworzymy funkcje do wywolywania boostowanych drzew decyzyjnych

def run_xgb(train,validate,features,target,
           eta=0.03,max_depth=7,num_boost_round=1000,early_stopping_rounds=50,
           lambdaX=0,alpha=0,gamma=0,min_child_weight=0,
           subsample=0.7, colsample_bytree=0.7, colsample_bylevel=1,
           tree_method='hist',grow_policy='depthwise',eval_metric='aucpr', seed=111,
           objective='binary:logistic', verbosity=0, booster='gbtree',verbose_eval=False):
    dtrain=xgb.DMatrix(train[features],train[target],feature_names=features)
    dvalidate=xgb.DMatrix(validate[features],validate[target],feature_names=features)
    evals=[(dtrain,'train'),(dvalidate,'validate')]
    params=['eta','max_depth','alpha','gamma','min_child_weight','subsample','colsample_bytree',
           'colsample_bylevel','tree_method','grow_policy','eval_metric','seed','objective','verbosity',
           'booster']
    Params=dict()
    for param in params:
        Params[param]=eval(param)
    Params['lambda']=lambdaX
    clf=xgb.train(Params,dtrain,num_boost_round,evals,early_stopping_rounds=early_stopping_rounds,verbose_eval=verbose_eval)
    score=clf.best_score
    valResults=clf.predict(dvalidate,ntree_limit=clf.best_ntree_limit)
    imp=clf.get_fscore()
    imp=[(x,y) for x,y in imp.items()]
    imp.sort(key=lambda x: x[1],reverse=True)
    return score,valResults,imp
    

In [7]:
#na podstawie poprzedniej funkcji tworzymy wrapper do walidacji krzyzowej

def CV_XGB(df=df,n_splits=5,random_state=111,features=features,*args,**kwargs):
    test_precisions=[]
    probs=[]
    indices=[]
    skf=StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=random_state)
    for train,test in skf.split(df[features],df.y):
        score,valResults,imp=run_xgb(df.iloc[train],df.iloc[test],features,'y',seed=random_state,*args,**kwargs)
        probs.append(valResults.tolist())
        indices.append(test.tolist())
        test_precisions.append(score)
    return test_precisions, probs, indices, imp

In [8]:
test_precisions, probs, indices, imp=CV_XGB()
print(np.mean(test_precisions))

0.9140236000000002


In [9]:
#tak jak w przypadku pozostalych algorytmow sprawdze czy dodanie kwadratu wieku poprawi model

features2=features.copy()
features2.append("AGE_SQ")

test_precisions, probs, indices, imp=CV_XGB(features=features2)
print(np.mean(test_precisions))

0.9139047999999999


In [11]:
#nie poprawilo modelu

#sprawdzmy czy dodanie kwadratu zmiennych PAY pomoze

features2=features.copy()

cols=df.loc[:,"PAY_0_SQ":"PAY_6_SQ"].columns.tolist()
features2.extend(cols)

test_precisions, probs, indices, imp=CV_XGB(features=features2)
print(np.mean(test_precisions))

0.9140016


In [12]:
#brak poprawy

In [13]:
#sprawdzmy jak bardzo przetrenowuje sie algorytm

test_precisions, probs, indices, imp=CV_XGB(features=features,verbose_eval=True)
print(np.mean(test_precisions))

[0]	train-aucpr:0.90565	validate-aucpr:0.87858
[1]	train-aucpr:0.91347	validate-aucpr:0.89456
[2]	train-aucpr:0.91559	validate-aucpr:0.89826
[3]	train-aucpr:0.91930	validate-aucpr:0.90151
[4]	train-aucpr:0.92021	validate-aucpr:0.90269
[5]	train-aucpr:0.92005	validate-aucpr:0.90402
[6]	train-aucpr:0.92074	validate-aucpr:0.90472
[7]	train-aucpr:0.92055	validate-aucpr:0.90473
[8]	train-aucpr:0.92120	validate-aucpr:0.90432
[9]	train-aucpr:0.92223	validate-aucpr:0.90513
[10]	train-aucpr:0.92289	validate-aucpr:0.90612
[11]	train-aucpr:0.92276	validate-aucpr:0.90713
[12]	train-aucpr:0.92341	validate-aucpr:0.90773
[13]	train-aucpr:0.92376	validate-aucpr:0.90771
[14]	train-aucpr:0.92374	validate-aucpr:0.90830
[15]	train-aucpr:0.92475	validate-aucpr:0.90909
[16]	train-aucpr:0.92495	validate-aucpr:0.90987
[17]	train-aucpr:0.92504	validate-aucpr:0.90951
[18]	train-aucpr:0.92498	validate-aucpr:0.90971
[19]	train-aucpr:0.92534	validate-aucpr:0.90964
[20]	train-aucpr:0.92595	validate-aucpr:0.91006
[2

[65]	train-aucpr:0.93576	validate-aucpr:0.92084
[66]	train-aucpr:0.93605	validate-aucpr:0.92077
[67]	train-aucpr:0.93639	validate-aucpr:0.92100
[68]	train-aucpr:0.93664	validate-aucpr:0.92079
[69]	train-aucpr:0.93705	validate-aucpr:0.92048
[70]	train-aucpr:0.93722	validate-aucpr:0.92064
[71]	train-aucpr:0.93761	validate-aucpr:0.92087
[72]	train-aucpr:0.93766	validate-aucpr:0.92086
[73]	train-aucpr:0.93799	validate-aucpr:0.92106
[74]	train-aucpr:0.93827	validate-aucpr:0.92093
[75]	train-aucpr:0.93845	validate-aucpr:0.92097
[76]	train-aucpr:0.93869	validate-aucpr:0.92096
[77]	train-aucpr:0.93887	validate-aucpr:0.92104
[78]	train-aucpr:0.93916	validate-aucpr:0.92103
[79]	train-aucpr:0.93948	validate-aucpr:0.92105
[80]	train-aucpr:0.93967	validate-aucpr:0.92096
[81]	train-aucpr:0.93971	validate-aucpr:0.92097
[82]	train-aucpr:0.93999	validate-aucpr:0.92087
[83]	train-aucpr:0.94013	validate-aucpr:0.92083
[84]	train-aucpr:0.94038	validate-aucpr:0.92078
[85]	train-aucpr:0.94073	validate-aucpr:

[23]	train-aucpr:0.92869	validate-aucpr:0.90852
[24]	train-aucpr:0.92873	validate-aucpr:0.90837
[25]	train-aucpr:0.92896	validate-aucpr:0.90846
[26]	train-aucpr:0.92925	validate-aucpr:0.90857
[27]	train-aucpr:0.92945	validate-aucpr:0.90854
[28]	train-aucpr:0.92937	validate-aucpr:0.90866
[29]	train-aucpr:0.92970	validate-aucpr:0.90875
[30]	train-aucpr:0.93002	validate-aucpr:0.90856
[31]	train-aucpr:0.93010	validate-aucpr:0.90863
[32]	train-aucpr:0.93015	validate-aucpr:0.90863
[33]	train-aucpr:0.93012	validate-aucpr:0.90851
[34]	train-aucpr:0.93018	validate-aucpr:0.90863
[35]	train-aucpr:0.93041	validate-aucpr:0.90850
[36]	train-aucpr:0.93059	validate-aucpr:0.90845
[37]	train-aucpr:0.93071	validate-aucpr:0.90849
[38]	train-aucpr:0.93070	validate-aucpr:0.90845
[39]	train-aucpr:0.93075	validate-aucpr:0.90858
[40]	train-aucpr:0.93106	validate-aucpr:0.90861
[41]	train-aucpr:0.93156	validate-aucpr:0.90842
[42]	train-aucpr:0.93170	validate-aucpr:0.90834
[43]	train-aucpr:0.93175	validate-aucpr:

[114]	train-aucpr:0.94869	validate-aucpr:0.91257
[115]	train-aucpr:0.94880	validate-aucpr:0.91251
[116]	train-aucpr:0.94898	validate-aucpr:0.91261
[117]	train-aucpr:0.94915	validate-aucpr:0.91279
[118]	train-aucpr:0.94927	validate-aucpr:0.91271
[119]	train-aucpr:0.94940	validate-aucpr:0.91280
[120]	train-aucpr:0.94966	validate-aucpr:0.91280
[121]	train-aucpr:0.94984	validate-aucpr:0.91270
[122]	train-aucpr:0.95013	validate-aucpr:0.91290
[123]	train-aucpr:0.95047	validate-aucpr:0.91294
[124]	train-aucpr:0.95075	validate-aucpr:0.91286
[125]	train-aucpr:0.95105	validate-aucpr:0.91280
[126]	train-aucpr:0.95165	validate-aucpr:0.91273
[127]	train-aucpr:0.95198	validate-aucpr:0.91257
[128]	train-aucpr:0.95209	validate-aucpr:0.91254
[129]	train-aucpr:0.95223	validate-aucpr:0.91249
[130]	train-aucpr:0.95256	validate-aucpr:0.91258
[131]	train-aucpr:0.95305	validate-aucpr:0.91253
[132]	train-aucpr:0.95330	validate-aucpr:0.91252
[133]	train-aucpr:0.95339	validate-aucpr:0.91249
[134]	train-aucpr:0.

In [16]:
#algorym nie przetrenowuje sie w znaczacym stopniu, wiec tuning hiperparametrow zaczne od parametrow innych niz regularyzacyjne    

#najpierw sprawdze glebokosc drzewa
for k in range(4,16,2):
    test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=k)
    print(k,np.mean(test_precisions))

4 0.9137292
6 0.9141302
8 0.9140662
10 0.9124104000000001
12 0.9106826
14 0.9094541999999999


In [18]:
# % uzytych obserwacji
for k in range(2,10):
    test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=k/10)
    print(k/10,np.mean(test_precisions))

0.2 0.9131915999999999
0.3 0.9141221999999999
0.4 0.9136228
0.5 0.9140121999999999
0.6 0.9136793999999998
0.7 0.9141302
0.8 0.9137982000000001
0.9 0.9146618


In [19]:
# % zmiennych uzytych na drzewo
for k in range(2,10):
    test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=0.9,colsample_bytree=k/10)
    print(k/10,np.mean(test_precisions))

0.2 0.9131487999999999
0.3 0.9136234
0.4 0.9138947999999999
0.5 0.9142846
0.6 0.9145798
0.7 0.9146618
0.8 0.9143218000000001
0.9 0.9138550000000001


In [20]:
# % zmiennych uzytych na poziom
for k in range(2,11):
    test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=k/10)
    print(k/10,np.mean(test_precisions))

0.2 0.9136977999999999
0.3 0.913669
0.4 0.9140689999999999
0.5 0.9137080000000001
0.6 0.9139685999999999
0.7 0.9138112000000002
0.8 0.9139886
0.9 0.9140938000000001
1.0 0.9146618


In [21]:
#regularyzacja l2
for k in range(8):
    test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=1,lambdaX=k**2)
    print(k**2,np.mean(test_precisions))

0 0.9146618
1 0.9144894000000001
4 0.9148403999999999
9 0.91502
16 0.9152726000000001
25 0.9150320000000001
36 0.9152144
49 0.9152488


In [22]:
#gamma 
for k in [0,0.1,0.25,0.5,1,2,5,6,8,10]:
    test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=1,lambdaX=16,gamma=k)
    print(k,np.mean(test_precisions))

0 0.9152726000000001
0.1 0.9150186
0.25 0.9151434
0.5 0.9150893999999999
1 0.9151150000000001
2 0.9151016000000001
5 0.9150978000000001
6 0.914963
8 0.9147164
10 0.9139538


In [23]:
#regularyzacja l1
for k in range(8):
    test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=1,lambdaX=16,gamma=0,alpha=k**2)
    print(k**2,np.mean(test_precisions))

0 0.9152726000000001
1 0.9150864000000001
4 0.9152536000000001
9 0.9149711999999999
16 0.9150508
25 0.9147677999999999
36 0.9146928000000001
49 0.9143346000000001


In [24]:
#learning rate
for k in range(1,11):
     test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=1,lambdaX=16,gamma=0,alpha=0,eta=k/100)
     print(k/100,np.mean(test_precisions))

0.01 0.9149874
0.02 0.9148158000000001
0.03 0.9152726000000001
0.04 0.9149588
0.05 0.9145544000000001
0.06 0.9148099999999999
0.07 0.914557
0.08 0.9151170000000001
0.09 0.9147994
0.1 0.9147209999999999


In [25]:
test_precisions, probs, indices, imp=CV_XGB(features=features,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=1,lambdaX=16,gamma=0,alpha=0,eta=0.03)      
precis=np.mean(test_precisions)
print(precis)

0.9152726000000001


In [26]:
#najwazniejsze zmienne w XGB

imp

[('LIMIT_BAL', 1404),
 ('AGE', 1285),
 ('BILL_AMT1', 1185),
 ('PAY_AMT3', 938),
 ('PAY_AMT6', 930),
 ('PAY_AMT2', 922),
 ('PAY_AMT4', 848),
 ('PAY_AMT5', 839),
 ('PAY_AMT1', 837),
 ('BILL_AMT2', 823),
 ('BILL_AMT6', 732),
 ('BILL_AMT4', 727),
 ('BILL_AMT3', 696),
 ('BILL_AMT5', 677),
 ('EDUCATION', 459),
 ('PAY_0', 413),
 ('PAY_6', 306),
 ('PAY_2', 294),
 ('PAY_3', 285),
 ('PAY_4', 276),
 ('PAY_5', 273),
 ('MARRIAGE', 243),
 ('SEX', 163)]

In [27]:
#najczesciej uzywany LIMIT_BAL, sprobujmy dodac kwadrat tej zmiennej

df['LIMIT_BAL_SQ']=df['LIMIT_BAL']**2
features2=features.copy()
features2.append('LIMIT_BAL_SQ')
test_precisions, probs, indices, imp=CV_XGB(features=features2,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=1,lambdaX=16,gamma=0,alpha=0,eta=0.03)    
precis=np.mean(test_precisions)
print(precis)

0.9148788


In [45]:
#brak poprawy
#sprobujmy zmienic zmienne kategoryczne za pomoca one hot encodingu

df2=df.copy()
df2.loc[:,'SEX':'MARRIAGE']=df2.loc[:,'SEX':'MARRIAGE'].astype('object')
df2=pd.get_dummies(df2,drop_first=True)

features2=features.copy()
features2.remove("MARRIAGE")
features2.remove("SEX")
features2.remove("EDUCATION")
features2.extend(df2.loc[:,"SEX_2":].columns.tolist())

In [48]:
test_precisions, probs, indices, imp=CV_XGB(features=features2,df=df2,max_depth=6,subsample=0.9,colsample_bytree=0.7,colsample_bylevel=1,lambdaX=16,gamma=0,alpha=0,eta=0.03)    
precis=np.mean(test_precisions)
print(precis)
#pogarsza model

0.9147516


In [50]:
#sprobujmy przeprowadzic jeszcze raz tuning hiperparametrow za pomoca random search

results=[]
for k in range(50):
    parameters={'max_depth':random.randint(6,13),
               'subsample':random.uniform(0.55,0.95),
               'colsample_bytree':random.uniform(0.6,1),
               'colsample_bylevel':random.uniform(0.6,1),
               'lambdaX':random.randint(0,20),
               'alpha':random.uniform(0,2),
               'gamma':random.choice([0,0.01,0.05,0.1,0.25,0.5,1,2,3,4,5]),
               'eta':random.choice([0.025,0.03,0.04,0.045])
               }
    test_precisions, probs, indices, imp=CV_XGB(features=features,**parameters)
    results.append((parameters.copy(),np.mean(test_precisions)))
    print(np.mean(test_precisions))

0.9144018
0.9145364
0.915357
0.9144614000000001
0.9144446
0.9148285999999999
0.9147026
0.9152947999999999
0.9153504
0.9151294
0.9143077999999999
0.9146578
0.9148158000000001
0.9150383999999999
0.9146822
0.9153122
0.9128134000000001
0.9153694
0.914671
0.9151703999999998
0.9150561999999999
0.9151046
0.9151507999999999
0.9149263999999999
0.9145812
0.9155704
0.9140273999999999
0.9146951999999999
0.91475
0.9147829999999999
0.9151384
0.9152286000000001
0.9145196
0.9151216
0.9151583999999999
0.9151424
0.9146930000000001
0.9150896
0.9152570000000001
0.9149818
0.9148196000000001
0.914817
0.9149456
0.9153912
0.9146934
0.9151199999999999
0.9147928000000001
0.9154756
0.9151196
0.9148978


In [51]:
sorted(results,key=lambda x: x[1],reverse=True)

[({'max_depth': 8,
   'subsample': 0.5650130469286977,
   'colsample_bytree': 0.6866458637703693,
   'colsample_bylevel': 0.960741207272874,
   'lambdaX': 13,
   'alpha': 1.6900639889512352,
   'gamma': 0,
   'eta': 0.025},
  0.9155704),
 ({'max_depth': 10,
   'subsample': 0.6360378263035554,
   'colsample_bytree': 0.9375753856096332,
   'colsample_bylevel': 0.9040339452910737,
   'lambdaX': 17,
   'alpha': 0.9478336737205075,
   'gamma': 3,
   'eta': 0.025},
  0.9154756),
 ({'max_depth': 12,
   'subsample': 0.7725499902273454,
   'colsample_bytree': 0.9133202744893203,
   'colsample_bylevel': 0.7105881025820332,
   'lambdaX': 7,
   'alpha': 0.6577177879507912,
   'gamma': 3,
   'eta': 0.025},
  0.9153912),
 ({'max_depth': 12,
   'subsample': 0.9245364839395265,
   'colsample_bytree': 0.6213496795943316,
   'colsample_bylevel': 0.7837434587967906,
   'lambdaX': 18,
   'alpha': 1.8730958994512084,
   'gamma': 5,
   'eta': 0.03},
  0.9153694),
 ({'max_depth': 7,
   'subsample': 0.7156854

In [53]:
#poprawa wyniku

test_precisions, probs, indices, imp=CV_XGB(features=features,lambdaX=13,gamma=0,alpha=1.69,max_depth=8,subsample=0.565,colsample_bytree=0.69,colsample_bylevel=0.96,eta=0.025)
precis=np.mean(test_precisions)
print(precis)

0.9157803999999998


In [55]:
probs=create_probs_series(probs,indices)

In [56]:
modelXGB = {
    "nazwa":"XGB",
    "opis":"Model XGB, max_depth=8, subsample=0.565, colsample_bytree=0.69, colsample_bylevel=0.96, lambda=13, gamma=0, alpha=1.69",
    "specyfikacja":{'eta':0.025,'max_depth':8, 'subsample':0.565, 'colsample_bytree':0.69,
                    'colsample_bylevel':0.96,'lambdaX':13,'gamma':0,'alpha':1.69,'features':features,'df':df},     
    "precyzja":precis,
    "predykcje":probs.copy(),
    }

In [57]:
with open('ML_XGB.p','wb') as fp:
    pickle.dump(modelXGB,fp)