In [34]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from functions import create_probs_series, grupowanie, log_func

In [35]:
# Next algorithm: Random Forest.
# For RF, categorical variables are not converted to dummy (0/1) variables.
# For this DataFrame a label encoder is not needed either, because the categorical variables are already numeric.
# The variables do not need to be standardized.

# Load the data set; the "ID" column becomes the index.
df=pd.read_excel("credit_card_data.xlsx",index_col="ID")

# Label-based slice: every column from the first one up to and including "PAY_AMT6".
features=df.loc[:,:"PAY_AMT6"].columns.tolist()

In [36]:
def CV_RF(n_splits=5, random_state=111, df=df, features=features, *args, **kwargs):
    """Score a RandomForestClassifier with shuffled, stratified K-fold cross-validation.

    Parameters
    ----------
    n_splits : int, default 5
        Number of folds passed to ``StratifiedKFold``.
    random_state : int, default 111
        Seed shared by the fold shuffling and by every forest.
    df : pandas.DataFrame
        Frame holding the predictor columns and the target column ``y``.
    features : list of str
        Names of the predictor columns.
    *args, **kwargs
        Forwarded to ``RandomForestClassifier`` (e.g. ``n_estimators``,
        ``max_depth``). ``n_jobs`` defaults to ``-1`` but may be overridden.

    Returns
    -------
    test_precisions : list of float
        ``average_precision_score`` on the held-out part of each fold.
    probs : list of list of float
        Per fold, the second column of ``predict_proba`` for the held-out rows.
    indices : list of list of int
        Per fold, the positional (``iloc``) row numbers of the held-out rows.
    model : RandomForestClassifier
        The forest fitted in the LAST fold only (trained on that fold's
        training part, not on the full data).

    Notes
    -----
    The ``df`` and ``features`` defaults are evaluated once, when this ``def``
    is executed. Rebinding the notebook-level names afterwards does not change
    them, so pass both explicitly whenever they differ from those values.
    """
    # Use every core unless the caller asks otherwise. The previous hard-coded
    # n_jobs=-1 made CV_RF(n_jobs=...) fail with a duplicate-keyword TypeError.
    rf_kwargs = {"n_jobs": -1, **kwargs}

    # Select predictors/target once instead of re-slicing df inside every fold.
    X, y = df[features], df["y"]

    test_precisions, probs, indices = [], [], []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train, test in skf.split(X, y):
        model = RandomForestClassifier(*args, random_state=random_state, **rf_kwargs)
        model.fit(X.iloc[train], y.iloc[train])
        # Column 1 of predict_proba is the probability of model.classes_[1].
        preds = model.predict_proba(X.iloc[test])[:, 1].tolist()
        probs.append(preds)
        indices.append(test.tolist())
        test_precisions.append(average_precision_score(y.iloc[test], preds))
    return test_precisions, probs, indices, model

In [37]:
# Baseline run: CV_RF with all of its default arguments; print the mean of the per-fold scores.
test_precisions, probs, indices,model=CV_RF()
print(np.mean(test_precisions))

0.9004257656947281


In [38]:
# As with the other algorithms, check whether adding the squared age improves the model.

# Candidate feature list: the current features plus "AGE_SQ" (built as a new list).
features2 = [*features, "AGE_SQ"]

test_precisions, probs, indices, model = CV_RF(features=features2)
print(np.mean(test_precisions))

0.9021083047788444


In [177]:
# The squared age helped; now check whether the squared PAY variables help as well.
cols = df.loc[:, "PAY_0_SQ":"PAY_6_SQ"].columns.tolist()

# Promote the previous candidate list to the accepted one. Filtering out `cols`
# and rebuilding `features2` (instead of extending it in place) keeps this cell
# idempotent: re-running it no longer leaks the squared PAY columns into
# `features`, nor appends them to `features2` a second time.
features = [name for name in features2 if name not in cols]
features2 = features + cols

test_precisions, probs, indices, model = CV_RF(features=features2)
print(np.mean(test_precisions))
# No improvement.

0.9014185239198342


In [178]:
# Hyperparameter tuning for RF.
# To speed up the computations, 50 estimators are used while tuning.
# Search for the best maximum depth.
for depth in range(2, 22, 2):
    test_precisions, probs, indices, model = CV_RF(
        features=features,
        n_estimators=50,
        max_depth=depth,
    )
    print(depth, np.mean(test_precisions))

2 0.9002889502419185
4 0.9056945660488092
6 0.9080375529620106
8 0.9108686657172637
10 0.9111354039426439
12 0.9103582852274542
14 0.909084336832821
16 0.9083794364732736
18 0.9077888807543953
20 0.9047171478097595


In [180]:
# Search for the best maximum number of variables used when training (max_features).
for n_feats in range(3, 11):
    test_precisions, probs, indices, model = CV_RF(
        features=features,
        n_estimators=50,
        max_depth=10,
        max_features=n_feats,
    )
    print(n_feats, np.mean(test_precisions))

3 0.9105286626601237
4 0.9111354039426439
5 0.9112926890453498
6 0.9112164499675135
7 0.9111559310725379
8 0.9115306366321014
9 0.911043669582677
10 0.911364947223122


In [181]:
# Minimum number of observations before a split (min_samples_split).
for min_split in [2, 4, 6, 8, 10, 15, 20, 30]:
    test_precisions, probs, indices, model = CV_RF(
        features=features,
        n_estimators=50,
        max_depth=10,
        max_features=8,
        min_samples_split=min_split,
    )
    print(min_split, np.mean(test_precisions))

2 0.9115306366321014
4 0.9114647858040336
6 0.9109788889485785
8 0.9118602395704292
10 0.9122141251307154
15 0.9117990969406821
20 0.9114687022624107
30 0.9115695769106296


In [182]:
# Minimum number of observations after a split (min_samples_leaf).
for min_leaf in [1, 2, 3, 4, 5]:
    test_precisions, probs, indices, model = CV_RF(
        features=features,
        n_estimators=50,
        max_depth=10,
        max_features=8,
        min_samples_split=10,
        min_samples_leaf=min_leaf,
    )
    print(min_leaf, np.mean(test_precisions))

1 0.9122141251307154
2 0.9117495582135373
3 0.9115339271165899
4 0.9118686399903224
5 0.9118449351545952


In [183]:
# Number of trees.
for n_trees in [50, 75, 100, 125, 150, 175, 200]:
    test_precisions, probs, indices, model = CV_RF(
        features=features,
        n_estimators=n_trees,
        max_depth=10,
        max_features=8,
        min_samples_split=10,
        min_samples_leaf=1,
    )
    print(n_trees, np.mean(test_precisions))

50 0.9122141251307154
75 0.9120986166483165
100 0.9124649293865043
125 0.9124638278364413
150 0.9123863416617022
175 0.9123637933871571
200 0.9123766984454171


In [39]:
# Evaluate the forest with the hyperparameter values picked in the tuning cells
# and keep the mean per-fold score in `precis` as the reference precision.
test_precisions, probs, indices,model=CV_RF(features=features,n_estimators=100,max_depth=10,max_features=8,min_samples_split=10,min_samples_leaf=1)   
precis=np.mean(test_precisions)
print(precis)

0.9118460092705245


In [6]:
# Inspect how much each variable contributes to the predictions.
# NOTE: `model` is whichever forest was assigned most recently, and `features`
# must be the list it was trained with, because zip pairs the two by position.
importances = sorted(
    zip(features, model.feature_importances_.tolist()),
    key=lambda pair: pair[1],
    reverse=True,
)
importances

[('PAY_0', 0.3178162740526303),
 ('PAY_2', 0.11038004686834076),
 ('PAY_4', 0.05531156055329554),
 ('PAY_3', 0.04868232813081056),
 ('BILL_AMT1', 0.03783733253651171),
 ('LIMIT_BAL', 0.037613165881591394),
 ('PAY_5', 0.03418508189052191),
 ('PAY_AMT3', 0.03154156314691243),
 ('PAY_AMT1', 0.031000385430015587),
 ('PAY_AMT2', 0.03011936664500869),
 ('AGE', 0.028252021633135352),
 ('BILL_AMT2', 0.027509660204440196),
 ('BILL_AMT6', 0.026028036014340623),
 ('PAY_AMT5', 0.025638488101772976),
 ('PAY_AMT6', 0.025632984485359625),
 ('BILL_AMT3', 0.02546748032694123),
 ('PAY_AMT4', 0.024995641342186844),
 ('BILL_AMT4', 0.023775572489922178),
 ('BILL_AMT5', 0.023561455314857257),
 ('PAY_6', 0.01850948740124314),
 ('EDUCATION', 0.00835738863392508),
 ('MARRIAGE', 0.004636061130689911),
 ('SEX', 0.0031486177855465244)]

In [40]:
# Best arguments from the grid search.

parametersRF = {
    'n_estimators': 100,
    'max_depth': 10,
    'max_features': 8,
    'min_samples_split': 10,
    'min_samples_leaf': 1,
}

In [None]:
# Display the selected hyperparameter dict.
parametersRF

In [42]:
# Grouping of low-importance variables with similar coefficients.
# NOTE(review): `grupowanie` is imported from the local `functions` module, so its
# behavior is not visible here. Presumably it combines `columns` into `new_column`
# using `function`, re-scores with `valid` and compares against `precision`; it may
# also modify `df` and/or `features` in place — confirm against functions.py.

grupowanie(df=df,columns=['BILL_AMT4','BILL_AMT5'],new_column='BILL_AMT45',
          valid=CV_RF,function='mean',features=features,precision=precis,**parametersRF)

0.9123555666092585
Poprawa modelu
0.9123555666092585


In [43]:
# Recompute the reference score with the current `df`/`features` before the next grouping attempt.
test_precisions, probs, indices,model=CV_RF(df=df,features=features,**parametersRF)   
precis=np.mean(test_precisions)

# Grouping attempt for BILL_AMT3 and BILL_AMT6 (new column name 'BILL_AMT36').
# NOTE(review): `grupowanie` lives in the local `functions` module; presumably it may
# modify `df`/`features` in place — confirm.
grupowanie(df=df,columns=['BILL_AMT3','BILL_AMT6'],new_column='BILL_AMT36',
          valid=CV_RF,function='mean',features=features,precision=precis,**parametersRF)

0.9123636864552147
Poprawa modelu
0.9123636864552147


In [44]:
# Recompute the reference score with the current `df`/`features` before the next grouping attempt.
test_precisions, probs, indices,model=CV_RF(df=df,features=features,**parametersRF)   
precis=np.mean(test_precisions)

# Grouping attempt for PAY_AMT5 and PAY_AMT6 (new column name 'PAY_AMT56').
# NOTE(review): `grupowanie` lives in the local `functions` module; presumably it may
# modify `df`/`features` in place — confirm.
grupowanie(df=df,columns=['PAY_AMT5','PAY_AMT6'],new_column='PAY_AMT56',
          valid=CV_RF,function='mean',features=features,precision=precis,**parametersRF)

0.9117639985679835
Brak poprawy modelu
0.9123636864552147


In [50]:
# Further grouping does not improve the model.
# NOTE(review): `log_func` is imported from the local `functions` module; presumably it
# log-transforms the named column and re-scores with `valid` — confirm, including
# whether it modifies `df`/`features` in place.
log_func(df,'PAY_AMT2',valid=CV_RF,features=features,precision=precis,**parametersRF)
# Likewise, taking the logarithm of the skewed variable does not help.

0.912176861369878
Brak poprawy modelu


In [51]:
# Row-wise maximum over the columns from 'PAY_0' to 'PAY_6' (label slice, inclusive).
cols = df.loc[:, 'PAY_0':'PAY_6'].columns.tolist()
df['PAY_max'] = df[cols].max(axis=1)

# Candidate feature list: the current features plus the new column.
features2 = [*features, "PAY_max"]

test_precisions, probs, indices, model = CV_RF(features=features2, df=df, **parametersRF)
print(np.mean(test_precisions))
# Improvement after adding the maximum payment delay.

0.9130307582805971


In [53]:
# Promote the candidate feature list to the working one (as a separate list) and re-score it.
features = list(features2)

test_precisions, probs, indices, model = CV_RF(features=features, df=df, **parametersRF)
print(np.mean(test_precisions))

0.9130307582805971


In [54]:
# It is worth looking for optimal hyperparameters with a random search.

In [199]:
# Random search over the forest hyperparameters (50 sampled configurations).
# A dedicated, seeded generator makes the sampled configurations, and therefore
# the printed scores, reproducible; the unseeded global np.random state
# produced a different search on every run.
rng = np.random.RandomState(111)

results = []
for trial in range(50):
    parameters = {
        # choice() yields a NumPy integer; store a plain int so the dict displays cleanly.
        'n_estimators': int(rng.choice(range(75, 180, 5))),
        'max_depth': rng.randint(8, 14),
        'max_features': rng.randint(4, 11),
        'min_samples_split': rng.randint(2, 11),
    }
    # randint's upper bound is exclusive, so min_samples_leaf is drawn from
    # 1 .. min_samples_split // 2 (inclusive).
    parameters['min_samples_leaf'] = rng.randint(1, parameters['min_samples_split'] // 2 + 1)

    test_precisions, probs, indices, model = CV_RF(**parameters, features=features, df=df)
    mean_precision = np.mean(test_precisions)
    results.append((parameters.copy(), mean_precision))
    print(mean_precision)

0.9120514650222423
0.9117113681468385
0.9117545467936872
0.9124741752725225
0.9113792734452222
0.9120584430871039
0.9117741083024444
0.9125824029552645
0.9122354517698531
0.9119888123173492
0.9117052742230991
0.9117887671559011
0.911490083351846
0.9106047090401053
0.9117476349508025
0.9119606810453696
0.9122281568075581
0.9120622321530115
0.9123505345152021
0.9121206384824049
0.9112923391179095
0.9114860249365127
0.9108517229134308
0.9118713592707139
0.9127185437248307
0.9119599509481999
0.9115310027314478
0.9119231461285716
0.9123799322371102
0.9119189680459637
0.9114674979031495
0.9122453236946395
0.9118204071695132
0.911518025551335
0.9121082177739126
0.912257144421712
0.9121597164528605
0.9117985889635116
0.9119984886985119
0.9115974155330064
0.9114965588353618
0.9110678676480498
0.9116778271112513
0.911391585444638
0.912072078407275
0.9122587133140106
0.911480484340397
0.9113930380120052
0.9113907442111691
0.9114148153923909


In [200]:
# Highest-scoring (parameters, score) pair from the random search; the earliest one wins on ties.
max(results, key=lambda entry: entry[1])
# No improvement.

({'n_estimators': 100,
  'max_depth': 11,
  'max_features': 10,
  'min_samples_split': 9,
  'min_samples_leaf': 2},
 0.9127185437248307)

In [57]:
# Final evaluation with `parametersRF` on the current `features`; `precis`, `probs`
# and `indices` from this run are reused by the cells below.
test_precisions, probs, indices,model=CV_RF(features=features,df=df,**parametersRF)   
precis=np.mean(test_precisions)
print(precis)

0.9130307582805971


In [58]:
# Rebinds `probs` from CV_RF's per-fold lists to the result of `create_probs_series`
# (local `functions` module — presumably one Series ordered via `indices`; confirm).
# NOTE: the input name is overwritten, so re-running this cell on its own would feed
# its previous output back in as `probs`; re-run the CV_RF cell first.
probs=create_probs_series(probs,indices)

In [59]:
# Display the hyperparameters used for the final model.
parametersRF

{'n_estimators': 100,
 'max_depth': 10,
 'max_features': 8,
 'min_samples_split': 10,
 'min_samples_leaf': 1}

In [60]:
# Bundle describing the final RF model: name, description, specification, score and predictions.
modelRF = {
    "nazwa":"RF",
    # NOTE: this description is free text; keep it in sync with parametersRF by hand.
    "opis":"Model RF, max_depth=10, max_features=8, min_samples_split=10, min_samples_leaf=1",
    # Hyperparameters are unpacked from parametersRF rather than repeated as literals,
    # so the stored specification cannot drift from the dict that holds the chosen
    # values. Key order is unchanged: the five hyperparameters, then 'features' and 'df'.
    # NOTE: the whole DataFrame is stored here, so the pickled file contains the full data.
    "specyfikacja":{**parametersRF,'features':features,'df':df},
    "precyzja":precis,
    "predykcje":probs.copy(),
    }

In [61]:
# Persist the model bundle to 'ML_RF.p' in binary pickle format.
with open('ML_RF.p', 'wb') as out_file:
    pickle.dump(modelRF, out_file)