In [26]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import svm

In [27]:
#wczytujemy wcześniej przygotowany zbiór

df=pd.read_excel("dumm_credit_card_data.xlsx",index_col="ID")

#zmienne objaśniające
features=pd.concat([df.loc[:,:"PAY_AMT6"],df.loc[:,"SEX_2":]],axis=1).columns.tolist()

In [28]:
def CV_SVM(df=df,n_splits=5,random_state=111,features=features,sample_size=3000,probability=True,*args,**kwargs):
    test_precisions=[]
    probs=[] 
    indices=[]
    skf=StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=random_state)
    for train,test in skf.split(df[features],df.y):
        model=svm.SVC(probability=probability,*args,**kwargs,random_state=random_state)
        df2=df.iloc[train].sample(sample_size,random_state=random_state)
        model.fit(df2[features],df2.y)
        preds=model.predict_proba(df.iloc[test][features])[:,1].tolist()
        probs.append(preds)
        indices.append(test.tolist())
        test_precisions.append(average_precision_score(df.iloc[test].y,preds))
    return test_precisions, probs, indices

In [29]:
#sprawdźmy czy lepszy wynik uzyskamy używając standaryzacji na przedziale [0,1] czy z-score

In [30]:
def stand(features,df,valid,function=lambda x: (x-np.min(x))/(np.max(x)-np.min(x))):
    df2=df.copy()
    df2[features]=df2[features].apply(function)
    test_precisions, probs, indices=valid(df=df2)
    print(np.mean(test_precisions))

In [57]:
features_st=['LIMIT_BAL',"AGE"]
features_st.extend(df.loc[:,"BILL_AMT1":"PAY_AMT6"].columns.tolist())

In [62]:
stand(features=features_st,df=df,valid=CV_SVM)

0.8565581897480026


In [63]:
stand(features=features_st,df=df,valid=CV_SVM,function=lambda x: (x-np.mean(x))/np.std(x))

0.8663024196379311


In [64]:
#lepszy wynik dla z-score
df[features_st]=df[features_st].apply(lambda x: (x-np.mean(x))/np.std(x))

In [68]:
#tuning hiperparametrów
results=[]
for kernel in ['linear','poly','rbf']:
    for C in [0.01,0.1,0.25,0.5,1,2,5,10,25,50,100]:
        if kernel=='poly':
            for degree in [2,3,4,5]:
                test_precisions, probs, indices=CV_SVM(degree=degree,C=C,kernel=kernel)
                print(f'C={C} kernel={kernel} degree={degree} precision=',np.mean(test_precisions),sep='')
                results.append((C,kernel,degree,np.mean(test_precisions)))
        elif kernel=='rbf':
            for gamma in [0.1,1,10,'scale']:
                test_precisions, probs, indices=CV_SVM(gamma=gamma,C=C,kernel=kernel)
                print(f'C={C} kernel={kernel} gamma={gamma} precision=',np.mean(test_precisions),sep='')
                results.append((C,kernel,gamma,np.mean(test_precisions)))
        else:
            #pomijamy C>2 dla kernela liniowego z uwagi na długi czas uczenia algorytmu
            if C>2:
                continue
            test_precisions, probs, indices=CV_SVM(C=C,kernel=kernel)
            print(f'C={C} kernel={kernel} precision=',np.mean(test_precisions),sep='')
            results.append((C,kernel,np.mean(test_precisions)))
                

C=0.01 kernel=linear precision=0.8669406617783733
C=0.1 kernel=linear precision=0.8679415202633942
C=0.25 kernel=linear precision=0.867772396315664
C=0.5 kernel=linear precision=0.8679335479252963
C=1 kernel=linear precision=0.8682913964024752
C=2 kernel=linear precision=0.8667646425892329
C=0.01 kernel=poly degree=2 precision=0.8570940819853717
C=0.01 kernel=poly degree=3 precision=0.8563799261864453
C=0.01 kernel=poly degree=4 precision=0.8501220328839665
C=0.01 kernel=poly degree=5 precision=0.8421273494078282
C=0.1 kernel=poly degree=2 precision=0.8607142199555398
C=0.1 kernel=poly degree=3 precision=0.8566714203692898
C=0.1 kernel=poly degree=4 precision=0.8484547475630118
C=0.1 kernel=poly degree=5 precision=0.8441432819396504
C=0.25 kernel=poly degree=2 precision=0.8630510618116876
C=0.25 kernel=poly degree=3 precision=0.8570600911846047
C=0.25 kernel=poly degree=4 precision=0.8505354202278392
C=0.25 kernel=poly degree=5 precision=0.8443614121534786
C=0.5 kernel=poly degree=2 pr

In [70]:
#dla kernela liniowego i C większego od 5 podzieliłem próbki na mniejsze części z uwagi na bardzo długi czas uczenia algorytmu
for C in [5,10]:
    means=[]
    for k in range(3): 
        test_precisions, probs, indices=CV_SVM(C=C,kernel='linear',sample_size=1000)
        means.append(np.mean(test_precisions))          
    print(C,np.mean(means))
#widać, że wyniki dla tych C nie są lepsze niż w przypadku C<=2

5 0.8662554632943825
10 0.8660178906535153


In [71]:
results.sort(key=lambda x: x[-1],reverse=True)
results

[(10, 'rbf', 'scale', 0.8714850664708397),
 (5, 'rbf', 'scale', 0.8714827566457055),
 (100, 'poly', 2, 0.8701397488054894),
 (2, 'rbf', 'scale', 0.8698191548395382),
 (50, 'poly', 2, 0.8690922376421589),
 (1, 'linear', 0.8682913964024752),
 (0.1, 'rbf', 0.1, 0.8682499757334261),
 (0.01, 'rbf', 0.1, 0.8681537822538473),
 (25, 'rbf', 'scale', 0.8680635420880265),
 (0.1, 'linear', 0.8679415202633942),
 (0.5, 'linear', 0.8679335479252963),
 (1, 'rbf', 0.1, 0.8678908682859404),
 (25, 'poly', 2, 0.8678392023894906),
 (0.25, 'linear', 0.867772396315664),
 (0.25, 'rbf', 0.1, 0.8677105436935323),
 (0.5, 'rbf', 0.1, 0.867112124598983),
 (0.01, 'linear', 0.8669406617783733),
 (2, 'linear', 0.8667646425892329),
 (10, 'poly', 2, 0.8664447043452876),
 (1, 'rbf', 'scale', 0.8663024196379311),
 (2, 'rbf', 0.1, 0.8658029024600573),
 (1, 'poly', 2, 0.8655121151834304),
 (2, 'poly', 2, 0.8648537673918106),
 (0.5, 'poly', 2, 0.8645820729342676),
 (5, 'poly', 2, 0.8642979242634908),
 (0.5, 'rbf', 'scale', 

In [19]:
#najlepszy model to C=10, kernel=rbf i gamma=scale
#sprobujmy dodac kwadrat zmiennej wiek

features2=features.copy()
features2.append("AGE_SQ")

In [80]:
df['AGE_SQ']=df[['AGE_SQ']].apply(lambda x: (x-np.mean(x))/np.std(x))

In [81]:
test_precisions, probs, indices=CV_SVM(features=features2,C=10)
precis=np.mean(test_precisions)
print(precis)
#pograsza model

0.8711982471062214


In [88]:
#następnym krokiem będzie sprawdzenie, która zmienna prognozuje najlepiej
results=[]
for feature in features:
    test_precisions, probs, indices=CV_SVM(features=[feature],C=10)
    print(feature,np.mean(test_precisions))
    results.append((feature,np.mean(test_precisions)))

LIMIT_BAL 0.7818686434841873
AGE 0.779362330803987
PAY_0 0.8570841796140922
PAY_2 0.8297367559524845
PAY_3 0.8049868066444844
PAY_4 0.8162245140910592
PAY_5 0.8151182759925121
PAY_6 0.8038316729115691
BILL_AMT1 0.7724281595981941
BILL_AMT2 0.7729490208969994
BILL_AMT3 0.773131947428323
BILL_AMT4 0.7755084076958074
BILL_AMT5 0.7803020896375114
BILL_AMT6 0.7772175883842036
PAY_AMT1 0.7652197656654895
PAY_AMT2 0.8108092350327929
PAY_AMT3 0.7614783975772829
PAY_AMT4 0.7829707104834549
PAY_AMT5 0.7702591445407172
PAY_AMT6 0.7944881612592051
SEX_2 0.78073190264039
EDUCATION_1 0.7748327009832986
EDUCATION_2 0.7730198809288671
EDUCATION_3 0.7764638273310605
EDUCATION_4 0.7788000000000002
EDUCATION_5 0.7788000000000002
EDUCATION_6 0.7788000000000002
MARRIAGE_1 0.7775619206910973
MARRIAGE_2 0.7782491414271528
MARRIAGE_3 0.7788000000000002


In [89]:
results.sort(key=lambda x: x[-1])
results

[('PAY_AMT3', 0.7614783975772829),
 ('PAY_AMT1', 0.7652197656654895),
 ('PAY_AMT5', 0.7702591445407172),
 ('BILL_AMT1', 0.7724281595981941),
 ('BILL_AMT2', 0.7729490208969994),
 ('EDUCATION_2', 0.7730198809288671),
 ('BILL_AMT3', 0.773131947428323),
 ('EDUCATION_1', 0.7748327009832986),
 ('BILL_AMT4', 0.7755084076958074),
 ('EDUCATION_3', 0.7764638273310605),
 ('BILL_AMT6', 0.7772175883842036),
 ('MARRIAGE_1', 0.7775619206910973),
 ('MARRIAGE_2', 0.7782491414271528),
 ('EDUCATION_4', 0.7788000000000002),
 ('EDUCATION_5', 0.7788000000000002),
 ('EDUCATION_6', 0.7788000000000002),
 ('MARRIAGE_3', 0.7788000000000002),
 ('AGE', 0.779362330803987),
 ('BILL_AMT5', 0.7803020896375114),
 ('SEX_2', 0.78073190264039),
 ('LIMIT_BAL', 0.7818686434841873),
 ('PAY_AMT4', 0.7829707104834549),
 ('PAY_AMT6', 0.7944881612592051),
 ('PAY_6', 0.8038316729115691),
 ('PAY_3', 0.8049868066444844),
 ('PAY_AMT2', 0.8108092350327929),
 ('PAY_5', 0.8151182759925121),
 ('PAY_4', 0.8162245140910592),
 ('PAY_2', 0.

In [94]:
test_precisions, probs, indices=CV_SVM(C=10)
precis=np.mean(test_precisions)
print(precis)

0.8714850664708397


In [95]:
#sprawdzmy jaki wynik da wyrzucanie najgorzej przewidujacych zmiennych
for result in results:
    features2=features.copy()
    features2.remove(result[0])
    test_precisions, probs, indices=CV_SVM(features=features2,C=10)
    precis2=np.mean(test_precisions)
    if precis2>precis:
        print(result[0],precis2)
        precis=precis2
        features=features2.copy()
    else:
        print('Brak poprawy')
        break

PAY_AMT3 0.871900520466055
PAY_AMT1 0.8723320394712841
Brak poprawy


In [120]:
def grupowanie(df,columns,new_column,valid,function,features,precision,stand=False,*args,**kwargs):
    df2=df.copy()
    df2[new_column]=df[columns].apply(function,axis=1)
    df2.drop(columns,axis=1,inplace=True)
    features2=features.copy()
    for i in columns:
        features2.remove(i)
    features2.append(new_column)
    if stand:
        df2[new_column]=df2[[new_column]].apply(lambda x: (x-np.mean(x))/np.std(x))
    prec=np.mean(valid(df=df2,features=features2,*args,**kwargs)[0])
    print(prec)
    if prec>precision:
        print('Poprawa modelu')
        df[new_column]=df2[new_column]
        for i in columns:
            features.remove(i)
        features.append(new_column)
    else:
        print('Brak poprawy modelu')

In [121]:
#identyczny wynik dla zmiennych education_4-6
#sprobuje polaczyc te zmienne
cols=df.loc[:,'EDUCATION_4':'EDUCATION_6'].columns.tolist()


grupowanie(df,columns=cols,new_column='EDUC4_6',valid=CV_SVM,function=sum,features=features,
          precision=precis,C=10)

0.8722494911869759
Brak poprawy modelu


In [122]:
test_precisions, probs, indices=CV_SVM(C=10)
precis=np.mean(test_precisions)
print(precis)

0.8714850664708397


In [124]:
def create_probs_series(probs,indices):
    probs=sum(probs,[])
    indices=sum(indices,[])
    probs=pd.Series(probs,index=indices).sort_index()
    probs.index=range(1,30001)
    return probs

In [125]:
probs=create_probs_series(probs,indices)

In [126]:
modelSVM = {
    "nazwa":"SVM",
    "opis":"Model SVM, kernel rbf, C=10",
    "specyfikacja":{'kernel':"rbf", 'C':10, 'features':features.copy(),'df':df.copy()},
    "precyzja":precis,
    "predykcje":probs.copy(),
    }

In [127]:
#zapisujemy model

with open('ML_SVM.p','wb') as fp:
    pickle.dump(modelSVM,fp)