In [128]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import neighbors

In [129]:
# Load the credit-card data set, using the ID column as the row index.
df = pd.read_excel("dumm_credit_card_data.xlsx", index_col="ID")

# Predictor names: every column up to and including PAY_AMT6, followed by every
# column from SEX_2 onward. Whatever lies between those two columns is left out
# (presumably the target `y`, which the CV code reads separately -- confirm).
features = (
    df.loc[:, :"PAY_AMT6"].columns.tolist()
    + df.loc[:, "SEX_2":].columns.tolist()
)

In [130]:
def CV_kNN(n_jobs=-1,df=df,n_splits=5,random_state=111,features=features,*args,**kwargs):
    """Cross-validate a k-nearest-neighbours classifier with stratified folds.

    For each fold a fresh ``KNeighborsClassifier`` is fitted on the training
    rows and evaluated on the held-out rows with average precision, computed
    from the second column of ``predict_proba`` (presumably the positive
    class -- confirm against the encoding of ``y``).

    Parameters
    ----------
    n_jobs : forwarded to ``KNeighborsClassifier``.
    df : DataFrame containing the predictor columns and the target column ``y``.
        The default is the notebook-level frame object bound when this ``def``
        is executed; later in-place edits to that object are visible here,
        rebinding the name ``df`` is not.
    n_splits, random_state : forwarded to ``StratifiedKFold`` (shuffling is
        always enabled).
    features : list of predictor column names to use.
    *args, **kwargs : forwarded to ``KNeighborsClassifier``
        (e.g. ``n_neighbors``, ``p``).

    Returns
    -------
    (test_precisions, probs, indices)
        Three lists with one entry per fold: the average-precision score, the
        list of predicted probabilities, and the list of positional row
        indices of the test rows.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    test_precisions, probs, indices = [], [], []

    for train_idx, test_idx in splitter.split(df[features], df.y):
        fold_train = df.iloc[train_idx]
        fold_test = df.iloc[test_idx]

        knn = neighbors.KNeighborsClassifier(*args, n_jobs=n_jobs, **kwargs)
        knn.fit(fold_train[features], fold_train.y)

        # Keep only the second probability column of the prediction matrix.
        fold_probs = knn.predict_proba(fold_test[features])[:, 1].tolist()

        probs.append(fold_probs)
        indices.append(test_idx.tolist())
        test_precisions.append(average_precision_score(fold_test.y, fold_probs))

    return test_precisions, probs, indices

In [131]:
# check whether min-max scaling to [0,1] or z-score standardisation gives the better result

In [132]:
def stand(features,df,valid,function=lambda x: (x-np.min(x))/(np.max(x)-np.min(x))):
    """Rescale selected columns on a copy of ``df`` and print the mean CV score.

    Parameters
    ----------
    features : list of column names to transform.
    df : source DataFrame; it is copied, so the caller's frame is not modified.
    valid : callable invoked as ``valid(df=<transformed copy>)`` that must
        return a 3-tuple whose first element is a sequence of scores.
    function : column-wise transform passed to ``DataFrame.apply``; the
        default is min-max scaling.

    Returns
    -------
    None -- the mean of the scores is printed, not returned.
    """
    scaled = df.copy()
    scaled[features] = df[features].apply(function)
    fold_scores, _fold_probs, _fold_indices = valid(df=scaled)
    print(np.mean(fold_scores))

In [133]:
# Columns to rescale: LIMIT_BAL, AGE and every column from BILL_AMT1 through PAY_AMT6.
features_st = ['LIMIT_BAL', "AGE"] + df.loc[:, "BILL_AMT1":"PAY_AMT6"].columns.tolist()

In [134]:
# Cross-validated kNN score with features_st rescaled by stand's default transform (min-max to [0, 1]).
stand(features=features_st,df=df,valid=CV_kNN)

0.8605121095035322


In [135]:
# Same experiment, but with a z-score transform (subtract the mean, divide by the standard deviation).
stand(features=features_st,df=df,valid=CV_kNN,function=lambda x: (x-np.mean(x))/np.std(x))

0.8598604834449978


In [136]:
# Min-max scaling to [0, 1] scored slightly higher than z-score (0.8605 vs 0.8599),
# so apply it to the working frame in place for all following experiments.
df[features_st] = df[features_st].apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

In [137]:
# Hyperparameter tuning
# Search for the best number of neighbours: print the mean cross-validated average precision for each k.
for k in [1, 3, 5, 10, 15, 30, 50, 100, 150, 200]:
    test_precisions, probs, indices=CV_kNN(n_neighbors=k)
    print(k,np.mean(test_precisions))

1 0.8189979646277475
3 0.8483493781544797
5 0.8605121095035322
10 0.8747291102072972
15 0.8817889587522704
30 0.8900102255005363
50 0.8942442855303268
100 0.8982430449403436
150 0.8994798063377429
200 0.8994936841330954


In [138]:
# The score was still rising at k=200, so try larger neighbourhood sizes.
for k in [250,300,400,500]:
    test_precisions, probs, indices=CV_kNN(n_neighbors=k)
    print(k,np.mean(test_precisions))

250 0.899770945036131
300 0.899146413153941
400 0.8987070830945989
500 0.897387186474018


In [139]:
# Best score so far is at 250 neighbours.
# Check whether Manhattan (city-block) distance, p=1, does any better.
test_precisions, probs, indices=CV_kNN(p=1,n_neighbors=250)
np.mean(test_precisions)
# Worse than the default metric at k=250 (0.8977 vs 0.8998).

0.897663827609746

In [141]:
# Try adding the squared-age variable as an extra predictor.

features2 = features + ["AGE_SQ"]

In [142]:
# Min-max scale the AGE_SQ column to [0, 1], the same transform applied to the other continuous predictors.
# NOTE(review): AGE_SQ is not created in any visible cell; presumably it already exists in the
# loaded spreadsheet -- confirm, otherwise this line raises KeyError on a fresh kernel.
df['AGE_SQ']=df[['AGE_SQ']].apply(lambda x: (x-np.min(x))/(np.max(x)-np.min(x)))

In [143]:
# Cross-validate the 250-NN model on the extended feature list (original features + AGE_SQ).
test_precisions, probs, indices=CV_kNN(n_neighbors=250,features=features2)
np.mean(test_precisions)

0.8991901942398423

In [147]:
# No improvement from adding AGE_SQ.

# Check which variable predicts best on its own: cross-validate a 250-NN model on each
# feature separately and collect (feature name, mean score) pairs in `results`.
results=[]
for feature in features:
    test_precisions, probs, indices=CV_kNN(n_neighbors=250,features=[feature])
    print(feature,np.mean(test_precisions))
    results.append((feature,np.mean(test_precisions)))

LIMIT_BAL 0.8336091141619173
AGE 0.7887607141142219
PAY_0 0.8617613688330579
PAY_2 0.836881975125656
PAY_3 0.8277373765707796
PAY_4 0.821034659334211
PAY_5 0.8197006534637812
PAY_6 0.813688879832305
BILL_AMT1 0.7958301911412503
BILL_AMT2 0.790620059023959
BILL_AMT3 0.793279198935448
BILL_AMT4 0.797234653949632
BILL_AMT5 0.8012856335585656
BILL_AMT6 0.7975279085980407
PAY_AMT1 0.8367309886564769
PAY_AMT2 0.8336876495744912
PAY_AMT3 0.8293853816272378
PAY_AMT4 0.8219662609640374
PAY_AMT5 0.8216065688160767
PAY_AMT6 0.8213784914087736
SEX_2 0.7856940626576984
EDUCATION_1 0.7893748584041311
EDUCATION_2 0.7865111457559786
EDUCATION_3 0.781704482850144
EDUCATION_4 0.7796348153876094
EDUCATION_5 0.7805720723581492
EDUCATION_6 0.778745067974909
MARRIAGE_1 0.7850437126840977
MARRIAGE_2 0.7852393423091553
MARRIAGE_3 0.7787730741669037


In [148]:
# Sort in place, ascending by mean score: results[0] becomes the weakest single feature
# and results[-1] the strongest. Later cells rely on this ordering.
results.sort(key=lambda x: x[1])
results

[('EDUCATION_6', 0.778745067974909),
 ('MARRIAGE_3', 0.7787730741669037),
 ('EDUCATION_4', 0.7796348153876094),
 ('EDUCATION_5', 0.7805720723581492),
 ('EDUCATION_3', 0.781704482850144),
 ('MARRIAGE_1', 0.7850437126840977),
 ('MARRIAGE_2', 0.7852393423091553),
 ('SEX_2', 0.7856940626576984),
 ('EDUCATION_2', 0.7865111457559786),
 ('AGE', 0.7887607141142219),
 ('EDUCATION_1', 0.7893748584041311),
 ('BILL_AMT2', 0.790620059023959),
 ('BILL_AMT3', 0.793279198935448),
 ('BILL_AMT1', 0.7958301911412503),
 ('BILL_AMT4', 0.797234653949632),
 ('BILL_AMT6', 0.7975279085980407),
 ('BILL_AMT5', 0.8012856335585656),
 ('PAY_6', 0.813688879832305),
 ('PAY_5', 0.8197006534637812),
 ('PAY_4', 0.821034659334211),
 ('PAY_AMT6', 0.8213784914087736),
 ('PAY_AMT5', 0.8216065688160767),
 ('PAY_AMT4', 0.8219662609640374),
 ('PAY_3', 0.8277373765707796),
 ('PAY_AMT3', 0.8293853816272378),
 ('LIMIT_BAL', 0.8336091141619173),
 ('PAY_AMT2', 0.8336876495744912),
 ('PAY_AMT1', 0.8367309886564769),
 ('PAY_2', 0.836

In [149]:
# Try multiplying the worst-predicting variable by a constant smaller than 1.
def feature_eng_kNN(consts=[0,0.1,0.25,0.5,0.75],feature_nr=0):
    """Re-weight one feature by each constant and print the resulting CV score.

    Reads the notebook-level ``df``, ``results`` and ``CV_kNN``. For every
    constant, the column named ``results[feature_nr][0]`` is multiplied by that
    constant on a copy of ``df`` (the global frame is left untouched), a
    250-neighbour model is cross-validated on the copy, and the constant is
    printed together with the mean of the fold scores.

    Parameters
    ----------
    consts : iterable of multipliers to try.
    feature_nr : index into ``results`` selecting the feature to re-weight
        (0 is the first entry, -1 the last).

    Returns
    -------
    None -- results are printed only.
    """
    for factor in consts:
        reweighted = df.copy()
        reweighted[results[feature_nr][0]] *= factor
        fold_scores, _fold_probs, _fold_indices = CV_kNN(df=reweighted, n_neighbors=250)
        print(factor, np.mean(fold_scores))

In [150]:
# Defaults: shrink results[0] (the weakest feature once results is sorted ascending) by 0, 0.1, 0.25, 0.5 and 0.75.
feature_eng_kNN()

0 0.8997501925667131
0.1 0.8997611128946966
0.25 0.8997600493685001
0.5 0.8997607607075748
0.75 0.8997606926046867


In [151]:
# No improvement from down-weighting the weakest feature.
# Now multiply the best variable (results[-1]) by constants greater than 1.

feature_eng_kNN(consts=[1.2,1.5,1.75,2,2.5,3,3.5,4,5,6,7],feature_nr=-1)

1.2 0.8999110234153083
1.5 0.9002100603675194
1.75 0.9002797310677924
2 0.9002331378781141
2.5 0.9000480368999847
3 0.8999872130351904
3.5 0.8998688476062064
4 0.8998360133587205
5 0.8996942066915026
6 0.8996459695735053
7 0.8996237339938326


In [153]:
# Slight improvement, peaking at 1.75, so keep that weight: scale the top single-feature
# predictor in place (results is sorted ascending, so results[-1] is the best scorer --
# PAY_0 according to the per-feature scores printed earlier).
# NOTE(review): in-place and not idempotent -- every re-run of this cell multiplies the column by 1.75 again.
df[results[-1][0]]*=1.75

In [154]:
# Best kNN model: 250 neighbours on the rescaled frame with the re-weighted top feature.
# The `probs` and `indices` produced here are the out-of-fold predictions used afterwards.
test_precisions, probs, indices=CV_kNN(df=df,features=features,n_neighbors=250)
precis=np.mean(test_precisions)
print(precis)

0.9002797310677924


In [None]:
# Build a pandas Series containing the out-of-fold predictions.
def create_probs_series(probs,indices):
    """Combine per-fold predictions into one Series ordered by row position.

    Parameters
    ----------
    probs : iterable of per-fold sequences of predicted probabilities.
    indices : iterable of per-fold sequences of positional row indices, aligned
        element-wise with ``probs``.

    Returns
    -------
    pandas.Series
        The predictions sorted by positional index, then relabelled 1..n where
        n is the total number of predictions (for the 30000-row data set this
        is the same 1..30000 labelling as before; presumably it mirrors the
        ID column -- confirm).

    Raises
    ------
    ValueError
        Raised by pandas when ``probs`` and ``indices`` hold different numbers
        of elements.
    """
    # Flatten in linear time; sum(list_of_lists, []) copies the accumulator on every step.
    flat_probs = [p for fold in probs for p in fold]
    flat_indices = [i for fold in indices for i in fold]

    series = pd.Series(flat_probs, index=flat_indices).sort_index()
    # Derive the 1-based labels from the data instead of hard-coding 30000 rows.
    series.index = range(1, len(series) + 1)
    return series

In [None]:
# Flatten the out-of-fold predictions from the most recent CV_kNN call into a single Series.
# NOTE(review): this rebinds `probs` from a list of per-fold lists to a Series, so the cell cannot
# be re-run without first re-running the CV cell that produces `probs` and `indices`.
probs=create_probs_series(probs,indices)