In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
ger=pd.read_table('./German/german.data', header=None,sep=' ')

In [3]:
colnames=['Status','Duration','History','Purpose','Amount','Savings','Employment','Installment%','Personal','Other','Residence','Property','Age','Plans','Housing','Existing','Job', 'People','Telephone','Foreign','Label']
ger.columns=colnames

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [5]:
#preprocessing
#normalize numerical, category:one hot
ger_pre=ger.copy(deep=True)
ger_pre=ger_pre.drop(columns=['Label'])
numer_ger=StandardScaler().fit_transform(ger_pre.select_dtypes(include='int64'))
numer_ger=pd.DataFrame(numer_ger,columns = ger_pre.select_dtypes(include='number').columns)
#print(numer_ger)

cate_ger=pd.get_dummies(ger_pre.select_dtypes(exclude='int64'))


scale_ger=pd.concat([numer_ger, cate_ger, ger[['Label']]], axis=1)

In [6]:
from sklearn.ensemble import RandomForestClassifier
X, y = scale_ger.loc[:, np.delete(scale_ger.columns.values, 
                                        np.where(scale_ger.columns.values == ['Label']))], \
        scale_ger.loc[:, 'Label']

# train on randomForest to get important features
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(X, y) 

feature_importances = pd.DataFrame(sorted(zip(scale_ger.columns, clf.feature_importances_), key=lambda x: x[1] * -1),
                                    columns = ['feature','importance'])


In [7]:
print(feature_importances)

           feature  importance
0       Status_A14    0.172064
1       Status_A11    0.158938
2         Duration    0.088689
3      History_A34    0.077666
4           Amount    0.070354
..             ...         ...
56   Property_A122    0.000000
57        Job_A171    0.000000
58        Job_A172    0.000000
59        Job_A173    0.000000
60  Telephone_A191    0.000000

[61 rows x 2 columns]


In [9]:
top_features = feature_importances[feature_importances.importance>0]['feature'].values

X, y = scale_ger.loc[:,top_features], scale_ger.loc[:,'Label']

In [20]:
y=y*(-1)+2

In [21]:
from sklearn.model_selection import train_test_split, KFold,cross_val_score,StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE

In [22]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0,stratify = y)
sm = SMOTE(random_state=0,sampling_strategy=1)
x_train_b, y_train_b = sm.fit_resample(x_train, y_train.ravel())

In [23]:
unique, counts = np.unique(y_train_b, return_counts=True)
dict(zip(unique, counts))

{0: 560, 1: 560}

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, recall_score, precision_score,make_scorer,confusion_matrix,brier_score_loss,accuracy_score
from sklearn import metrics
from scipy.stats import ks_2samp

In [25]:
def ks_stat(y, yhat):
    return ks_2samp(yhat[y==1], yhat[y!=1]).statistic

In [26]:
def type2_calcu(y,yhat):
    confusion =confusion_matrix(y,yhat)
    #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    return FP / float(FP + TN)
def type1_calcu(y,yhat):
    confusion =confusion_matrix(y,yhat)
    #[row, column]
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    return FN / float(TP + FN)

In [27]:
import xgboost as xgb
import lightgbm as lgb

In [28]:
def scores(model):
    train=[]
    test=[]
    y_train_c=model.predict(x_train)
    y_train_pre=model.predict_proba(x_train)
    y_pred_c=model.predict(x_test)
    y_pred = model.predict_proba(x_test)
    #roc
    train.append(roc_auc_score(y_train,y_train_pre[:,1]))
    test.append(roc_auc_score(y_test, y_pred[:, 1]))
    #ks
    train.append(ks_stat(y_train,y_train_pre[:,1]))
    test.append(ks_stat(y_test, y_pred[:, 1]))
    #brier
    train.append(brier_score_loss(y_train,y_train_pre[:,1]))
    test.append(brier_score_loss(y_test, y_pred[:, 1]))
    #acc
    train.append(accuracy_score(y_train,y_train_c))
    test.append(accuracy_score(y_test, y_pred_c))
    #t1
    train.append(type1_calcu(y_train,y_train_c))
    test.append(type1_calcu(y_test, y_pred_c))
    #t2
    train.append(type2_calcu(y_train,y_train_c))
    test.append(type2_calcu(y_test, y_pred_c))
    return train,test

In [29]:
x_train.columns[x_train.columns.str.contains("[\[\]<]")]

Index([], dtype='object')

In [30]:
models=[]
xgb_params = {'learning_rate': [0.01],'max_depth': [10,20,40,50],'subsample': [0.5,0.7]}
svc_params = {'kernel':['linear'], 'C':[1]}
lr_params={'penalty':('l1','l2'),'C':np.logspace(-2,2,10,base=10)}
dt_params={'max_depth':[2,4,6,8,10,12],'min_samples_leaf':[1,3,5,7]}
rf_params={'max_depth': [2,8,10,14],'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators':[1,5,10,15,20]}
nb_p={}
models.append(('svc',svc_params,SVC(probability=True,max_iter=100)))
models.append(('xgb',xgb_params,xgb.XGBClassifier()))
models.append(('LR',lr_params,LogisticRegression()))
models.append(('DT',dt_params,DecisionTreeClassifier()))
models.append(('RF',rf_params,RandomForestClassifier()))
models.append(('nb',nb_p,GaussianNB()))

results=[]
for model_name, parameters, model in models:
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1234)
    clf = GridSearchCV(model, parameters, cv=skf,scoring='roc_auc')
    clf.fit(x_train, y_train)
    print('For ',model_name,', the best parameters: ', clf.best_params_)
    results.append(scores(clf))
    
results=pd.DataFrame(results)
print(results)



For  svc , the best parameters:  {'C': 1, 'kernel': 'linear'}
For  xgb , the best parameters:  {'learning_rate': 0.01, 'max_depth': 20, 'subsample': 0.5}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

For  LR , the best parameters:  {'C': 0.21544346900318834, 'penalty': 'l2'}
For  DT , the best parameters:  {'max_depth': 4, 'min_samples_leaf': 7}
For  RF , the best parameters:  {'max_depth': 14, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 10}
For  nb , the best parameters:  {}
                                                   0  \
0  [0.543921130952381, 0.11071428571428571, 0.208...   
1  [0.9510565476190477, 0.7755952380952381, 0.140...   
2  [0.8207514880952382, 0.5232142857142857, 0.151...   
3  [0.7867745535714286, 0.43392857142857144, 0.16...   
4  [0.9468824404761905, 0.7601190476190476, 0.116...   
5  [0.780141369047619, 0.4928571428571429, 0.2309...   

                                                   1  
0  [0.5033333333333333, 0.09523809523809523, 0.21...  
1  [0.8308333333333333, 0.4785714285714286, 0.170...  
2  [0.8488095238095238, 0.5261904761904762, 0.144...  
3  [0.729702380952381, 0.40476190476190477, 0.181...  
4  [0.7726190476190476, 0.46428

In [31]:
results.columns=list('xc')
results.head()

Unnamed: 0,x,c
0,"[0.543921130952381, 0.11071428571428571, 0.208...","[0.5033333333333333, 0.09523809523809523, 0.21..."
1,"[0.9510565476190477, 0.7755952380952381, 0.140...","[0.8308333333333333, 0.4785714285714286, 0.170..."
2,"[0.8207514880952382, 0.5232142857142857, 0.151...","[0.8488095238095238, 0.5261904761904762, 0.144..."
3,"[0.7867745535714286, 0.43392857142857144, 0.16...","[0.729702380952381, 0.40476190476190477, 0.181..."
4,"[0.9468824404761905, 0.7601190476190476, 0.116...","[0.7726190476190476, 0.4642857142857143, 0.167..."


In [32]:
def split_col(data, columns):
    for c in columns:
        new_col = data.pop(c)
        max_len = max(list(map(len, new_col.values)))  # 最大长度
        new_col = new_col.apply(lambda x: x + [None]*(max_len - len(x)))  # 补空值，None可换成np.nan
        new_col = np.array(new_col.tolist()).T  # 转置
        for i, j in enumerate(new_col):
            data[c + str(i)] = j


split_col(results, columns=['x','c'])
results

Unnamed: 0,x0,x1,x2,x3,x4,x5,c0,c1,c2,c3,c4,c5
0,0.543921,0.110714,0.208754,0.5475,0.410714,0.55,0.503333,0.095238,0.21221,0.555,0.392857,0.566667
1,0.951057,0.775595,0.140162,0.89625,0.016071,0.308333,0.830833,0.478571,0.17076,0.79,0.064286,0.55
2,0.820751,0.523214,0.151799,0.7825,0.098214,0.495833,0.84881,0.52619,0.144182,0.8,0.064286,0.516667
3,0.786775,0.433929,0.161627,0.75875,0.117857,0.529167,0.729702,0.404762,0.181536,0.72,0.15,0.583333
4,0.946882,0.760119,0.116073,0.8575,0.016071,0.4375,0.772619,0.464286,0.167753,0.74,0.092857,0.65
5,0.780141,0.492857,0.230911,0.7275,0.278571,0.258333,0.714881,0.435714,0.256111,0.7,0.3,0.3


In [33]:
colnames=['train_auc','train_k-s','train_brier','train_acc','train_t1','train_t2','test_auc','test_k-s','test_brier','test_acc','test_t1','test_t2']
results.columns=colnames
results.to_csv("unbalanced-ger.csv")

In [34]:
x_train=x_train_b
y_train=y_train_b

In [35]:
models=[]
xgb_params = {'learning_rate': [0.01],'max_depth': [10,20,40,50],'subsample': [0.5,0.7]}
svc_params = {'kernel':['linear'], 'C':[1]}
lr_params={'penalty':('l1','l2'),'C':np.logspace(-2,2,10,base=10)}
dt_params={'max_depth':[2,4,6,8,10,12],'min_samples_leaf':[1,3,5,7]}
rf_params={'max_depth': [2,8,10,14],'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators':[1,5,10,15,20]}
nb_p={}
models.append(('svc',svc_params,SVC(probability=True,max_iter=100)))
models.append(('xgb',xgb_params,xgb.XGBClassifier()))
models.append(('LR',lr_params,LogisticRegression()))
models.append(('DT',dt_params,DecisionTreeClassifier()))
models.append(('RF',rf_params,RandomForestClassifier()))
models.append(('nb',nb_p,GaussianNB()))

results=[]
for model_name, parameters, model in models:
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1234)
    clf = GridSearchCV(model, parameters, cv=skf,scoring='roc_auc')
    clf.fit(x_train, y_train)
    print('For ',model_name,', the best parameters: ', clf.best_params_)
    results.append(scores(clf))
    
results=pd.DataFrame(results)
print(results)



For  svc , the best parameters:  {'C': 1, 'kernel': 'linear'}
For  xgb , the best parameters:  {'learning_rate': 0.01, 'max_depth': 20, 'subsample': 0.7}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

For  LR , the best parameters:  {'C': 35.93813663804626, 'penalty': 'l2'}
For  DT , the best parameters:  {'max_depth': 6, 'min_samples_leaf': 7}
For  RF , the best parameters:  {'max_depth': 14, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 15}
For  nb , the best parameters:  {}
                                                   0  \
0  [0.6118686224489797, 0.1982142857142857, 0.244...   
1  [0.9843654336734695, 0.8946428571428572, 0.108...   
2  [0.9206760204081633, 0.6946428571428571, 0.110...   
3  [0.8931967474489796, 0.6303571428571428, 0.128...   
4  [0.9973501275510204, 0.95, 0.04921224085127885...   
5  [0.8483928571428572, 0.6035714285714285, 0.221...   

                                                   1  
0  [0.4945833333333333, 0.09047619047619047, 0.24...  
1  [0.8241666666666666, 0.5023809523809524, 0.171...  
2  [0.8061904761904762, 0.5, 0.1575009341761698, ...  
3  [0.7363095238095239, 0.4642857142857143, 0.187...  
4  [0.8213690476190476, 0.51666666

In [36]:
results.columns=list('xc')
split_col(results, columns=['x','c'])
colnames=['train_auc','train_k-s','train_brier','train_acc','train_t1','train_t2','test_auc','test_k-s','test_brier','test_acc','test_t1','test_t2']
results.columns=colnames
results.to_csv("balanced-ger.csv")