# Project1 量化金融信用评估实验

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import sklearn.preprocessing as sklp
import sklearn.decomposition as skld
import sklearn.manifold as sklm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation, metrics
from sklearn.linear_model import LogisticRegression 
from sklearn import neighbors 



## 1. Read the dataset files.

In [2]:
# Directory holding the public data set and the names of the CSV files read below.
dir_prefix = "/home/ubuntu/workspace/data_set_public/public_data/"
train_dataset = "project_LoanStats3d_securev1.csv"
sample_dataset = "project_2015Sample.csv"
sample_df = pd.read_csv(dir_prefix + sample_dataset)
train_df = pd.read_csv(dir_prefix + train_dataset, low_memory=False) # please note that this is a large file, it may take a little bit longer to read
# NOTE(review): this is not the last expression of the cell, so the preview is never displayed.
sample_df.head()
# NOTE(review): the label below says train_df, but the shape printed is that of sample_df.
#print 'Shape of train_df:', sample_df.shape
test_dataset = "antifraud_proj1_test.csv"
test_df = pd.read_csv(dir_prefix + test_dataset, low_memory=False)
#print 'Shape of test_df:', test_df.shape
#print 'Which fields are missing in the test dataset?'
#print set(sample_df.columns) - set(test_df.columns)

Shape of train_df: (98, 86)


## 2. 针对Train_df数据进行清理
- 这里将Train_df作为函数的输入
### 2.1 对各个非数值字段进行数值化

In [3]:
# Drop the columns listed below from train_df; loan_status itself is kept
# because it is the target variable.
columns_to_drop = ['open_il_6m','fico_range_low','member_id','fico_range_high','id']
train_df = train_df.drop(columns_to_drop,axis = 1)

# Keep only loans whose outcome is one of the three handled statuses, then
# encode the target: 'Charged Off' -> 1, 'Current' / 'Fully Paid' -> 0.
kept_statuses = ['Current','Charged Off','Fully Paid']
train_df = train_df[train_df['loan_status'].isin(kept_statuses)]
train_df['loan_status'] = train_df['loan_status'].replace(['Current','Fully Paid','Charged Off',],[0,0,1])

# Move the target to the first column position.
train_df_loan_status = train_df.loan_status
train_df = train_df.drop('loan_status',axis = 1)
train_df.insert(0,'loan_status',train_df_loan_status)

In [4]:
def _months_since_first_credit(value, ref_year=2018, ref_month=4):
    """Convert a 'Mon-YY' or 'Mon-YYYY' string into whole months before the reference month.

    Two-digit years below 20 are read as 20YY, all others as 19YY.
    """
    month_numbers = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,
                     'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
    month = month_numbers[value[0:3]]
    if value[-3] == '-':
        # 'Mon-YY': a dash sits right before the last two characters.
        year = int(value[-2:])
        year += 2000 if year < 20 else 1900
    else:
        year = int(value[-4:])
    return (ref_year - year) * 12 + ref_month - month


def Numeralization_df(train_df):
    """Encode the non-numeric loan fields as numbers and drop free-text columns.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw loan frame. It must contain 'Unnamed: 0', 'term', 'issue_d',
        'emp_title', 'emp_length', 'home_ownership', 'verification_status',
        'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state',
        'revol_util' (strings such as '12.5%') and 'earliest_cr_line'
        ('Mon-YY' or 'Mon-YYYY').

    Returns
    -------
    pandas.DataFrame
        A new frame with a 0..n-1 index. The input frame is not modified.

    Notes
    -----
    The date format of 'earliest_cr_line' is detected per value from the frame
    being processed. The previous implementation inspected the first row of the
    global ``test_df`` instead, which fails without that global and applies the
    wrong format whenever the two data sets differ.
    """
    # Drop into a new frame so the caller's DataFrame is left untouched:
    #   'Unnamed: 0' - index column left over from the CSV export
    #   'issue_d', 'emp_title', 'desc', 'title', 'zip_code' - not used as features
    #   'pymnt_plan' - only ever contains "n"
    train_df = train_df.drop(['Unnamed: 0','issue_d','emp_title','pymnt_plan',
                              'desc','title','zip_code'], axis = 1)
    train_df.index = range(len(train_df))

    # 'term' becomes a 0/1 flag (1 = 60 months).
    train_df['term'] = train_df['term'].replace([' 60 months',' 36 months'],[1,0])
    train_df['emp_length']=train_df['emp_length'].replace(['1 year', '10+ years', '2 years', '3 years', '4 years',
                                     '5 years', '6 years', '7 years', '8 years', '9 years', 
                                     '< 1 year', 'n/a'],[1,10,2,3,4,5,6,7,8,9,0,0])
    train_df['home_ownership'] = train_df['home_ownership'].replace(['MORTGAGE', 'RENT', 'OWN','ANY','NONE'],[0,0.5,1,0.5,0])
    train_df['verification_status'] = train_df['verification_status'].replace(['Source Verified', 
                                                                                 'Not Verified', 'Verified'],
                                                                               [0.5,0,1])
    train_df['purpose'] = train_df['purpose'].replace(['car','credit_card','debt_consolidation','educational',
                                                         'home_improvement','house','major_purchase','medical',
                                                         'moving','other','renewable_energy','small_business',
                                                         'vacation','wedding'],[0.6,0.2,1.0,0.2,0.4,0.4,0.6,0.6,0.4,0.2,0.6,0.8,0.2,0.4])

    # State score: per the author's note, a 7:3 blend of per-capita GDP and
    # total GDP, normalised to [0, 1].
    train_df['addr_state'] = train_df['addr_state'].replace(['ID','DC', 'TX', 'PA', 'GA', 'FL', 'NY', 'CA', 'TN', 'KS', 'MA', 
                                                               'RI','OH', 'OR', 'HI', 'SC', 'MD', 'AZ', 'WI', 'VA', 'CO', 
                                                               'IN', 'LA','NC', 'NJ', 'MO', 'NM', 'IL', 'MI', 'SD', 'WA', 
                                                               'NH', 'VT', 'AL','MN', 'CT', 'DE', 'NE', 'WV', 'MT', 'NV',
                                                               'OK', 'WY', 'AR', 'KY','MS', 'ME', 'UT', 'ND', 'AK'],
                                                              [0.125305614,0.249494545,0.065555092,0.039895621,0.085444756,0.593361556,
                                                                  0.190277765,0.278189313,1,0.227317672,0.192702231,0.155311576,
                                                                  0.131447429,0.280336943,0.132540357,0.106452086,0.069307246,
                                                                  0.15253082,0.302445071,0.21218416,0.030257468,0.137946485,
                                                                  0.208228197,0.110989561,0,0.048207504,0.16335986,0.231921874,
                                                                  0.161796013,0.117910048,0.2777898,0.050846682,0.094623649,
                                                                  0.494175602,0.191919255,0.085791932,0.160341405,0.220064923,
                                                                  0.111679786,0.046895947,0.10685987,0.109896491,0.441591068,
                                                                  0.097945954,0.207944043,0.06716049,0.242232775,0.141853116,
                                                                  0.013158262,0.213121431])

    # '12.5%' -> 0.125
    train_df['revol_util'] = train_df['revol_util'].str.strip('%').astype(float)/100

    # Replace the date string by the number of months between the first credit
    # line and April 2018. Assigning to the existing column keeps its position.
    train_df['earliest_cr_line'] = [_months_since_first_credit(value)
                                    for value in train_df['earliest_cr_line']]

#    print '字段数字化完成,如下字段进行数字化：\n',['term','emp_length',
#                                  'home_ownership','verification_status','purpose',
#                                  'addr_state','revol_util','earliest_cr_line']
#    print '如下字段被删除：\n',['Unnamed: 0','issue_d','emp_title','pymnt_plan','desc','title','zip_code']
    return train_df

### 2.2 找寻含有nan的字段，直接删除空缺超过40%的字段，其余空缺用中位数填充

In [5]:
def Find_non_from_df(train_df):
    """Drop sparsely populated columns and median-fill the remaining gaps.

    A column is removed when its number of missing values is strictly greater
    than ``len(train_df) / 2.5`` (that is, more than 40 % missing). Every
    surviving column that still contains missing values has them replaced by
    that column's median.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame to clean; columns with gaps are expected to be numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input frame is not modified.
    """
    n_rows = len(train_df)
    # One pass over the frame instead of calling describe() per column and
    # reading the count by position (deprecated for label-indexed Series).
    missing = train_df.isnull().sum()

    sparse_columns = list(missing[missing > n_rows / 2.5].index)
    train_df = train_df.drop(sparse_columns, axis = 1)
#    print '予以删除的字段及空缺值数量：\n',need_to_del_non_name

    for name in train_df.columns:
        if missing[name] > 0:
            # Median of the observed values, same as describe()['50%'].
            median = train_df[name].median()
            train_df[name] = train_df[name].fillna(median)
#    print '需要填充空缺的字段及其空缺数量：\n',need_to_solve_non,'\n它们已用字段中位数填充。\n'
    return train_df

### 2.3 字段异常值检测及处理
经过分析，仅有字段‘dti’，‘revol_util’，‘bc_util’适合进行异常值处理:
    
- 而'dti','revol_util','bc_util'是比率值，范围应在0~1或0~100


In [6]:
def Abnormal_solve_df(train_df):
    """Replace out-of-range values in the ratio columns by the column median.

    'revol_util' is first multiplied by 100 so that all three columns
    ('dti', 'revol_util', 'bc_util') are on the same 0-100 scale. In each of
    them, every value below 0 or above 100 is replaced by the median of that
    column (computed before any replacement).

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing the three columns above as numeric data.

    Returns
    -------
    pandas.DataFrame
        A new frame with a 0..n-1 index, with rows and columns in their original
        order. The input frame is not modified.

    Notes
    -----
    The previous implementation sorted the whole frame by each column in turn,
    so the rows came back ordered by 'bc_util'. For the test set that broke the
    correspondence between input rows and predictions. It also rescaled
    'revol_util' in the caller's frame, so calling it twice scaled twice.
    """
    name_abnormal = ['dti','revol_util','bc_util']

    # Copy so the rescaling below never touches the caller's data.
    train_df = train_df.copy()
    train_df.index = range(len(train_df))

    # Put revol_util on the percent scale used by the other two columns.
    train_df['revol_util'] = train_df['revol_util'] * 100

    for name in name_abnormal:
        column = train_df[name]
        median = column.median()
        out_of_range = (column < 0) | (column > 100)
        # Assigning to the existing column keeps its position in the frame.
        train_df[name] = column.mask(out_of_range, median)
#    print 'dti,revol_util,bc_util字段分别有如下数量异常值被处理,以中位数填充：\n',[[name_abnormal[i],num_abnormal[i]]for i in range(len(name_abnormal))]
    return train_df

## 3 根据清理后的数据，进行归一化处理

### 3.1 Z-score化处理

In [7]:
def Z_score_df(train_df):
    """Standardise every column except the first with a StandardScaler.

    Returns a tuple ``(scaled_frame, fitted_scaler)``. The scaled frame has the
    same column names as the input; its first column is the input's
    'loan_status' column, inserted unscaled.
    """
    column_names = list(train_df.keys())
    features = train_df.iloc[:, 1:]

    Z_score = sklp.StandardScaler()
    Z_score.fit(features)

    scaled = Z_score.transform(features)
    Z_score_train_df = pd.DataFrame(scaled)
    # The label is put back in front untouched; it is aligned on the index.
    Z_score_train_df.insert(0, 'loan_status', train_df['loan_status'])
    Z_score_train_df.columns = column_names
    return Z_score_train_df, Z_score

### 3.2 最小-最大标准化

In [8]:
def Min_Max_df(cl_train_df):
    """Rescale every column except the first with a MinMaxScaler.

    Returns a tuple ``(scaled_frame, fitted_scaler)``. The scaled frame has the
    same column names as the input; its first column is the input's
    'loan_status' column, inserted unscaled.
    """
    column_names = list(cl_train_df.keys())
    features = cl_train_df.iloc[:, 1:]

    Min_Max = sklp.MinMaxScaler()
    Min_Max.fit(features)

    Min_Max_train_df = pd.DataFrame(Min_Max.transform(features))
    # The label is put back in front untouched; it is aligned on the index.
    Min_Max_train_df.insert(0, 'loan_status', cl_train_df['loan_status'])
    Min_Max_train_df.columns = column_names
    return Min_Max_train_df, Min_Max

### 3.3 正则化处理

In [9]:
def Normalize_df(cl_train_df):
    """Apply sklearn's Normalizer to every column except the first.

    Returns a tuple ``(normalized_frame, fitted_normalizer)``. The normalized
    frame has the same column names as the input; its first column is the
    input's 'loan_status' column, inserted unchanged.
    """
    column_names = list(cl_train_df.keys())
    features = cl_train_df.iloc[:, 1:]

    Normalize = sklp.Normalizer()
    Normalize.fit(features)

    Normalize_train_df = pd.DataFrame(Normalize.transform(features))
    # The label is put back in front untouched; it is aligned on the index.
    Normalize_train_df.insert(0, 'loan_status', cl_train_df['loan_status'])
    Normalize_train_df.columns = column_names
    return Normalize_train_df, Normalize

In [10]:
# Cleaning pipeline for the training set. Each step rebinds train_df, so this
# cell is not idempotent: it must run exactly once after the label cell.
train_df = Numeralization_df(train_df)

train_df = Find_non_from_df(train_df)

train_df = Abnormal_solve_df(train_df)

# Three differently scaled versions of the cleaned frame. Each call also
# returns the fitted transformer so it can be applied to test_df later.
Z_train_df,Z_score = Z_score_df(train_df)
M_train_df,Min_Max = Min_Max_df(train_df)
N_train_df,Normalize = Normalize_df(train_df)

字段数字化完成,如下字段进行数字化：
['term', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 'revol_util', 'earliest_cr_line']
如下字段被删除：
['Unnamed: 0', 'issue_d', 'emp_title', 'pymnt_plan', 'desc', 'title', 'zip_code']
予以删除的字段及空缺值数量：
[['mths_since_last_delinq', 196926], ['mths_since_last_record', 333779], ['open_acc_6m', 384509], ['open_il_12m', 384509], ['open_il_24m', 384509], ['mths_since_rcnt_il', 385052], ['total_bal_il', 384509], ['il_util', 387175], ['open_rv_12m', 384509], ['open_rv_24m', 384509], ['max_bal_bc', 384509], ['all_util', 384509], ['inq_fi', 384509], ['total_cu_tl', 384509], ['inq_last_12m', 384509], ['mths_since_recent_bc_dlq', 301121], ['mths_since_recent_revol_delinq', 259695]]
需要填充空缺的字段及其空缺数量：
[['revol_util', 156], ['bc_open_to_buy', 3765], ['bc_util', 4013], ['mo_sin_old_il_acct', 11837], ['mths_since_recent_bc', 3614], ['mths_since_recent_inq', 43442], ['num_rev_accts', 1], ['num_tl_120dpd_2m', 18365], ['percent_bc_gt_75', 4029]] 
它们已用字段中位数填充。

数

### 考虑到test_df与train_df字段统一性，因此对test_df删除一些字段

In [11]:
# Same cleaning steps as for train_df, applied to the test set.
test_df = Numeralization_df(test_df)
# Hand-copied list of the columns reported as deleted for train_df in the
# output of the previous cell; it has to be kept in sync manually.
test_df = test_df.drop(['mths_since_last_delinq','mths_since_last_record',
                        'open_acc_6m','open_il_12m', 'open_il_24m',
                        'mths_since_rcnt_il', 'total_bal_il', 'il_util',
                        'open_rv_12m', 'open_rv_24m','max_bal_bc',
                        'all_util', 'inq_fi','total_cu_tl', 
                        'inq_last_12m', 'mths_since_recent_bc_dlq', 
                        'mths_since_recent_revol_delinq'],axis = 1)
# NOTE(review): gaps are filled with medians computed on test_df itself, not
# with the training medians - confirm this is intended.
test_df = Find_non_from_df(test_df)
test_df = Abnormal_solve_df(test_df)

字段数字化完成,如下字段进行数字化：
['term', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 'revol_util', 'earliest_cr_line']
如下字段被删除：
['Unnamed: 0', 'issue_d', 'emp_title', 'pymnt_plan', 'desc', 'title', 'zip_code']
予以删除的字段及空缺值数量：
[]
需要填充空缺的字段及其空缺数量：
[['dti', 30], ['revol_util', 50], ['bc_open_to_buy', 954], ['bc_util', 994], ['mo_sin_old_il_acct', 2520], ['mths_since_recent_bc', 914], ['mths_since_recent_inq', 11045], ['num_tl_120dpd_2m', 4444], ['percent_bc_gt_75', 956]] 
它们已用字段中位数填充。



### test_df 的归一化使用train_df中归一化的参数

In [12]:
# Apply the transformers fitted on the training features to test_df.
# This assumes test_df has exactly the training feature columns, in the same
# order, and no loan_status column - TODO confirm.
test_df_keys = list(test_df.keys())
Z_test_df = pd.DataFrame(Z_score.transform(test_df))
Z_test_df.columns = test_df_keys

M_test_df = pd.DataFrame(Min_Max.transform(test_df))
M_test_df.columns = test_df_keys

# NOTE(review): spelled N_test_Df (capital D), unlike Z_test_df / M_test_df.
N_test_Df = pd.DataFrame(Normalize.transform(test_df))
N_test_Df.columns = test_df_keys

## 4.数据降维—— PCA

- 清理后的数据为Z_train_df和Z_test_df(以Z-score归一数据集为例)

In [13]:
# Features only: skip the leading loan_status column.
variable = Z_train_df.iloc[:,1:]
train_df_keys = list(train_df.keys())
lenth = len(variable)
# Floor division keeps batch_size an integer on Python 3 as well; with plain
# "/" it becomes a float there. On Python 2 the value is unchanged.
pca = skld.IncrementalPCA(n_components=None,batch_size=lenth//100)
pca.fit(variable)

IncrementalPCA(batch_size=4050, copy=True, n_components=None, whiten=False)

- 变换后特征向量的特征值占比累计统计

In [14]:
# Cumulative explained variance in percent, one entry per principal component.
score = np.cumsum([ratio*100 for ratio in pca.explained_variance_ratio_])
# dim = number of leading components needed for the cumulative share to exceed
# 95 %. If that only happens at the very last component (or never, because of
# rounding), keep all components. The previous loop never checked the last
# entry and left dim undefined in that case.
above_threshold = np.nonzero(score > 95)[0]
if len(above_threshold) > 0:
    dim = int(above_threshold[0]) + 1
else:
    dim = len(score)
#print '95%占比的特征向量数量：',dim

95%占比的特征向量数量： 35


- 选取dim维特征向量,形成pca_train_df和pca_test_df 

In [15]:
# Project the training features and keep the first dim components, named
# X1..X<dim>; the label is put in front as column 'Y'.
train_components = pca.transform(variable)
pca_train_df = pd.DataFrame(train_components).iloc[:,:dim]
pca_train_df_keys = ['X'+str(k+1) for k in range(dim)]
pca_train_df.columns = pca_train_df_keys
pca_train_df.insert(0,'Y',Z_train_df['loan_status'])

# Same projection for the test set (no label column there).
test_components = pca.transform(Z_test_df)
pca_test_df = pd.DataFrame(test_components).iloc[:,:dim]
pca_test_df_keys = ['X'+str(k+1) for k in range(dim)]
pca_test_df.columns = pca_test_df_keys

In [16]:
#pca_train_df.to_csv('pca_train_df.csv')

## 5 生成样本集
- SMOTEENN
- EasyEnsemble
- BalanceCascade

参考blog：https://blog.csdn.net/kizgel/article/details/78553009?locationNum=6&fps=1

### 5.1 SMOTE
- SMOTE方法容易产生较多噪声，因此选用smoteenn

In [17]:
from sklearn.cross_validation import train_test_split
# Split pca_train_df into a training part and a 20 % hold-out part.
# No random_state is passed, so the split differs on every run.
X_train,X_test,y_train,y_test = train_test_split(pca_train_df.iloc[:,1:],pca_train_df['Y'],test_size = 0.2)


In [18]:
# Convert to plain NumPy arrays. np.asarray gives the same array as
# DataFrame.as_matrix() (deprecated, later removed from pandas) and, unlike it,
# still works when the cell is run a second time on an ndarray.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [19]:
# Convert to plain NumPy arrays. np.asarray gives the same array as
# Series.as_matrix() (deprecated, later removed from pandas) and, unlike it,
# still works when the cell is run a second time on an ndarray.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [20]:
def DataFramize(X,y,X_keys):
    """Wrap features and labels in DataFrames.

    The feature frame gets X_keys as column names; the label frame gets the
    single column name 'y'. Returns the pair ``(features, labels)``.
    """
    features = pd.DataFrame(X)
    features.columns = X_keys

    labels = pd.DataFrame(y)
    labels.columns = ['y']

    return features, labels

In [21]:
from imblearn.over_sampling import SMOTE
# Oversample the training part of the split with SMOTE (default settings, no
# random_state), then round the resampled features to three decimals.
X,y = X_train,y_train
X_sm,y_sm = SMOTE().fit_sample(X,y)
X_sm=np.round(X_sm,3)

In [22]:
#计算时间太久，已经存入文件‘train_smoteenn.csv'
#from imblearn.combine import SMOTEENN
#X_smn,y_smn = SMOTEENN().fit_sample(X,y)

In [23]:
# Resampled training set as a DataFrame: label 'Y' first, followed by the PCA
# component columns named as in pca_train_df.
smote_feature_names = list(pca_train_df.keys())[1:]
train_smote = pd.DataFrame(X_sm)
train_smote.columns = smote_feature_names
train_smote.insert(0,'Y',y_sm)

In [24]:
#train_smote.to_csv('train_smote.csv')

### 5.2 EasyEnsemble

In [25]:
# Disabled experiment: the EasyEnsemble code below is wrapped in a string
# literal, so none of it runs; the cell merely evaluates to that string.
"""
from imblearn.ensemble import EasyEnsemble
ee = EasyEnsemble(random_state=0, n_subsets=15)
X_e,y_e = ee.fit_sample(X, y)
"""

'\nfrom imblearn.ensemble import EasyEnsemble\nee = EasyEnsemble(random_state=0, n_subsets=15)\nX_e,y_e = ee.fit_sample(X, y)\n'

### 5.3 BalanceCascade

In [26]:
# Disabled experiment: the BalanceCascade code below is wrapped in a string
# literal, so none of it runs; the cell merely evaluates to that string.
"""
from imblearn.ensemble import BalanceCascade
from sklearn.linear_model import LogisticRegression
bc = BalanceCascade(random_state=0,n_max_subset=10,estimator=LogisticRegression(random_state=0))
X_bc,y_bc = bc.fit_sample(X,y)
"""

'\nfrom imblearn.ensemble import BalanceCascade\nfrom sklearn.linear_model import LogisticRegression\nbc = BalanceCascade(random_state=0,n_max_subset=10,estimator=LogisticRegression(random_state=0))\nX_bc,y_bc = bc.fit_sample(X,y)\n'

## 模型建立
- Logistic 回归

In [27]:
from sklearn import metrics  
# Fit on the SMOTE-resampled training set X_sm, y_sm.
LR = LogisticRegression(C = 0.1,penalty='l2',solver='sag',class_weight='balanced')
LR.fit(X_sm,y_sm)
#print LR.score(X_sm,y_sm)
# Hard labels and class probabilities for the hold-out split X_test.
LRy_predict_sm = LR.predict(X_test)
LRy_prob_sm = LR.predict_proba(X_test)
#print X_test
#print np.array(LRy_predict_sm)
# Second probability column, i.e. the one for label 1 when the classes are 0 and 1.
LRy_prob_sm1=LRy_prob_sm[:,1]
##print LRy_prob_sm1
#print y_test

0.64901383642


In [28]:
# k-nearest-neighbours classifier (default settings) fitted on the
# SMOTE-resampled training set.
knn = neighbors.KNeighborsClassifier()
knn.fit(X_sm,y_sm)
# Predictions now come from knn itself. Previously all three lines reused the
# logistic-regression model/outputs, so the KNN "vote" was a copy of the LR one.
knn_predict_sm = knn.predict(X_test)
knn_prob_sm = knn.predict_proba(X_test)
knn_prob_sm1=knn_prob_sm[:,1]

In [30]:
#print metrics.accuracy_score(y_test,LRy_predict_sm)
#print metrics.accuracy_score(y_test,LRy_predict_smn)

- ensemble-logistic

In [31]:
# Disabled experiment: the EasyEnsemble + logistic-regression loop below is
# wrapped in a string literal, so none of it runs.
"""
#easyensemble的子集数量
easy_num = 15
y_test_prob = np.array([0.0]*len(test_df))
for i in range(easy_num):
    LR.fit(X_e[i],y_e[i])
    y_test_prob += LR.predict_proba(pca_test_df)[:,1]
"""

'\n#easyensemble\xe7\x9a\x84\xe5\xad\x90\xe9\x9b\x86\xe6\x95\xb0\xe9\x87\x8f\neasy_num = 15\ny_test_prob = np.array([0.0]*len(test_df))\nfor i in range(easy_num):\n    LR.fit(X_e[i],y_e[i])\n    y_test_prob += LR.predict_proba(pca_test_df)[:,1]\n'

In [32]:
#y_test_prob = y_test_prob/easy_num
#y_test_prob

In [33]:
# Disabled experiment: the random-forest code below is wrapped in a string
# literal, so none of it runs and RF / RF_prob_sm are never defined.
"""
#RF模型，使用smote数据
from sklearn.ensemble import RandomForestClassifier 
RF=RandomForestClassifier(n_estimators=40,max_features=6,n_jobs=16,max_depth=6,min_samples_leaf=5000)
RF.fit(X_sm,y_sm)
RF.score(X_sm,y_sm)
RF_predict_sm = RF.predict(X_test)
RF_prob_sm = RF.predict_proba(X_test)
#print "Accuracy : %.4g" % metrics.accuracy_score(y_test, RF_predict_sm)
#print "AUC Score (Train): %f" % metrics.roc_auc_score(y_test, RF_predict_sm)
"""

'\n#RF\xe6\xa8\xa1\xe5\x9e\x8b\xef\xbc\x8c\xe4\xbd\xbf\xe7\x94\xa8smote\xe6\x95\xb0\xe6\x8d\xae\nfrom sklearn.ensemble import RandomForestClassifier \nRF=RandomForestClassifier(n_estimators=40,max_features=6,n_jobs=16,max_depth=6,min_samples_leaf=5000)\nRF.fit(X_sm,y_sm)\nRF.score(X_sm,y_sm)\nRF_predict_sm = RF.predict(X_test)\nRF_prob_sm = RF.predict_proba(X_test)\n#print "Accuracy : %.4g" % metrics.accuracy_score(y_test, RF_predict_sm)\n#print "AUC Score (Train): %f" % metrics.roc_auc_score(y_test, RF_predict_sm)\n'

In [34]:
# Gradient-boosting model (sklearn GradientBoostingClassifier, despite the
# LGBM name) fitted on the SMOTE-resampled training set X_sm, y_sm.
# min_samples_split is set to 0.75 % of the resampled sample count.
min_samples_split_init=int(0.0075*y_sm.shape[0])
LGBM=GradientBoostingClassifier(random_state=10,min_samples_split=min_samples_split_init,n_estimators=50,min_samples_leaf=5000,max_depth=6)
LGBM.fit(X_sm,y_sm)
# The training accuracy is computed here but neither stored nor displayed.
LGBM.score(X_sm,y_sm)
# Hard labels and class probabilities for the hold-out split X_test.
LGBMy_predict_sm = LGBM.predict(X_test)
LGBMy_prob_sm = LGBM.predict_proba(X_test)
#print "Accuracy : %.4g" % metrics.accuracy_score(y_test, LGBMy_predict_sm)
#print "AUC Score (Train): %f" % metrics.roc_auc_score(y_test, LGBMy_predict_sm)


In [45]:
# Majority vote of logistic regression, gradient boosting and KNN: a sample is
# labelled 1 when at least two of the three models predict 1.
#sumary_prob_sm=(LRy_prob_sm1+LGBMy_prob_sm[:,1]+RF_prob_sm[:,1]+knn_prob_sm1)/4
vote_count = LRy_predict_sm+LGBMy_predict_sm+knn_predict_sm
# Vectorised thresholding covers every element. The previous loop started at
# index 1, so element 0 kept its raw vote count (0..3) instead of a 0/1 label.
sumary_predict_sm = (vote_count >= 2).astype(vote_count.dtype)
lengh=len(sumary_predict_sm)
# NOTE(review): these votes are for X_test (the hold-out split of the training
# data), yet the output cell writes sumary_predict_sm to the result file.
# Presumably predictions for pca_test_df were intended - confirm.
sumary_predict_sm
#sumary_predict_sm=[x=1 for x in sumary_predict_sm if x>=2]
#sumary_prob_sm
#print "Accuracy : %.4g" % metrics.accuracy_score(y_test, sumary_prob_sm)
#print "AUC Score (Train): %f" % metrics.roc_auc_score(y_test, sumary_prob_sm)


array([0, 1, 0, ..., 1, 0, 1])

## Output the results
You do not need to modify the following lines. 

**Please do not modify the file name "antifraud_proj1_result.csv".**

In [46]:
#y_test_predict = LR.predict_proba(pca_test_df)

In [47]:
#Y_prediction_test = np.squeeze(d['Y_prediction_test'])

In [48]:
# Write the ensemble labels to the result file as a single 'Predict' column
# (the DataFrame index is written as the first CSV column).
d = {'Predict': sumary_predict_sm}
test_predict = pd.DataFrame(data=d)
test_predict.to_csv("antifraud_proj1_result.csv")
