In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Locations of the training and test CSV files, relative to the notebook's
# working directory.
train_set_csv, test_set_csv = 'data/train_set.csv', 'data/test_set.csv'

In [2]:
# Read both CSV files into DataFrames (training file first, then test file).
train_set, test_set = (
    pd.read_csv(csv_path) for csv_path in (train_set_csv, test_set_csv)
)

# Print the training column index, then leave the first rows as the cell's
# displayed value.
print(train_set.columns)
train_set.head()

Index(['ID', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y'],
      dtype='object')


Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0


In [3]:
# Input preprocessing: one-hot encode the categorical columns and rescale
# 'balance' and 'duration' with a min-max scaler.
import sklearn.preprocessing as preprocessing

# Tag every row with its origin so the combined frame can be split back into
# train and test after encoding.
train_set['is_train'] = 1
test_set['is_train'] = 0

# Encode train and test together so both splits get the same dummy columns.
combine_df = pd.concat([train_set,test_set])
combine_df = pd.get_dummies(combine_df)

columns=['balance','duration']
# Fit the scaler on the training rows only, then apply it to every row, so no
# statistics from the test set leak into the training features. Test values
# outside the training range may therefore land slightly outside [0, 1].
# A single MinMaxScaler is sufficient: min-max scaling gives the same result
# with or without a preceding standardisation, so the StandardScaler pass that
# used to run first was redundant.
scaler = preprocessing.MinMaxScaler().fit(train_set[columns])
combine_df[columns] = scaler.transform(combine_df[columns])

# Disabled: taking the log of 'balance' adds little once the column is rescaled.
#combine_df['balance'] = combine_df['balance'].apply(lambda x: np.log(x + 1))

In [4]:
# Split the combined frame back into its two origins using the 'is_train' tag.
# The helper column is dropped from both parts; the test part also loses 'y'.
is_train_row = combine_df['is_train'] == 1
is_test_row = combine_df['is_train'] == 0

train_df = combine_df[is_train_row].drop(columns=['is_train'])
test_df = combine_df[is_test_row].drop(columns=['is_train', 'y'])

# Print the resulting training columns, then display the first training rows.
print(train_df.columns)

train_df.head()

Index(['ID', 'age', 'balance', 'campaign', 'day', 'duration', 'pdays',
       'previous', 'y', 'contact_cellular', 'contact_telephone',
       'contact_unknown', 'default_no', 'default_yes', 'education_primary',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'housing_no', 'housing_yes', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'loan_no', 'loan_yes',
       'marital_divorced', 'marital_married', 'marital_single', 'month_apr',
       'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')


Unnamed: 0,ID,age,balance,campaign,day,duration,pdays,previous,y,contact_cellular,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,1,43,0.075445,2,9,0.03865,-1,0,0.0,0,...,0,0,1,0,0,0,0,0,0,1
1,2,42,0.118888,1,7,0.025509,251,2,0.0,1,...,0,0,0,0,0,0,0,1,0,0
2,3,47,0.073748,2,14,0.01984,-1,0,0.0,1,...,0,0,0,0,0,0,0,0,0,1
3,4,28,0.063779,2,18,0.044834,-1,0,0.0,1,...,0,0,0,0,0,0,0,0,0,1
4,5,42,0.099804,5,21,0.048183,-1,0,0.0,0,...,0,0,1,0,0,0,0,0,0,1


In [5]:
# Build the training inputs: feature matrix and target vector.
from sklearn.model_selection import train_test_split

# All labelled rows are used for training. A hold-out validation split is kept
# here for reference but is disabled:
#train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
#val_x = val_df.drop(labels=['y','ID'],axis=1)
#val_y = val_df['y']

# The target is the 'y' column; the features are everything except 'y' and 'ID'.
train_y = train_df['y']
train_x = train_df.drop(columns=['y', 'ID'])

In [6]:
# Model selection: a gradient boosting classifier, scored with 3-fold
# cross-validated ROC AUC.
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(learning_rate=0.1, random_state=42)

# Cross-validation only produces the scores reported here; the final fit on
# all training rows happens separately below.
cv_score = cross_val_score(
    estimator=model,
    X=train_x,
    y=train_y,
    scoring='roc_auc',
    cv=3,
)
print('cv score:', cv_score.mean())

# Fit on the complete training data; the fitted estimator is the cell's
# displayed value.
model.fit(train_x, train_y)


cv score: 0.9243123342765306


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [7]:
# Predict on the test set and save the results.
# Column 1 of predict_proba is the probability of the second class in
# model.classes_, i.e. y == 1 given the 0/1 labels used for training.
test_y = model.predict_proba(test_df.drop(labels=['ID'],axis=1))[:,1]
sumbit = pd.DataFrame(zip(test_df['ID'],test_y),columns=['ID','pred'])

# index=False keeps the file to exactly the 'ID' and 'pred' columns; without it
# pandas also writes the row index as an extra unnamed first column.
sumbit.to_csv('sumbit.csv', index=False)

# Summary statistics of the predictions as a quick sanity check.
sumbit.describe()

Unnamed: 0,ID,pred
count,10852.0,10852.0
mean,30743.5,0.117102
std,3132.846895,0.191033
min,25318.0,0.004986
25%,28030.75,0.014217
50%,30743.5,0.032625
75%,33456.25,0.102486
max,36169.0,0.973403
