In [38]:
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.feature_selection import SelectPercentile,f_classif
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import sklearn

def set_summary(df):
    print('Data Overview:')
    print('Records:{0}\tDimension{1}'.format(df.shape[0],(df.shape[1]-1)))
    print('-'*30)
    print(df.head(4))
    print('-'*30)
    print('Data DESC:')
    print(df.describe())
    print('Data Dtype:')
    print(df.dtypes)
    print('-'*60)
    
def na_summary(df):
    na_cols=df.isnull().any(axis=0)
    print('NA Cols:')
    print(na_cols)
    print('-'*30)
    print('valid records for each Cols:')
    print(df.count())
    print('-'*30)
    na_lines=df.isnull().any(axis=1)
    print('Total number of NA lines is {0}'.format(na_lines.sum()))
    print('-'*30)
    
def label_summary(df):
    print('Labesl samples count:')
    print(df['value_level'].groupby(df['response']).count())
    print('-'*60)
    
def type_con(df):
    var_list={
        'edu':'int32',
        'user_level':'int32',
        'industry':'int32',
        'value_level':'int32',
        'act_level':'int32',
        'sex':'int32',
        'region':'int32'
    }
    for var,types in var_list.items():
        df[var]=df[var].astype(types)
    print('Data Dtype')
    print(df.dtypes)
    print('-'*30)
    return df

def na_replace(df):
    na_rules={
        'age':df['age'].mean(),
        'total_pageviews':df['total_pageviews'].mean(),
        'edu':df['edu'].median(),
        'edu_ages':df['edu_ages'].median(),
        'user_level':df['user_level'].median(),
        'industry':df['industry'].median(),
        'act_level':df['act_level'].median(),
        'sex':df['sex'].median(),
        'red_money':df['red_money'].mean(),
        'region':df['region'].median()
    }
    df=df.fillna(na_rules)
    print('Check NA exists:')
    print(df.isnull().any().sum())
    print('-'*30)
    return df

def symbol_con(df,enc_object=None,train=True):
    convert_cols=['edu','user_level','industry','value_level','act_level','sex','region']
    df_con=df[convert_cols]
    df_org=df[['age','total_pageviews','edu_ages','blue_money','red_money','work_hours']].values
    if train==True:
        enc=OneHotEncoder()
        enc.fit(df_con)
        df_con_new=enc.transform(df_con).toarray()
        new_matrix=np.hstack((df_con_new,df_org))
        return new_matrix,enc
    else:
        df_con_new=enc_object.transform(df_con).toarray()
        new_matrix=np.hstack((df_con_new,df_org))
        return new_matrix
    
def get_best_model(X,y):
    transform=SelectPercentile(f_classif,percentile=50)
    model_adaboost=AdaBoostClassifier()
    model_pipe=Pipeline(steps=[('ANOVA',transform),('model_adaboost',model_adaboost)])
    cv=StratifiedKFold(5)
    n_estimators=[20,50,80,100]
    score_methods=['accuracy','f1','precision','recall','roc_auc']
    mean_list=list()
    std_list=list()
    for parameter in n_estimators:
        t1=time.time()
        score_list=list()
        print('set parameters:%s'%parameter)
        for score_method in score_methods:
            model_pipe.set_params(model_adaboost__n_estimators=parameter)
            score_tmp=cross_val_score(model_pipe,X,y,scoring=score_method,cv=cv)
            score_list.append(score_tmp)
        score_matrix=pd.DataFrame(np.array(score_list),index=score_methods)
        score_mean=score_matrix.mean(axis=1).rename('mean')
        score_std=score_matrix.std(axis=1).rename('std')
        score_pd=pd.concat([score_matrix,score_mean,score_std],axis=1)
        mean_list.append(score_mean)
        std_list.append(score_std)
        print(score_pd.round(2))
        print('-'*60)
        t2=time.time()
        tt=t2-t1
        print('time:%s'%str(tt))
    mean_matrix=np.array(mean_list).T
    std_matrix=np.array(std_list).T
    mean_pd=pd.DataFrame(mean_matrix,index=score_methods,columns=n_estimators)
    std_pd=pd.DataFrame(std_matrix,index=score_methods,columns=n_estimators)
    print('Mean values for each parameter:')
    print(mean_pd)
    print('Std values for each parameter:')
    print(std_pd)
    print('-'*60)
    return transform


raw_data=pd.read_excel('order.xlsx',sheet_name=0)
X=raw_data.drop('response',axis=1)
y=raw_data['response']

set_summary(raw_data)

na_summary(raw_data)

label_summary(raw_data)

X_t1=na_replace(X)
X_t2=type_con(X_t1)




Data Overview:
Records:39999	Dimension13
------------------------------
    age  total_pageviews  edu  edu_ages  user_level  industry  value_level  \
0  39.0          77516.0  1.0      13.0         1.0       1.0            1   
1  50.0          83311.0  1.0      13.0         2.0       2.0            2   
2  38.0         215646.0  2.0       9.0         3.0       3.0            1   
3  53.0         234721.0  2.0       7.0         2.0       3.0            2   

   act_level  sex  blue_money  red_money  work_hours  region  response  
0        1.0  1.0        2174        0.0          40     1.0         0  
1        1.0  1.0           0        0.0          13     1.0         0  
2        1.0  1.0           0        0.0          40     1.0         0  
3        2.0  1.0           0        0.0          40     1.0         0  
------------------------------
Data DESC:
                age  total_pageviews           edu      edu_ages  \
count  39998.000000     3.999800e+04  39998.000000  39998.0000

In [39]:
X_new,enc=symbol_con(X_t2,enc_object=None,train=True)
transform=get_best_model(X_new,y)
transform.fit(X_new,y)
X_final=transform.transform(X_new)
final_model=AdaBoostClassifier(n_estimators=100)
final_model.fit(X_final,y)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


set parameters:20


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


              0     1     2     3     4  mean   std
accuracy   0.85  0.85  0.86  0.86  0.86  0.85  0.01
f1         0.66  0.64  0.66  0.66  0.66  0.66  0.01
precision  0.72  0.74  0.76  0.78  0.77  0.75  0.02
recall     0.60  0.57  0.59  0.58  0.57  0.58  0.01
roc_auc    0.91  0.90  0.91  0.91  0.91  0.91  0.00
------------------------------------------------------------
time:9.456801414489746
set parameters:50


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


              0     1     2     3     4  mean   std
accuracy   0.86  0.86  0.86  0.87  0.86  0.86  0.00
f1         0.66  0.66  0.67  0.69  0.68  0.67  0.01
precision  0.75  0.76  0.77  0.78  0.77  0.77  0.01
recall     0.59  0.58  0.59  0.62  0.61  0.60  0.02
roc_auc    0.91  0.91  0.91  0.92  0.92  0.91  0.00
------------------------------------------------------------
time:21.435871839523315
set parameters:80


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


              0     1     2     3     4  mean   std
accuracy   0.86  0.86  0.86  0.87  0.86  0.86  0.00
f1         0.67  0.66  0.68  0.70  0.68  0.68  0.01
precision  0.76  0.77  0.77  0.79  0.76  0.77  0.01
recall     0.60  0.58  0.61  0.62  0.62  0.61  0.02
roc_auc    0.92  0.91  0.92  0.92  0.92  0.92  0.00
------------------------------------------------------------
time:32.053879261016846
set parameters:100


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


              0     1     2     3     4  mean   std
accuracy   0.86  0.86  0.87  0.87  0.86  0.86  0.01
f1         0.67  0.67  0.69  0.70  0.69  0.68  0.01
precision  0.76  0.76  0.78  0.79  0.76  0.77  0.01
recall     0.60  0.59  0.61  0.63  0.62  0.61  0.01
roc_auc    0.92  0.92  0.92  0.92  0.92  0.92  0.00
------------------------------------------------------------
time:39.506059408187866
Mean values for each parameter:
                20        50        80        100
accuracy   0.853971  0.859972  0.862372  0.863422
f1         0.656367  0.672174  0.678690  0.682029
precision  0.753217  0.765324  0.770058  0.771176
recall     0.582011  0.599333  0.606846  0.611437
roc_auc    0.908314  0.914988  0.918537  0.919915
Std values for each parameter:
                20        50        80        100
accuracy   0.005232  0.004579  0.004738  0.005089
f1         0.009511  0.012776  0.013060  0.012878
precision  0.023149  0.009682  0.011431  0.012954
recall     0.013767  0.016036  0.016842 

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [43]:
new_data=pd.read_excel('order.xlsx',sheet_name=1)
final_reponse=new_data['final_response']
new_data=new_data.drop('final_response',axis=1)
set_summary(new_data)
na_summary(new_data)
new_X_t1=na_replace(new_data)
new_X_t2=type_con(new_X_t1)
new_X_t3=symbol_con(new_X_t2,enc_object=enc,train=False)
new_X_final=transform.transform(new_X_t3)


Data Overview:
Records:8843	Dimension12
------------------------------
   age  total_pageviews  edu  edu_ages  user_level  industry  value_level  \
0   61           243019   10         1         2.0       7.0            2   
1   33           215596    4         5         2.0       7.0            2   
2   25            31350    2        10         1.0       5.0            1   
3   23           246965    2        10         1.0      10.0            4   

   act_level  sex  blue_money  red_money  work_hours  region  
0          1    1           0          0          40     1.0  
1          5    1           0          0          40     6.0  
2          1    1           0          0          40     1.0  
3          2    1           0          0          40     1.0  
------------------------------
Data DESC:
               age  total_pageviews          edu     edu_ages   user_level  \
count  8843.000000     8.843000e+03  8843.000000  8843.000000  8841.000000   
mean     38.884428     1.90363

In [45]:
predict_labels=pd.DataFrame(final_model.predict(new_X_final),columns=['labels'])
predict_labels_pro=pd.DataFrame(final_model.predict_proba(new_X_final),columns=['pro1','pro2'])
predict_pd=pd.concat((new_data,predict_labels,predict_labels_pro),axis=1)
print('Predict info')
print(predict_pd.head(2))
print('-'*60)

Predict info
   age  total_pageviews  edu  edu_ages  user_level  industry  value_level  \
0   61           243019   10         1         2.0       7.0            2   
1   33           215596    4         5         2.0       7.0            2   

   act_level  sex  blue_money  red_money  work_hours  region  labels  \
0          1    1           0          0          40     1.0       0   
1          5    1           0          0          40     6.0       0   

       pro1      pro2  
0  0.504052  0.495948  
1  0.507487  0.492513  
------------------------------------------------------------


In [47]:
writer=pd.ExcelWriter('order_predict_result.xlsx')
predict_pd.to_excel(writer,'Sheet1')
writer.save()
print('final accuracy:{0}'.format(accuracy_score(final_reponse,predict_labels)))

final accuracy:0.8624901051679295
