In [36]:
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import re
import warnings
warnings.filterwarnings("ignore")

In [37]:
def decision_tree(data_train,data_test,depth,leaf_num,split_num,random_state=None):
    """Fit one regression tree per target and store integer predictions in ``data_test``.

    Three independent ``DecisionTreeRegressor`` models are trained, one each for
    ``repost``, ``comments`` and ``likes``. Only rows with ``logit == 0`` are
    used for training and prediction; per the original author's note, the other
    rows are treated as noise and always keep a prediction of 0.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Must contain ``repost``, ``comments``, ``likes`` and ``logit``; every
        other column is used as a feature.
    data_test : pandas.DataFrame
        Same columns as ``data_train``. It is modified in place: the columns
        ``repost_hat``, ``comments_hat`` and ``likes_hat`` are added (or
        overwritten).
    depth : int
        Passed to the trees as ``max_depth``.
    leaf_num : int
        Passed to the trees as ``min_samples_leaf``.
    split_num : int
        Passed to the trees as ``min_samples_split``.
    random_state : int or None, optional
        Passed to the trees as ``random_state``. The default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data_test``, now carrying the three integer
        ``*_hat`` columns.
    """
    targets=['repost','comments','likes']
    hat_columns=[name+'_hat' for name in targets]

    # Every row starts with a prediction of 0; rows with logit != 0 keep it.
    for column in hat_columns:
        data_test[column]=0

    # Only rows with logit == 0 are modelled.
    train=data_train[data_train['logit']==0]
    predict_rows=data_test['logit']==0
    if not predict_rows.any():
        # Nothing to predict; calling the estimator on zero rows would raise.
        return data_test

    regressor_train=train.drop(targets+['logit'],axis=1)
    regressor_test=data_test.loc[predict_rows].drop(targets+['logit']+hat_columns,axis=1)

    for target,column in zip(targets,hat_columns):
        # criterion is left at the estimator default (squared error): the
        # explicit 'mse' spelling is rejected by recent scikit-learn releases.
        model=DecisionTreeRegressor(min_samples_leaf=leaf_num,max_depth=depth,
                                    min_samples_split=split_num,random_state=random_state)
        model.fit(regressor_train,train[target])
        # np.round returns floats, so cast before storing in the integer column.
        predicted=np.round(model.predict(regressor_test),0).astype(int)
        # A single .loc assignment writes into data_test itself; the previous
        # chained form data_test[col][mask]=... may write to a temporary copy.
        data_test.loc[predict_rows,column]=predicted

    for column in hat_columns:
        data_test[column]=data_test[column].astype(int)
    return data_test

In [38]:
def precision(data):
    """Return the weighted share of rows whose predictions are accurate enough.

    For every row the relative deviations are

        deviation_repost   = |repost_hat   - repost|   / (repost   + 5)
        deviation_likes    = |likes_hat    - likes|    / (likes    + 3)
        deviation_comments = |comments_hat - comments| / (comments + 3)

    and the row counts as a hit (``sgn`` = 1) when
    ``1 - 0.5*deviation_repost - 0.25*deviation_likes - 0.25*deviation_comments - 0.8``
    is strictly positive. Each row is weighted by ``lcf_sum + 1``, where
    ``lcf_sum`` is ``repost + likes + comments`` capped at 100. The result is
    the hit weight divided by the total weight.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``repost``, ``likes``, ``comments`` and the matching
        ``*_hat`` columns. It is modified in place: the columns
        ``deviation_repost``, ``deviation_likes``, ``deviation_comments``,
        ``lcf_sum``, ``precision_1_-0.8`` and ``sgn`` are added (or overwritten).

    Returns
    -------
    float
        The weighted precision.

    Raises
    ------
    ZeroDivisionError
        If the total weight is zero, e.g. for a frame without rows.
    """
    # Column-wise arithmetic replaces the per-row Python lambdas.
    data['deviation_repost']=(data['repost_hat']-data['repost']).abs()/(data['repost']+5)
    data['deviation_likes']=(data['likes_hat']-data['likes']).abs()/(data['likes']+3)
    data['deviation_comments']=(data['comments_hat']-data['comments']).abs()/(data['comments']+3)

    # Interaction total per row, capped at 100.
    data['lcf_sum']=(data['repost']+data['likes']+data['comments']).clip(upper=100)

    margin=1-0.5*data['deviation_repost']-0.25*data['deviation_likes']-0.25*data['deviation_comments']-0.8
    data['precision_1_-0.8']=margin

    # 1.0 for a hit, 0.0 for a miss; a missing margin stays missing. The column
    # is rebuilt in full so values left over from an earlier call cannot survive.
    data['sgn']=(margin>0).astype(float).where(margin.notna())

    weights=data['lcf_sum']+1
    # skipna=False lets missing values propagate, as a plain Python sum would.
    hit_weight=float((weights*data['sgn']).sum(skipna=False))
    total_weight=float(weights.sum(skipna=False))
    return hit_weight/total_weight

In [39]:
# Read the training and validation tables; both use the first CSV column as
# the index and the first row as the header.
data_train,data_valid=[
    pd.read_csv(csv_path,index_col=[0],header=0)
    for csv_path in ('data_train.txt','data_valid.txt')
]


In [40]:
# Preview the first rows of the validation frame.
data_valid.head()

Unnamed: 0,user_id,weibo_id,time,repost,comments,likes,tfidf,text,time_weekday,time_weekend,...,comment_more_ave_pr,like_more_ave_pr,max_f/l,max_c/l,min_f/l,min_c/l,mean_f/l,mean_c/l,night,logit
0,f349a67d1cd7c8683c5bbc5f8486e193,83674a60e5310195fc35d97ea8f45c46,2015-07-15 01:16:24,0,0,0,1.390068,论优衣库试衣间隔音效果好坏？ http://t.cn/RL5aSzp（分享自 @知乎）,3,0,...,0.125,0.0,2.0,9.5,0.0,0.0,0.5,3.272727,0,1
1,875a4a77b339d93f819e2c4de5bd0b57,f2cdcdbcec9ff47cbb3c6a636e4b92a3,2015-07-01 04:11:48,0,0,0,0.731223,#IT#【武汉一专车司机因毒驾被治安拘留】新华网武汉6月29日电(记者冯国栋)记者28日从武...,3,0,...,0.000112,0.000223,1.1,0.2,0.0,0.0,1.11284,0.377432,1,1
2,0fc17bf5e2dc789dd48505df1f5b14fd,4c1e2418127811d212d0e3867a99db3e,2015-07-13 05:07:28,0,0,1,2.988692,羽田机场~~ http://t.cn/RLGJidL,1,0,...,0.142857,0.0,0.0,2.0,0.0,0.0,0.0,7.0,1,1
3,dd749a5af07c04ce7de451273a983671,419dd71d562883ef836e774bc3f4e163,2015-07-30 14:24:28,0,0,0,0.962645,"分享了-KUTLUK-的歌单《Piano,奏响灵魂深处...》 http://t.cn/...",4,0,...,0.126316,0.021053,1.333333,2.333333,0.0,0.0,0.27027,2.081081,0,1
4,6623347e5f19f35f2d02ad515b96524c,9a2f48a870843d1964a03c6642b309d5,2015-07-21 01:06:53,0,0,0,1.018777,#糟蹋酒鬼菜#老板每次推荐完他们家的下酒菜，就会拼命的安利上等的酒……我每次都会妥协的喝两罐汽水。,2,0,...,0.114519,0.010225,5.75,2.75,0.0,0.0,2.038835,2.067961,0,0


In [41]:
# First carve out two subsets to smoke-test the code.
# If decision_tree filters on is_noise instead of logit, the feature list has
# to carry is_noise rather than logit.
# Keep training columns 3-6 and everything from column 8 on (column 7 is left out).
train_columns=list(data_train.columns)
features_list=train_columns[3:7]+train_columns[8:]
train_subset=data_train.loc[:,features_list]
valid_subset=data_valid.loc[:,features_list]

In [42]:
# Display the validation frame restricted to the columns in features_list.
data_valid.loc[:,features_list]

Unnamed: 0,repost,comments,likes,tfidf,number_in_train,forward_max,comment_max,like_max,forward_min,comment_min,...,http,stock,app,title,ad,hotwords,keywords,is_noise,night,logit
0,0,0,0,1.390068,88.0,4.0,19.0,2.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
1,0,0,0,0.731223,26900.0,11.0,2.0,10.0,0.0,0.0,...,1,0,0,0,0,0,1,0,1,1
2,0,0,1,2.988692,21.0,0.0,2.0,1.0,0.0,0.0,...,1,0,0,0,0,0,1,0,1,1
3,0,0,0,0.962645,190.0,4.0,7.0,3.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
4,0,0,0,1.018777,489.0,23.0,11.0,4.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184209,0,0,1,1.737585,100.0,1.0,8.0,7.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
184210,93,4,23,2.173594,76.0,37.0,31.0,57.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
184211,0,0,0,0.431630,910.0,4.0,6.0,3.0,0.0,0.0,...,1,0,0,1,0,0,1,0,0,1
184212,2,1,0,1.715618,152.0,95.0,4.0,9.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0


In [44]:
# Grid search over tree depth (ii) and minimum leaf size (jj); the minimum
# split size (kk) stays fixed. The best score and its parameters are tracked.
grid_columns=['repost','comments','likes','tfidf','number_in_train','forward_max','comment_max', 'like_max','forward_mean',
       'comment_mean', 'like_mean','time_weekend','panduan','length_all','length_chinese','sharing','book',
       'mention','emoji', 'video','http','title','hotwords', 'keywords','is_noise','stock', 'logit']
best_precision=0
best_para=[0,0,0]
kk=200
# decision_tree only reads the training frame, so one slice serves every run.
train_subset=data_train.loc[:,grid_columns]
for ii in range(20,30,5): #depth
    for jj in range(10,210,50): #leaf_num
        print('%d th depth, %d leaf num is beginning'%(ii,jj))
        # decision_tree and precision both add columns to the frame they are
        # given, so each run needs a fresh validation slice.
        valid_subset=data_valid.loc[:,grid_columns]
        valid_subset2=decision_tree(train_subset,valid_subset,ii,jj,kk)
        score=precision(valid_subset2)
        if score>best_precision:
            best_precision=score
            best_para=[ii,jj,kk]

        print(best_precision)
        print(best_para,'\n')

print('Final is \n')
print(best_precision)
print(best_para)

20 th depth, 10 leaf num is beginning
0.28408584260091707
[20, 10, 200] 

20 th depth, 60 leaf num is beginning
0.28408584260091707
[20, 10, 200] 

20 th depth, 110 leaf num is beginning
0.28408584260091707
[20, 10, 200] 

20 th depth, 160 leaf num is beginning
0.28408584260091707
[20, 10, 200] 

25 th depth, 10 leaf num is beginning
0.28459883365347194
[25, 10, 200] 

25 th depth, 60 leaf num is beginning
0.28459883365347194
[25, 10, 200] 

25 th depth, 110 leaf num is beginning
0.28459883365347194
[25, 10, 200] 

25 th depth, 160 leaf num is beginning
0.28459883365347194
[25, 10, 200] 

Final is 

0.28459883365347194
[25, 10, 200]


In [45]:
# If decision_tree filters on is_noise instead of logit, swap logit for
# is_noise in the column list below.
eval_columns=['repost','comments','likes','tfidf','number_in_train','forward_max','comment_max', 'like_max','forward_mean',
       'comment_mean', 'like_mean','time_weekend','panduan','length_all','length_chinese','sharing','book',
       'mention','emoji', 'video','http','title','hotwords', 'keywords','logit','stock']
train_subset=data_train.loc[:,eval_columns]
valid_subset=data_valid.loc[:,eval_columns]
# One fit with depth 20, minimum leaf size 60 and minimum split size 300.
valid_subset2=decision_tree(train_subset,valid_subset,20,60,300)
precision(valid_subset2)

0.28737255214622237

In [None]:
# Scratch cell: print the values produced by range(10,40,5).
for ii in range(10,40,5):
    print(ii)

In [None]:
#valid_subset.head()

In [None]:
# Works only after decision_tree has been run on valid_subset: that call adds
# the *_hat columns to the frame it receives, and precision reads them.
precision(valid_subset)