In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime as dt
import os
import seaborn as sns
import math
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm,tqdm_notebook,tnrange
import json
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import shuffle
from itertools import product
from sklearn.metrics import mean_squared_error
import ast

In [2]:
# path = './'
# train_queries = pd.read_csv(path + 'train_queries.csv', parse_dates=['req_time'])
# train_plans   = pd.read_csv(path + 'train_plans.csv', parse_dates=['plan_time'])
# train_clicks  = pd.read_csv(path + 'train_clicks.csv')
# profiles      = pd.read_csv(path + 'profiles.csv') 
# test_queries  = pd.read_csv(path + 'test_queries.csv', parse_dates=['req_time'])
# test_plans    = pd.read_csv(path + 'test_plans.csv', parse_dates=['plan_time'])
# weather       = pd.read_csv(path + 'weather.csv')
# lnglat        = pd.read_csv(path + 'lnglat.csv')
# min_sub_d     = pd.read_csv(path + 'min_sub_d.csv',header=None)
# min_sub_o     = pd.read_csv(path + 'min_sub_o.csv',header=None)
# weather       = pd.read_csv(path + 'weather.csv')

In [3]:
def merge_data():
    path = './'
    data = pd.read_csv('./data_shenzhen.csv',parse_dates=['req_time','plan_time']) 
    return data

In [4]:
def od_feat(pdata):
    
    pdata['o_lng'] = pdata['o'].apply(lambda x: float(x.split(',')[0]))
    pdata['o_lat'] = pdata['o'].apply(lambda x: float(x.split(',')[1]))
    pdata['d_lng'] = pdata['d'].apply(lambda x: float(x.split(',')[0]))
    pdata['d_lat'] = pdata['d'].apply(lambda x: float(x.split(',')[1])) 
    pdata['od_manhattan_distance'] = abs(pdata['o_lng']-pdata['d_lng'])+abs(pdata['o_lat']-pdata['d_lat'])
    pdata['lon_diff'] = pdata['o_lng'] - pdata['d_lng']
    pdata['lat_diff'] = pdata['o_lat'] - pdata['d_lat']
    
#     pdata['d_in_wuhuan'] = Distance(pdata['d_lng'].values,pdata['d_lat'].values,center[0],center[1])
#     in_wuhuan_idx = pdata['d_in_wuhuan'] < 15.5
#     pdata['d_is_in_wuhuan'] = 0
#     pdata['d_is_in_wuhuan'][in_wuhuan_idx] = 1
    
#     pdata['o_in_wuhuan'] = Distance(pdata['o_lng'].values,pdata['o_lat'].values,center[0],center[1])
#     in_wuhuan_idx = pdata['o_in_wuhuan'] < 15.5
#     pdata['o_is_in_wuhuan'] = 0
#     pdata['o_is_in_wuhuan'][in_wuhuan_idx] = 1
    return pdata

def time_feat(pdata):
    time_feature = []
    for i in ['req_time']:
        pdata[i + '_minute'] = pdata[i].dt.minute
        pdata[i + '_hour'] = pdata[i].dt.hour
        pdata[i + '_weekday'] = pdata[i].dt.weekday
        time_feature.append(i + '_minute')
        time_feature.append(i + '_hour')
        time_feature.append(i + '_weekday') 

    ispeak_idx_am = (pdata['req_time'].dt.hour >=7) &  (pdata['req_time'].dt.hour<=9)
    pdata['req_time_ispeak_am'] = 0
    pdata['req_time_ispeak_am'][ispeak_idx_am] = 1
    time_feature.append('req_time_ispeak_am')
    ispeak_idx_pm = (pdata['req_time'].dt.hour >=17) &  (pdata['req_time'].dt.hour<=19)
    pdata['req_time_ispeak_pm'] = 0
    pdata['req_time_ispeak_pm'][ispeak_idx_pm] = 1
    time_feature.append('req_time_ispeak_pm')
    morning_idx = (pdata['req_time'].dt.hour >=6) &  (pdata['req_time'].dt.hour<=18)
    pdata['req_time_is_morning'] = 0
    pdata['req_time_is_morning'][morning_idx] = 1
    night_idx = ((pdata['req_time'].dt.hour >=0) &  (pdata['req_time'].dt.hour<=6)) | ((pdata['req_time'].dt.hour >=18) &  (pdata['req_time'].dt.hour<=24))
    pdata['req_time_is_night'] = 0
    pdata['req_time_is_night'][night_idx] = 1
    subway_stop_idx = (pdata['req_time'].dt.hour >=5) &  (pdata['req_time'].dt.hour<=23)
    pdata['req_time_is_subway_stop'] = 0
    pdata['req_time_is_subway_stop'][subway_stop_idx] = 1
    bus_stop_idx = (pdata['req_time'].dt.hour >=5) &  (pdata['req_time'].dt.hour<=22)
    pdata['req_time_is_bus_stop'] = 0
    pdata['req_time_is_bus_stop'][bus_stop_idx] = 1
    pdata['time_diff'] = pdata['plan_time'].values.astype(int)-pdata['req_time'].values.astype(int)
    time_feature.append('time_diff')
    pdata['weekend'] = (pdata['req_time_weekday']/5).astype(int)
    time_feature.append('weekend')
    pdata['hour_minute'] = pdata['req_time_hour']*60 +pdata['req_time_minute']
    time_feature.append('hour_minute')
    
    pdata['hour_minute'] = pdata['req_time_hour']*60 +pdata['req_time_minute']

    pdata['first_workday'] = (pdata['req_time_weekday'] == 0).astype(int)
    pdata['last_workday'] = (pdata['req_time_weekday'] == 4).astype(int)

    pdata['req_min_seg'] = ((pdata['req_time_minute'] + 30) / 60).astype(int)

    return pdata, time_feature

In [5]:
def gen_plan_feas(data):
    n                                           = data.shape[0]
    mode_list_feas                              = np.zeros((n, 12))
    max_dist, min_dist, mean_dist, std_dist     = np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))

    max_price, min_price, mean_price, std_price = np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))

    max_eta, min_eta, mean_eta, std_eta         = np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))

    min_dist_mode, max_dist_mode, min_price_mode, max_price_mode, min_eta_mode, max_eta_mode, first_mode, = \
    np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)),
    
    max_speed, min_speed, max_pd, min_pd, std_speed, std_pd, mean_speed, mean_pd = \
           np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))
    
    max_pe, min_pe, mean_pe, std_pe = np.zeros((n,)), np.zeros((n,)), np.zeros((n,)), np.zeros((n,))
    second_mode,max_speed_mode,min_speed_mode = np.zeros((n,)),np.zeros((n,)),np.zeros((n,))
    mode_texts = []
    for i, plan in tqdm(enumerate(data['plans_json'].values)):
        if len(plan) == 0:
            cur_plan_list   = []
        else:
            cur_plan_list   = plan
        if len(cur_plan_list) == 0:
            mode_list_feas[i, 0] =  1
            first_mode[i]        =  0
            second_mode[i]       = -1
            max_pe[i]            = -1
            min_pe[i]            = -1
            mean_pe[i]           = -1
            std_pe[i]            = -1
            max_speed[i]         = -1
            min_speed[i]         = -1
            std_speed[i]         = -1
            mean_speed[i]        = -1
            max_pd[i]            = -1
            min_pd[i]            = -1
            std_pd[i]            = -1
            mean_pd[i]           = -1
            max_dist[i]          = -1
            min_dist[i]          = -1
            mean_dist[i]         = -1
            std_dist[i]          = -1
            max_price[i]         = -1
            min_price[i]         = -1
            mean_price[i]        = -1
            std_price[i]         = -1
            max_eta[i]           = -1
            min_eta[i]           = -1
            mean_eta[i]          = -1
            std_eta[i]           = -1
            min_dist_mode[i]     = -1
            max_dist_mode[i]     = -1
            min_speed_mode[i]     = -1
            max_speed_mode[i]     = -1
            min_price_mode[i]    = -1
            max_price_mode[i]    = -1
            min_eta_mode[i]      = -1
            max_eta_mode[i]      = -1
            mode_texts.append('word_null')
        else:
            distance_list = []
            price_list = []
            eta_list = []
            mode_list = []
            for tmp_dit in cur_plan_list:
                distance_list.append(int(tmp_dit['distance']))
                if tmp_dit['price'] == '':
                    price_list.append(0)
                else:
                    price_list.append(int(tmp_dit['price']))
                eta_list.append(int(tmp_dit['eta']))
                mode_list.append(int(tmp_dit['transport_mode']))
            mode_texts.append(
                ' '.join(['word_{}'.format(mode) for mode in mode_list]))
            mode_arr = np.array(mode_list, dtype='int')
            distance_list                = np.array(distance_list)
            price_list                   = np.array(price_list)
            eta_list                     = np.array(eta_list)
            mode_list                    = np.array(mode_list, dtype='int')
            mode_list_feas[i, mode_list] = 1
            distance_sort_idx            = np.argsort(distance_list)
            price_sort_idx               = np.argsort(price_list)
            eta_sort_idx                 = np.argsort(eta_list)
            speed_sort_idx               = np.argsort((np.array(distance_list)/np.array(eta_list)).tolist())
            max_dist[i]                  = distance_list[distance_sort_idx[-1]]
            min_dist[i]                  = distance_list[distance_sort_idx[0]]
            max_speed[i]                 = (np.array(distance_list)/np.array(eta_list)).max() 
            min_speed[i]                 = (np.array(distance_list)/np.array(eta_list)).min()
            mean_speed[i]                = np.mean((np.array(distance_list)/np.array(eta_list)).tolist())
            std_speed[i]                 = np.std((np.array(distance_list)/np.array(eta_list)).tolist())
            max_pd[i]                    = (np.array(price_list)/np.array(distance_list)).max()
            min_pd[i]                    = (np.array(price_list)/np.array(distance_list)).min()
            mean_pd[i]                   = np.mean((np.array(price_list)/np.array(distance_list)).tolist())
            std_pd[i]                    = np.std((np.array(price_list)/np.array(distance_list)).tolist())
            max_pe[i]                    = (np.array(price_list)/np.array(eta_list)).max()
            min_pe[i]                    = (np.array(price_list)/np.array(eta_list)).min()
            mean_pe[i]                   = np.mean((np.array(price_list)/np.array(eta_list)).tolist())
            std_pe[i]                    = np.std((np.array(price_list)/np.array(eta_list)).tolist())
            mean_dist[i]                 = np.mean(distance_list)
            std_dist[i]                  = np.std(distance_list)
            max_price[i]                 = price_list[price_sort_idx[-1]]
            min_price[i]                 = price_list[price_sort_idx[0]]
            mean_price[i]                = np.mean(price_list)
            std_price[i]                 = np.std(price_list)
            max_eta[i]                   = eta_list[eta_sort_idx[-1]]
            min_eta[i]                   = eta_list[eta_sort_idx[0]]
            mean_eta[i]                  = np.mean(eta_list)
            std_eta[i]                   = np.std(eta_list)
            first_mode[i]                = mode_list[0]
            if len(mode_arr) < 2:
                second_mode[i] = -1
            else:
                second_mode[i] = mode_arr[1]
            max_dist_mode[i]             = mode_list[distance_sort_idx[-1]]
            min_dist_mode[i]             = mode_list[distance_sort_idx[0]]
            max_price_mode[i]            = mode_list[price_sort_idx[-1]]
            min_price_mode[i]            = mode_list[price_sort_idx[0]]
            max_eta_mode[i]              = mode_list[eta_sort_idx[-1]]
            min_eta_mode[i]              = mode_list[eta_sort_idx[0]]
            max_speed_mode[i]            = mode_list[speed_sort_idx[-1]]
            min_speed_mode[i]            = mode_list[speed_sort_idx[0]]
    feature_data                   =  pd.DataFrame(mode_list_feas)
    feature_data.columns           =  ['mode_feas_{}'.format(i) for i in range(12)]
    feature_data['max_dist']       =  max_dist
    feature_data['min_dist']       =  min_dist
    feature_data['mean_dist']      =  mean_dist
    feature_data['std_dist']       =  std_dist
    feature_data['max_price']      = max_price
    feature_data['min_price']      = min_price
    feature_data['mean_price']     = mean_price
    feature_data['std_price']      = std_price
    feature_data['max_eta']        = max_eta
    feature_data['min_eta']        = min_eta
    feature_data['mean_eta']       = mean_eta
    feature_data['std_eta']        = std_eta
    feature_data['max_dist_mode']  = max_dist_mode
    feature_data['min_dist_mode']  = min_dist_mode
    feature_data['max_price_mode'] = max_price_mode
    feature_data['min_price_mode'] = min_price_mode
    feature_data['max_speed_mode'] = max_price_mode
    feature_data['min_speed_mode'] = min_price_mode
    feature_data['max_eta_mode']   = max_eta_mode
    feature_data['min_eta_mode']   = min_eta_mode
    feature_data['first_mode']     = first_mode
    feature_data['max_speed']      = max_speed
    feature_data['min_speed']      = min_speed
    feature_data['mean_speed']     = mean_speed
    feature_data['std_speed']      = std_speed
    feature_data['max_pd']         = max_pd
    feature_data['min_pd']         = min_pd
    feature_data['mean_pd']        = mean_pd
    feature_data['std_pd']         = std_pd
    feature_data['max_pe']         = max_pe
    feature_data['min_pe']         = min_pe
    feature_data['mean_pe']         = mean_pe
    feature_data['std_pe']         = std_pe
    print('mode tfidf...')
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(mode_texts)
    svd_enc = TruncatedSVD(n_components=20, n_iter=20, random_state=2019)
    mode_svd = svd_enc.fit_transform(tfidf_vec)
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = ['svd_mode_{}'.format(i) for i in range(20)]
    plan_fea = pd.concat([feature_data, mode_svd], axis=1)
    plan_fea['sid'] = data['sid'].values
    print("Done")
    return plan_fea

In [6]:
def f1_weighted(labels,preds):
    preds = np.argmax(preds.reshape(12, -1), axis=0)
    score = f1_score(y_true=labels, y_pred=preds, average='weighted')
    return 'f1_weighted', score, True

In [7]:
def od_svd_feat(pdata):
    # od
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder
    #
    lbl_enc = LabelEncoder()
    pdata['olbl'] = lbl_enc.fit_transform(pdata['o'])
    lbl_enc = LabelEncoder()
    pdata['dlbl'] = lbl_enc.fit_transform(pdata['d'])
    lbl_enc = LabelEncoder()
    pdata['pidlbl'] = lbl_enc.fit_transform(pdata['pid'])
    # 
    onehot_enc = OneHotEncoder()
    o_tmp = onehot_enc.fit_transform(pdata['olbl'].values.reshape(-1,1))
    onehot_enc = OneHotEncoder()
    d_tmp = onehot_enc.fit_transform(pdata['dlbl'].values.reshape(-1,1))
    onehot_enc = OneHotEncoder()
    pid_tmp = onehot_enc.fit_transform(pdata['pidlbl'].values.reshape(-1,1))
    #
    svd = TruncatedSVD(n_components=30, n_iter=20, random_state=47)
    o_svd = svd.fit_transform(o_tmp)
    svd = TruncatedSVD(n_components=30, n_iter=20, random_state=47)
    d_svd = svd.fit_transform(d_tmp)
    svd = TruncatedSVD(n_components=100, n_iter=20, random_state=47)
    pid_svd = svd.fit_transform(pid_tmp)

    o_cols = ['o_svd_{}'.format(i) for i in range(30)]
    d_cols = ['d_svd_{}'.format(i) for i in range(30)]
    pid_cols = ['pid_svd_{}'.format(i) for i in range(100)]
    o_df = pd.DataFrame(o_svd, columns=o_cols)
    d_df = pd.DataFrame(d_svd, columns=d_cols)
    pid_df = pd.DataFrame(pid_svd, columns=pid_cols)
    pdata = pd.concat([pdata, o_df], axis=1)
    pdata = pd.concat([pdata, d_df], axis=1)
    pdata = pd.concat([pdata, pid_df], axis=1)
    return pdata

In [8]:
def construct_plan_feature_II(fdata):
    n = fdata.shape[0]
    max_price, max_dist, max_eta = np.zeros((n,)),np.zeros((n,)),np.zeros((n,))
    k = 13
    mode_feature = np.full((n,k*11), np.nan)
    ps = ['distance', 'price', 'eta']
    for i, plan in tqdm(enumerate(fdata['plans_json'].values)):
        if len(plan) == 0:
            cur_plan_list = []
            continue
        else:
            cur_plan_list = plan
        if len(cur_plan_list) == 0:
            price_list[i] = -1
            distance_list[i] = -1
            eta_list[i] = -1
        else:
            price_list = []
            distance_list = []
            eta_list = []
            for tmp_dit in cur_plan_list:             
                distance_list.append(int(tmp_dit['distance']))
                if tmp_dit['price'] == '':
                    price_list.append(0)
                else:
                    price_list.append(int(tmp_dit['price']))
                eta_list.append(int(tmp_dit['eta']))
            distance_list = np.array(distance_list)
            eta_list      = np.array(eta_list)
            price_list    = np.array(price_list)
            distance_sort_idx   = np.argsort(distance_list)
            price_sort_idx      = np.argsort(price_list)
            eta_sort_idx        = np.argsort(eta_list)
            max_dist[i] = distance_list[distance_sort_idx[-1]]
            max_eta [i] = eta_list[eta_sort_idx[-1]]
            max_price[i]= price_list[price_sort_idx[-1]]
        # {'1':{'distance': [], 'price':[], 'eta':[]}}
        mode_dict = {}
        # mode_dict
        for tmp_dict in plan:
            m = tmp_dict['transport_mode']
            if m not in mode_dict:
                one_dict = {}
                for p in ps:
                    one_dict[p] = []
                mode_dict[m] = one_dict
            for p in ps:
                mode_dict[m][p].append(0 if tmp_dict[p] == '' else int(tmp_dict[p]))


        rank = 0
        total_distance = 0
        total_price = 0
        total_eta = 0
       
        for m, v in mode_dict.items():
            rank += 1
            m = int(m)
            distance = np.array(v['distance']).mean()
            price = np.array(v['price']).mean()
            eta = np.array(v['eta']).mean()
            mode_feature[i, k*(m-1)] = distance
            mode_feature[i, k*(m-1)+1] = price
            mode_feature[i, k*(m-1)+2] = eta
            mode_feature[i, k*(m-1)+3] = distance / eta
            mode_feature[i, k*(m-1)+4] = price / distance
            mode_feature[i, k*(m-a1)+5] = price / eta
            mode_feature[i, k*(m-1)+6] = rank
            mode_feature[i, k*(m-1)+7] = distance / max_dist[i]
            mode_feature[i, k*(m-1)+8] = price / max_price[i]
            mode_feature[i, k*(m-1)+9] = eta   / max_eta[i]
            
        for m, v in mode_dict.items():
            m = int(m)
            distance = np.array(v['distance']).mean()
            price = np.array(v['price']).mean()
            eta = np.array(v['eta']).mean()
            mode_feature[i, k*(m-1)+10] = distance / total_distance
            mode_feature[i, k*(m-1)+11] = price / total_price
            mode_feature[i, k*(m-1)+12] = eta / total_eta

        
    cols = []
    for i in range(1, 12):
        for p in ps:
            cols.append('mode_' + str(i) + '_' + p)
        cols.append('mode_' + str(i) + '_speed')
        cols.append('mode_' + str(i) + '_p/d')
        cols.append('mode_' + str(i) + '_rank')
        cols.append('mode_' + str(i) + '_p/e')
        cols.append('mode_' + str(i) + 'd/max_d')
        cols.append('mode_' + str(i) + 'p/max_p')
        cols.append('mode_' + str(i) + 'e/max_e')
        cols.append('mode_' + str(i) + '_distance_ratio')
        cols.append('mode_' + str(i) + '_price_ratio')
        cols.append('mode_' + str(i) + '_eta_ratio')
        
    mode_feature_df = pd.DataFrame(mode_feature, columns=cols)
    mode_feature_df['sid'] = fdata['sid'].values
    print("Done")
    return mode_feature_df

In [9]:
def p_cross_feat(pdata):
    path = './'
    pprofiles = pd.read_csv(path + 'profiles.csv') 
    p1 = pd.DataFrame()
    p2 = pd.DataFrame()
    for i in range(1,len(pprofiles.columns)-1):
        if i%2 == 0:
            p1=pd.concat([p1,pprofiles['p{}'.format(i)]],axis=1)
        else:
            p2=pd.concat([p2,pprofiles['p{}'.format(i)]],axis=1)

    p1.columns = ['p_' + str(i) for i in range(32)]
    p2.columns = ['p_' + str(i) for i in range(33)]
    res = (p1+p2)
    res.drop(['p_32'],inplace=True,axis=1)
    res.columns = ['p_' + str(i) for i in range(32)]
    res['pid'] = pprofiles['pid']
    pdata  = pdata.merge(res, 'left', ['pid']) 
    return pdata

In [10]:
def gen_exp_data():
    data = merge_data()
    data = od_feat(data)
    data, time_feature = time_feat(data)
    data['plans_json'] = data['plans'].fillna('[]').apply(lambda x: json.loads(x))
    data_plans = gen_plan_feas(data)
    plan_features = [col for col in data_plans.columns if col not in ['sid']]
    data = data.merge(data_plans, on='sid', how='left')
    data_plans_II = construct_plan_feature_II(data)
    plan_features_II = [col for col in data_plans_II.columns if col not in ['sid']]
    data = data.merge(data_plans_II, on='sid', how='left')
    data['pid'] = data['pid'].fillna(-1)
    data = od_svd_feat(data)
    data = p_cross_feat(data)
    data = od_feat(data)
    print("All Done")
    return data,plan_features,plan_features_II

data,plan_features,plan_features_II = gen_exp_data()

555551it [02:12, 4194.14it/s]


mode tfidf...
Done


555551it [02:02, 4517.00it/s]


Done
All Done


In [11]:
extra = pd.read_csv('./extra.csv')
extra_I = pd.read_csv('./extra_I.csv')
data = data.merge(extra, on='o',how='left')
data = data.merge(extra_I, on='d',how='left')

In [13]:
import gc
del p_svd_feas
del extra
del extra_I
gc.collect()

178

In [4]:
p_svd_feas = pd.read_csv('./p_count_cm.csv')
#data = pd.concat([data,p_svd_feas],axis=1)

In [5]:
p_svd_feas

Unnamed: 0,p_svd_0,p_svd_1,p_svd_2,p_svd_3,p_svd_4,p_svd_5,p_svd_6,p_svd_7,p_svd_8,p_svd_9,...,p_svd_90,p_svd_91,p_svd_92,p_svd_93,p_svd_94,p_svd_95,p_svd_96,p_svd_97,p_svd_98,p_svd_99
0,-1.073548e-08,-2.126399e-09,-8.218471e-10,-3.200015e-10,-7.790980e-11,-6.352080e-10,4.518052e-10,1.187310e-09,-1.476402e-10,1.190747e-09,...,-4.396116e-09,-7.156334e-11,-1.854589e-09,2.939086e-09,6.118292e-09,2.694057e-09,-3.045167e-09,-5.390880e-09,2.759807e-09,-2.952383e-09
1,9.426705e+05,-1.317522e+04,1.316840e+05,-2.086238e+03,-3.173196e+04,4.362052e+04,-1.910117e+04,6.617387e+03,-1.279673e+04,1.693101e+04,...,-1.418552e-09,4.048275e-09,-4.965515e-09,1.431551e-09,8.370036e-09,1.610189e-09,1.210757e-09,-1.535771e-08,9.677103e-09,9.586367e-10
2,9.572547e+05,-1.933942e+04,6.505495e+04,2.775768e+04,4.067025e+04,-5.490227e+03,-7.718095e+03,-1.074269e+04,-9.159441e+04,6.843111e+03,...,8.045933e-09,-1.060797e-09,1.047891e-08,-1.199787e-08,-6.439693e-09,-6.958315e-10,-4.794997e-09,9.458692e-10,5.288873e-09,-1.686711e-10
3,9.183350e+05,1.289942e+05,2.823533e+04,-7.117944e+03,2.608265e+03,1.929139e+04,-8.127898e+04,-2.057412e+04,-6.521253e+04,-2.431390e+04,...,2.883051e-09,-1.134518e-09,-3.624266e-09,-2.985235e-09,3.196441e-09,4.385466e-10,5.094626e-09,-8.737050e-09,7.796017e-09,-4.153717e-11
4,-9.475658e-09,-1.928204e-10,2.818437e-10,9.005137e-10,4.357136e-09,3.462990e-09,3.952097e-09,-1.184085e-09,-2.587312e-09,-2.668985e-10,...,4.130230e-09,1.045404e-09,-7.516769e-09,-1.042367e-08,1.210490e-08,9.275685e-09,-7.260576e-10,-6.001576e-09,6.626658e-09,6.254984e-09
5,9.603575e+05,-4.887918e+04,-4.714479e+04,2.827031e+04,4.204507e+04,4.016458e+04,-6.668529e+03,5.217542e+04,-5.313438e+04,-4.186059e+04,...,-2.740687e-09,-3.225212e-09,3.001809e-09,-1.149886e-09,1.089509e-08,-6.935112e-10,-1.066467e-08,4.311104e-09,2.664783e-09,-3.229531e-09
6,2.240782e-10,2.864401e-09,-4.658275e-11,1.235873e-09,-1.159362e-09,1.378197e-09,-3.794130e-09,-4.299609e-10,6.420004e-10,-1.068199e-09,...,-7.976068e-09,-6.963218e-09,-8.332708e-09,-6.083745e-09,1.433143e-09,8.283788e-09,-6.686110e-09,6.407442e-09,-5.292181e-09,5.817420e-09
7,9.722916e+05,-2.421659e+04,-1.850040e+04,4.736009e+03,5.396102e+04,2.208310e+04,1.735341e+04,-2.978846e+04,-2.712256e+04,-2.090540e+04,...,-1.622163e-09,5.119603e-10,-7.444521e-10,-9.123624e-09,-1.169958e-09,-4.437398e-09,-5.014032e-09,8.375770e-09,-1.956480e-10,-3.526473e-09
8,9.308588e+05,-3.792692e+04,-4.246428e+04,5.947804e+04,-1.898932e+04,-4.048581e+04,-8.439949e+03,8.973810e+04,-8.749587e+03,1.234006e+04,...,7.151009e-10,-4.342477e-09,1.888106e-09,-7.819425e-09,1.604014e-09,-4.229943e-09,6.063698e-09,-6.830844e-09,-5.950751e-10,-6.096459e-11
9,-3.444853e-09,3.378498e-09,5.277790e-10,1.715498e-09,-1.366547e-09,2.533710e-09,7.724029e-10,-3.106159e-09,1.889364e-09,-2.354028e-09,...,-4.231036e-09,-2.682667e-09,-3.780920e-09,-6.298616e-09,-3.415675e-09,1.061764e-08,1.189792e-08,-5.852867e-09,-6.929619e-09,-9.728821e-09


In [21]:
tmp = data[['pid','click_mode']]
tmp['count'] =1
tmp = tmp.groupby(['pid','click_mode']).count().reset_index()
tmp = tmp.pivot(index='pid',values=['count'],columns='click_mode').reset_index()
tmp.columns=['count_pid_cm{}'.format(i) for i in range(-1,12)]
tmp = tmp.rename(columns={'count_pid_cm-1':'pid'})

In [23]:
tmp1 = pd.DataFrame(data['click_mode'])
tmp1 = pd.get_dummies(tmp1.click_mode)
tmp1.columns=['count_pid_cm{}'.format(j) for j in range(12)]
tmp[['count_pid_cm{}'.format(j) for j in range(12)]] = tmp[['count_pid_cm{}'.format(j) for j in range(12)]] - tmp1
tmp.head()

Unnamed: 0,pid,count_pid_cm0,count_pid_cm1,count_pid_cm2,count_pid_cm3,count_pid_cm4,count_pid_cm5,count_pid_cm6,count_pid_cm7,count_pid_cm8,count_pid_cm9,count_pid_cm10,count_pid_cm11
0,-1.0,63014.0,22176.0,26724.0,6757.0,3106.0,14494.0,2718.0,15683.0,590.0,13377.0,3889.0,2273.0
1,0.0,1.0,1.0,,,,,,,,,,
2,2.0,0.0,,1.0,,,,,,,,,
3,4.0,1.0,,,,,,,,,,,
4,7.0,26.0,15.0,21.0,5.0,5.0,15.0,5.0,17.0,,10.0,2.0,1.0


In [24]:
data = data.merge(tmp,on='pid',how='left')
data['sum'] = data[pid_count_cm].sum(axis=1)
for i in tnrange(12):
    data['count_pid_cm{}'.format(i)] /= data['sum']

In [54]:
tmp = data[['req_time_hour','click_mode']]
tmp['count'] =1
tmp = tmp.groupby(['req_time_hour','click_mode']).count().reset_index()
tmp = tmp.pivot(index='req_time_hour',values=['count'],columns='click_mode').reset_index()
tmp.columns=['count_hour_cm{}'.format(i) for i in range(-1,12)]
tmp = tmp.rename(columns={'count_hour_cm-1':'req_time_hour'})
data = data.merge(tmp,on='req_time_hour',how='left')
data['sum_hour'] = data[pid_hour_cm].sum(axis=1)
for i in tnrange(12):
    data['count_hour_cm{}'.format(i)] /= data['sum_hour']

In [65]:
tmp = data[['req_time_weekday','click_mode']]
tmp['count'] =1
tmp = tmp.groupby(['req_time_weekday','click_mode']).count().reset_index()
tmp = tmp.pivot(index='req_time_weekday',values=['count'],columns='click_mode').reset_index()
tmp.columns=['count_weekday_cm{}'.format(i) for i in range(-1,12)]
tmp = tmp.rename(columns={'count_weekday_cm-1':'req_time_weekday'})
data = data.merge(tmp,on='req_time_weekday',how='left')
data['sum_weekday'] = data[pid_hour_cm].sum(axis=1)
for i in tnrange(12):
    data['count_weekday_cm{}'.format(i)] /= data['sum_weekday']

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [70]:
tmp = data[['o','click_mode']]
tmp['count'] =1
tmp = tmp.groupby(['o','click_mode']).count().reset_index()
tmp = tmp.pivot(index='o',values=['count'],columns='click_mode').reset_index()
tmp.columns=['count_o_cm{}'.format(i) for i in range(-1,12)]
tmp = tmp.rename(columns={'count_o_cm-1':'o'})
data = data.merge(tmp,on='o',how='left')
data['sum_o'] = data[pid_hour_cm].sum(axis=1)
for i in tnrange(12):
    data['count_o_cm{}'.format(i)] /= data['sum_o']

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [75]:
tmp = data[['d','click_mode']]
tmp['count'] =1
tmp = tmp.groupby(['d','click_mode']).count().reset_index()
tmp = tmp.pivot(index='d',values=['count'],columns='click_mode').reset_index()
tmp.columns=['count_d_cm{}'.format(i) for i in range(-1,12)]
tmp = tmp.rename(columns={'count_d_cm-1':'d'})
data = data.merge(tmp,on='d',how='left')
data['sum_d'] = data[pid_hour_cm].sum(axis=1)
for i in tnrange(12):
    data['count_d_cm{}'.format(i)] /= data['sum_d']

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [85]:
tmp = data[['od_manhattan_distance','click_mode']]
tmp['count'] =1
tmp = tmp.groupby(['od_manhattan_distance','click_mode']).count().reset_index()
tmp = tmp.pivot(index='od_manhattan_distance',values=['count'],columns='click_mode').reset_index()
tmp.columns=['count_distance_cm{}'.format(i) for i in range(-1,12)]
tmp = tmp.rename(columns={'count_distance_cm-1':'od_manhattan_distance'})
data = data.merge(tmp,on='od_manhattan_distance',how='left')
data['sum_distance'] = data[pid_hour_cm].sum(axis=1)
for i in tnrange(12):
    data['count_distance_cm{}'.format(i)] /= data['sum_distance']

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [None]:
tmp = data[['od_manhattan_distance','click_mode']]
tmp['count'] =1
tmp = tmp.groupby(['od_manhattan_distance','click_mode']).count().reset_index()
tmp = tmp.pivot(index='od_manhattan_distance',values=['count'],columns='click_mode').reset_index()
tmp.columns=['count_distance_cm{}'.format(i) for i in range(-1,12)]
tmp = tmp.rename(columns={'count_distance_cm-1':'od_manhattan_distance'})
data = data.merge(tmp,on='od_manhattan_distance',how='left')
data['sum_distance'] = data[pid_hour_cm].sum(axis=1)
for i in tnrange(12):
    data['count_distance_cm{}'.format(i)] /= data['sum_distance']

In [99]:
profile_feature = ['p' + str(i) for i in range(66)]
p_cross_feature = ['p_' + str(i) for i in range(32)]
pid_cols = ['pid_svd_{}'.format(i) for i in range(20)]
o_cols = ['o_svd_{}'.format(i) for i in range(20)]
d_cols = ['d_svd_{}'.format(i) for i in range(20)]
pid_count_cm = ['count_pid_cm{}'.format(j) for j in range(12)]
pid_hour_cm = ['count_hour_cm{}'.format(j) for j in range(12)]
pid_weekday_cm = ['count_weekday_cm{}'.format(j) for j in range(12)]
pid_o_cm = ['count_o_cm{}'.format(j) for j in range(12)]
pid_d_cm = ['count_d_cm{}'.format(j) for j in range(12)]
pid_distance_cm = ['count_distance_cm{}'.format(j) for j in range(12)]
time_feature = ['req_min_seg','first_workday','last_workday','req_time_minute', 'req_time_hour', 'req_time_weekday',
                'time_diff','weekend','hour_minute','req_time_ispeak_am','req_time_ispeak_pm','req_time_is_morning',
                'req_time_is_night','req_time_is_subway_stop','req_time_is_bus_stop']
origin_num_feature = ['o_lng', 'o_lat', 'd_lng', 'd_lat'] #+ profile_feature
test_feature = ['lon_diff','lat_diff','tube_stops_d','car_prob_d','tube_stops_o','car_prob_o','sum']
p_svd = ['p_svd_{}'.format(i) for i in range(20)]
feature = origin_num_feature+plan_features+plan_features_II+profile_feature+time_feature+o_cols+d_cols+p_cross_feature+test_feature

print(len(feature))

432


In [100]:
test = ['click_mode','req_time']
train1_index = ((data.req_time >= '2018-10-01') &(data.req_time < '2018-11-24'))
feature_x = feature+test
train_1 = data[train1_index][feature_x].reset_index(drop=True)
c3 = train_1.loc[train_1['click_mode']==3.0]
c4 = train_1.loc[train_1['click_mode']==4.0]
c6 = train_1.loc[train_1['click_mode']==6.0]
# tmp = pd.DataFrame()
# tmp = pd.concat([tmp,c4,c4,c4,c3,c3,c6,c6],axis=0)
# tmp = shuffle(tmp, random_state=47)
# train_1 = pd.concat([train_1, tmp], axis=0)
train_1 = pd.concat([train_1,c4,c3,c6],axis=0)
train1_index = (train_1.req_time < '2018-12-01')
train_x = train_1[train1_index][feature].reset_index(drop=True)
train_y = train_1[train1_index].click_mode.reset_index(drop=True)
# train_X = train_x[train_x['time_diff']<500]
# train_Y = train_y[train_x['time_diff']<500]
valid_index = (data.req_time >= '2018-11-24') &(data.req_time < '2018-12-01')
valid_x =data[valid_index][feature].reset_index(drop=True)
valid_y =data[valid_index].click_mode.reset_index(drop=True)
test_index = (data.req_time >= '2018-12-01')
test_x = data[test_index][feature].reset_index(drop=True)

KeyError: "['mode_feas_0' 'mode_feas_1' 'mode_feas_2' 'mode_feas_3' 'mode_feas_4'\n 'mode_feas_5' 'mode_feas_6' 'mode_feas_7' 'mode_feas_8' 'mode_feas_9'\n 'mode_feas_10' 'mode_feas_11' 'max_dist' 'min_dist' 'mean_dist'\n 'std_dist' 'max_price' 'min_price' 'mean_price' 'std_price' 'max_eta'\n 'min_eta' 'mean_eta' 'std_eta' 'max_dist_mode' 'min_dist_mode'\n 'max_price_mode' 'min_price_mode' 'max_speed_mode' 'min_speed_mode'\n 'max_eta_mode' 'min_eta_mode' 'first_mode' 'max_speed' 'min_speed'\n 'mean_speed' 'std_speed' 'max_pd' 'min_pd' 'mean_pd' 'std_pd' 'max_pe'\n 'min_pe' 'mean_pe' 'std_pe' 'svd_mode_0' 'svd_mode_1' 'svd_mode_2'\n 'svd_mode_3' 'svd_mode_4' 'svd_mode_5' 'svd_mode_6' 'svd_mode_7'\n 'svd_mode_8' 'svd_mode_9' 'svd_mode_10' 'svd_mode_11' 'svd_mode_12'\n 'svd_mode_13' 'svd_mode_14' 'svd_mode_15' 'svd_mode_16' 'svd_mode_17'\n 'svd_mode_18' 'svd_mode_19' 'mode_1_distance' 'mode_1_price' 'mode_1_eta'\n 'mode_1_speed' 'mode_1_p/d' 'mode_1_rank' 'mode_1_p/e' 'mode_1d/max_d'\n 'mode_1p/max_p' 'mode_1e/max_e' 'mode_1_distance_ratio'\n 'mode_1_price_ratio' 'mode_1_eta_ratio' 'mode_2_distance' 'mode_2_price'\n 'mode_2_eta' 'mode_2_speed' 'mode_2_p/d' 'mode_2_rank' 'mode_2_p/e'\n 'mode_2d/max_d' 'mode_2p/max_p' 'mode_2e/max_e' 'mode_2_distance_ratio'\n 'mode_2_price_ratio' 'mode_2_eta_ratio' 'mode_3_distance' 'mode_3_price'\n 'mode_3_eta' 'mode_3_speed' 'mode_3_p/d' 'mode_3_rank' 'mode_3_p/e'\n 'mode_3d/max_d' 'mode_3p/max_p' 'mode_3e/max_e' 'mode_3_distance_ratio'\n 'mode_3_price_ratio' 'mode_3_eta_ratio' 'mode_4_distance' 'mode_4_price'\n 'mode_4_eta' 'mode_4_speed' 'mode_4_p/d' 'mode_4_rank' 'mode_4_p/e'\n 'mode_4d/max_d' 'mode_4p/max_p' 'mode_4e/max_e' 'mode_4_distance_ratio'\n 'mode_4_price_ratio' 'mode_4_eta_ratio' 'mode_5_distance' 'mode_5_price'\n 'mode_5_eta' 'mode_5_speed' 'mode_5_p/d' 'mode_5_rank' 'mode_5_p/e'\n 'mode_5d/max_d' 'mode_5p/max_p' 'mode_5e/max_e' 'mode_5_distance_ratio'\n 'mode_5_price_ratio' 'mode_5_eta_ratio' 'mode_6_distance' 'mode_6_price'\n 'mode_6_eta' 'mode_6_speed' 'mode_6_p/d' 'mode_6_rank' 'mode_6_p/e'\n 'mode_6d/max_d' 'mode_6p/max_p' 'mode_6e/max_e' 'mode_6_distance_ratio'\n 'mode_6_price_ratio' 'mode_6_eta_ratio' 'mode_7_distance' 'mode_7_price'\n 'mode_7_eta' 'mode_7_speed' 'mode_7_p/d' 'mode_7_rank' 'mode_7_p/e'\n 'mode_7d/max_d' 'mode_7p/max_p' 'mode_7e/max_e' 'mode_7_distance_ratio'\n 'mode_7_price_ratio' 'mode_7_eta_ratio' 'mode_8_distance' 'mode_8_price'\n 'mode_8_eta' 'mode_8_speed' 'mode_8_p/d' 'mode_8_rank' 'mode_8_p/e'\n 'mode_8d/max_d' 'mode_8p/max_p' 'mode_8e/max_e' 'mode_8_distance_ratio'\n 'mode_8_price_ratio' 'mode_8_eta_ratio' 'mode_9_distance' 'mode_9_price'\n 'mode_9_eta' 'mode_9_speed' 'mode_9_p/d' 'mode_9_rank' 'mode_9_p/e'\n 'mode_9d/max_d' 'mode_9p/max_p' 'mode_9e/max_e' 'mode_9_distance_ratio'\n 'mode_9_price_ratio' 'mode_9_eta_ratio' 'mode_10_distance'\n 'mode_10_price' 'mode_10_eta' 'mode_10_speed' 'mode_10_p/d'\n 'mode_10_rank' 'mode_10_p/e' 'mode_10d/max_d' 'mode_10p/max_p'\n 'mode_10e/max_e' 'mode_10_distance_ratio' 'mode_10_price_ratio'\n 'mode_10_eta_ratio' 'mode_11_distance' 'mode_11_price' 'mode_11_eta'\n 'mode_11_speed' 'mode_11_p/d' 'mode_11_rank' 'mode_11_p/e'\n 'mode_11d/max_d' 'mode_11p/max_p' 'mode_11e/max_e'\n 'mode_11_distance_ratio' 'mode_11_price_ratio' 'mode_11_eta_ratio'\n 'req_min_seg' 'first_workday' 'last_workday' 'req_time_minute'\n 'req_time_hour' 'req_time_weekday' 'time_diff' 'weekend' 'hour_minute'\n 'req_time_ispeak_am' 'req_time_ispeak_pm' 'req_time_is_morning'\n 'req_time_is_night' 'req_time_is_subway_stop' 'req_time_is_bus_stop'\n 'o_svd_0' 'o_svd_1' 'o_svd_2' 'o_svd_3' 'o_svd_4' 'o_svd_5' 'o_svd_6'\n 'o_svd_7' 'o_svd_8' 'o_svd_9' 'o_svd_10' 'o_svd_11' 'o_svd_12' 'o_svd_13'\n 'o_svd_14' 'o_svd_15' 'o_svd_16' 'o_svd_17' 'o_svd_18' 'o_svd_19'\n 'd_svd_0' 'd_svd_1' 'd_svd_2' 'd_svd_3' 'd_svd_4' 'd_svd_5' 'd_svd_6'\n 'd_svd_7' 'd_svd_8' 'd_svd_9' 'd_svd_10' 'd_svd_11' 'd_svd_12' 'd_svd_13'\n 'd_svd_14' 'd_svd_15' 'd_svd_16' 'd_svd_17' 'd_svd_18' 'd_svd_19' 'p_0'\n 'p_1' 'p_2' 'p_3' 'p_4' 'p_5' 'p_6' 'p_7' 'p_8' 'p_9' 'p_10' 'p_11'\n 'p_12' 'p_13' 'p_14' 'p_15' 'p_16' 'p_17' 'p_18' 'p_19' 'p_20' 'p_21'\n 'p_22' 'p_23' 'p_24' 'p_25' 'p_26' 'p_27' 'p_28' 'p_29' 'p_30' 'p_31'\n 'lon_diff' 'lat_diff' 'tube_stops_d' 'car_prob_d' 'tube_stops_o'\n 'car_prob_o' 'sum' 'count_hour_cm0' 'count_hour_cm1' 'count_hour_cm2'\n 'count_hour_cm3' 'count_hour_cm4' 'count_hour_cm5' 'count_hour_cm6'\n 'count_hour_cm7' 'count_hour_cm8' 'count_hour_cm9' 'count_hour_cm10'\n 'count_hour_cm11' 'count_weekday_cm0' 'count_weekday_cm1'\n 'count_weekday_cm2' 'count_weekday_cm3' 'count_weekday_cm4'\n 'count_weekday_cm5' 'count_weekday_cm6' 'count_weekday_cm7'\n 'count_weekday_cm8' 'count_weekday_cm9' 'count_weekday_cm10'\n 'count_weekday_cm11' 'count_o_cm0' 'count_o_cm1' 'count_o_cm2'\n 'count_o_cm3' 'count_o_cm4' 'count_o_cm5' 'count_o_cm6' 'count_o_cm7'\n 'count_o_cm8' 'count_o_cm9' 'count_o_cm10' 'count_o_cm11' 'count_d_cm0'\n 'count_d_cm1' 'count_d_cm2' 'count_d_cm3' 'count_d_cm4' 'count_d_cm5'\n 'count_d_cm6' 'count_d_cm7' 'count_d_cm8' 'count_d_cm9' 'count_d_cm10'\n 'count_d_cm11' 'count_distance_cm0' 'count_distance_cm1'\n 'count_distance_cm2' 'count_distance_cm3' 'count_distance_cm4'\n 'count_distance_cm5' 'count_distance_cm6' 'count_distance_cm7'\n 'count_distance_cm8' 'count_distance_cm9' 'count_distance_cm10'\n 'count_distance_cm11'] not in index"

In [88]:
del train_1
gc.collect()

21

In [None]:
cv_pred = np.zeros((train_x.shape[0],12))
test_pred = np.zeros((test_x.shape[0],12))

In [91]:
import time
t0 = time.time()
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=52, reg_alpha=0.1, reg_lambda=1,
                                max_depth=-1, n_estimators=5000, objective='multiclass',sub_feature=0.8,
                                subsample=0.8, colsample_bytree=0.8, subsample_freq=1,min_child_samples=50,  
                               learning_rate=0.1, random_state=2019, metric="None",n_jobs=-1,device='gpu')
eval_set = [(valid_x, valid_y)]

lgb_model.fit(train_x, train_y, eval_set=eval_set,eval_metric=f1_weighted,verbose=10, early_stopping_rounds=100)#0.698428
t1 = time.time()
#0.615321
print('lgb process elapse time: {}'.format(t1-t0))

Training until validation scores don't improve for 100 rounds.
[10]	valid_0's f1_weighted: 0.5722
[20]	valid_0's f1_weighted: 0.60651
[30]	valid_0's f1_weighted: 0.609084
[40]	valid_0's f1_weighted: 0.610273
[50]	valid_0's f1_weighted: 0.611477
[60]	valid_0's f1_weighted: 0.611998
[70]	valid_0's f1_weighted: 0.612427
[80]	valid_0's f1_weighted: 0.612734
[90]	valid_0's f1_weighted: 0.612783
[100]	valid_0's f1_weighted: 0.613162
[110]	valid_0's f1_weighted: 0.613181
[120]	valid_0's f1_weighted: 0.613461
[130]	valid_0's f1_weighted: 0.613771
[140]	valid_0's f1_weighted: 0.614157
[150]	valid_0's f1_weighted: 0.614283
[160]	valid_0's f1_weighted: 0.614189
[170]	valid_0's f1_weighted: 0.614107
[180]	valid_0's f1_weighted: 0.614159
[190]	valid_0's f1_weighted: 0.613662
[200]	valid_0's f1_weighted: 0.612967
[210]	valid_0's f1_weighted: 0.613221
[220]	valid_0's f1_weighted: 0.613452
[230]	valid_0's f1_weighted: 0.6139
[240]	valid_0's f1_weighted: 0.614125
Early stopping, best iteration is:
[144

In [47]:
imp = pd.DataFrame()
imp['fea'] = train_x.columns
imp['imp'] = lgb_model.feature_importances_ 
imp = imp.sort_values('imp',ascending = False)
imp.head(10)

Unnamed: 0,fea,imp
392,count_pid_cm0,990
371,sum,928
393,count_pid_cm1,746
395,count_pid_cm3,718
394,count_pid_cm2,682
397,count_pid_cm5,676
286,hour_minute,659
399,count_pid_cm7,638
401,count_pid_cm9,572
398,count_pid_cm6,558


In [None]:
pred = lgb_model.predict(valid_x) 
df_analysis = pd.DataFrame()
df_analysis['sid']   = data[valid_index]['sid']
df_analysis['label'] = valid_y.values
df_analysis['pred']  = pred
df_analysis['label'] = df_analysis['label'].astype(int)
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score,recall_score,precision_score
dic_ = df_analysis['label'].value_counts(normalize = True)
def get_weighted_fscore(y_pred, y_true):
    f_score = 0
    for i in range(12):
        yt = y_true == i
        yp = y_pred == i
        f_score += dic_[i] * f1_score(y_true=yt, y_pred= yp)
        print(i,dic_[i],f1_score(y_true=yt, y_pred= yp), precision_score(y_true=yt, y_pred= yp),recall_score(y_true=yt, y_pred= yp))
    print(f_score)
get_weighted_fscore(y_true =df_analysis['label'] , y_pred = df_analysis['pred'])

In [92]:
train2_index = ((data.req_time >= '2018-10-01') &(data.req_time <'2018-12-01'))
train_2 = data[train2_index][feature_x].reset_index(drop=True)
c3 = train_2.loc[train_2['click_mode']==3.0]
c4 = train_2.loc[train_2['click_mode']==4.0]
c6 = train_2.loc[train_2['click_mode']==6.0]
# tmp1= pd.DataFrame()
# tmp1 = pd.concat([tmp1,c4,c4,c4,c3,c3,c6,c6],axis=0)
# tmp1 = shuffle(tmp1)
# train_2 = pd.concat([train_2, tmp1], axis=0)
train_2 = pd.concat([train_2,c4,c3,c6],axis=0)
train2_index = (train_2.req_time <'2018-12-01')
all_train_x = train_2[train2_index][feature].reset_index(drop=True)
all_train_y = train_2[train2_index].click_mode.reset_index(drop=True)
#all_train_X = all_train_x[all_train_x['time_diff']<500]
#all_train_Y = all_train_y[all_train_x['time_diff']<500]

In [95]:
del data

gc.collect()

94

In [44]:
del train_2
del all_train_x
del all_train_y
gc.collect()

1405

In [97]:
data=merge_data()

In [98]:
#print(lgb_model.best_iteration_)
#lgb_model.n_estimators   = lgb_model.best_iteration_
#lgb_model.fit(all_train_x, all_train_y,verbose=10)
print('fit over')
result                   = pd.DataFrame()
result['sid']            = data[test_index]['sid']
result['recommend_mode'] = lgb_model.predict(test_x)
result['recommend_mode'] = result['recommend_mode'].astype(int)
print(len(result))
print(result['recommend_mode'].value_counts())
result[['sid', 'recommend_mode']].to_csv('./CAMM_shenzhen.csv', index=False)

fit over
55642
0     15394
2     10538
1      9621
7      7213
5      5498
9      4996
11      857
10      792
6       490
3       108
8        94
4        41
Name: recommend_mode, dtype: int64
