In [1]:
import json
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
from geopy.distance import geodesic
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# 一、特征工程

## 1、合并训练集和测试集

In [2]:
def merge_data():
    # 标签
    train_clicks = pd.read_csv("data_set_phase1//train_clicks.csv")

    # 特征数据集
    train_plans = pd.read_csv("data_set_phase1//train_plans.csv")
    train_queries = pd.read_csv("data_set_phase1//train_queries.csv")
    test_plans = pd.read_csv("data_set_phase1//test_plans.csv")
    test_queries = pd.read_csv("data_set_phase1//test_queries.csv")

    # merge训练集
    tra_data = train_queries.merge(train_plans, on='sid', how='left')
    tra_data = tra_data.merge(train_clicks, on='sid', how='left')
    tra_data = tra_data.drop(['click_time'], axis=1)
    tra_data['click_mode'] = tra_data['click_mode'].fillna(0)

    # merge测试集
    tes_data = test_queries.merge(test_plans, on='sid', how='left')
    tes_data['click_mode'] = -1

    # concat训练集和测试集
    all_data = pd.concat([tra_data, tes_data], axis=0)
    all_data = all_data.drop(['plan_time'], axis=1)
    all_data = all_data.reset_index(drop=True)
    
    return all_data

## 2、抽取o、d的特征

### 将o、d分离，添加POI数据

In [3]:
def gen_od_feature(all_data):
    all_data['o1'] = all_data['o'].apply(lambda x : float(x.split(',')[0]))
    all_data['o2'] = all_data['o'].apply(lambda x : float(x.split(',')[1]))
    all_data['d1'] = all_data['d'].apply(lambda x : float(x.split(',')[0]))
    all_data['d2'] = all_data['d'].apply(lambda x : float(x.split(',')[1]))
    
    # od对结合，并labelencoder
    le = LabelEncoder()
    all_data['od'] = all_data['o'] + all_data['d']
    all_data['o_d'] = le.fit_transform(all_data['od'])
    
    # o是否等于d
    all_data['oisd'] = le.fit_transform(all_data['o']==all_data['d'])
    
    # 添加POI数据
    
#     # 经纬度距离
#     all_data['o_d_distance'] = all_data.apply(lambda x: geodesic((x.o2, x.o1),(x.d2, x.d1)).m, axis=1)
    
#     POI_data = pd.read_csv("data_set_phase1//POIs.csv", encoding='ANSI')
#     POIs = pd.DataFrame()
#     POIs.columns = POI_data['tag'].value_counts().index
    
#    all_data = all_data.drop(['o', 'd'], axis=1)
    return all_data

In [127]:
POIs_feature = pd.read_csv("tidy//POIs_feature.csv", encoding='ANSI')
POIs_feature.head()

Unnamed: 0,lng_lat,未知_x,出入口_x,房地产_x,公司企业_x,购物_x,行政地标_x,交通设施_x,教育培训_x,金融_x,...,文化传媒_x,休闲娱乐_x,医疗_x,运动健身_x,政府机构_x,自然地物_x,内部楼号,地产小区,飞机场,餐饮
0,"116.17,39.82",0,0,4,0,0,0,0,3,2,...,0,0,0,0,0,0,0,0,0,0
1,"116.52,39.77",0,0,4,2,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,"116.27,40.22",0,1,5,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,"116.27,39.96",0,0,0,3,0,0,0,3,0,...,1,0,1,0,1,0,0,0,0,0
4,"116.16,39.72",0,0,4,1,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [124]:
all_data['lng_lat'] = all_data['o1'].astype(str)+','+all_data['o2'].astype(str)
all_data = all_data.merge(POIs_feature, on='lng_lat', how='left')
all_data['lng_lat'] = all_data['d1'].astype(str)+','+all_data['d2'].astype(str)
all_data = all_data.merge(POIs_feature, on='lng_lat', how='left')

In [126]:
all_data.isnull().sum()

sid                0
pid                0
req_time           0
click_mode         0
o1                 0
o2                 0
d1                 0
d2                 0
o_d                0
svd_mode_0         0
svd_mode_1         0
svd_mode_2         0
svd_mode_3         0
svd_mode_4         0
svd_mode_5         0
svd_mode_6         0
svd_mode_7         0
svd_mode_8         0
svd_mode_9         0
mode_feas_0        0
mode_feas_1        0
mode_feas_2        0
mode_feas_3        0
mode_feas_4        0
mode_feas_5        0
mode_feas_6        0
mode_feas_7        0
mode_feas_8        0
mode_feas_9        0
mode_feas_10       0
                ... 
文化传媒_x_y        6782
休闲娱乐_x_y        6782
医疗_x_y          6782
运动健身_x_y        6782
政府机构_x_y        6782
自然地物_x_y        6782
未知_y_y          6782
出入口_y_y         6782
房地产_y_y         6782
公司企业_y_y        6782
购物_y_y          6782
行政地标_y_y        6782
交通设施_y_y        6782
教育培训_y_y        6782
金融_y_y          6782
酒店_y_y          6782
旅游景点_y_y     

## 3、抽取plans的特征

### 提取plans特征
### 1、max_distance、min_distance、mean_distance、std_distance
### 2、max_price、min_price、mean_price、std_price
### 3、max_eta、min_eta、mean_eta、std_eta
### 4、max_dis_mode、min_dis_mode、max_price_mode、min_price_mode、max_eta_mode、min_eta_mode
### 5、first_mode

In [4]:
def gen_plan_feature(all_data):
    n = all_data.shape[0]
    
    # 初始化推荐给用户的plans，类似于one-hot编码，推荐了哪一个mode，就置为1
    mode_list_feas = np.zeros((n, 12))

    # 初始化最大距离、最小距离、平均距离、距离标准差
    max_distance, min_distance, mean_distance, std_distance = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化最大价格、最小价格、平均价格、价格标准差
    max_price, min_price, mean_price, std_price = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化最大用时、最小用时、平均用时、用时标准差
    max_eta, min_eta, mean_eta, std_eta = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化最大距离mode、最小距离mode、最大价格mode、最小价格mode、最大用时mode、最小用时mode、第一推荐mode
    max_dis_mode, min_dis_mode, max_price_mode, min_price_mode, max_eta_mode, min_eta_mode, first_mode = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化推荐mode的顺序
    mode_texts=[]
    
    # 遍历每个用户的plan
    for i, plan in tqdm(enumerate(all_data['plans'].values)):
        try:
            user_plan_list = json.loads(plan)
        except:
            user_plan_list = []
        if len(user_plan_list)==0:
            mode_list_feas[i, 0] = 1

            first_mode[i] = 0

            max_distance[i] = -1
            min_distance[i] = -1
            mean_distance[i] = -1
            std_distance[i] = -1

            max_price[i] = -1
            min_price[i] = -1
            mean_price[i] = -1
            std_price[i] = -1

            max_eta[i] = -1
            min_eta[i] = -1
            mean_eta[i] = -1
            std_eta[i] = -1

            max_dis_mode[i] = -1
            min_dis_mode[i] = -1
            max_price_mode[i] = -1
            min_price_mode[i] = -1
            max_eta_mode[i] = -1
            min_eta_mode[i] = -1

            mode_texts.append('word_null')
        else:
            distance_list = []
            price_list = []
            eta_list = []
            mode_list = []

            # 抽取每个用户的每个plan
            for tmp_dict in user_plan_list:
                distance_list.append(int(tmp_dict['distance']))
                if tmp_dict['price']=='':
                    price_list.append(0)
                else:
                    price_list.append(int(tmp_dict['price']))
                eta_list.append(int(tmp_dict['eta']))
                mode_list.append(int(tmp_dict['transport_mode']))

            # 将每个用户的推荐模型按顺序添加
            mode_texts.append(' '.join(['word_{}'.format(mode) for mode in mode_list]))

            # 将list转换成ndarray
            distance_list = np.array(distance_list)
            price_list = np.array(price_list)
            eta_list = np.array(eta_list)
            mode_list = np.array(mode_list, dtype='int')
            
            # 将有plans推荐的用户的mode置为1
            mode_list_feas[i, mode_list] = 1

            # 获取索引
            distance_sort_idx = np.argsort(distance_list)
            price_sort_idx = np.argsort(price_list)
            eta_sort_idx = np.argsort(eta_list)

            # 构建特征
            max_distance[i] = distance_list[distance_sort_idx[-1]]
            min_distance[i] = distance_list[distance_sort_idx[0]]
            mean_distance[i] = np.mean(distance_list)
            std_distance[i] = np.std(distance_list)

            max_price[i] = price_list[price_sort_idx[-1]]
            min_price[i] = price_list[price_sort_idx[0]]
            mean_price[i] = np.mean(price_list)
            std_price[i] = np.std(price_list)

            max_eta[i] = eta_list[eta_sort_idx[-1]]
            min_eta[i] = eta_list[eta_sort_idx[0]]
            mean_eta[i] = np.mean(eta_list)
            std_eta[i] = np.std(eta_list)

            first_mode[i] = mode_list[0]

            max_dis_mode[i] = mode_list[distance_sort_idx[-1]]
            min_dis_mode[i] = mode_list[distance_sort_idx[0]]

            max_price_mode[i] = mode_list[price_sort_idx[-1]]
            min_price_mode[i] = mode_list[price_sort_idx[0]]

            max_eta_mode[i] = mode_list[eta_sort_idx[-1]]
            min_eta_mode[i] = mode_list[eta_sort_idx[0]]

    # 将特征存储进DataFrame中
    plan_feature_data = pd.DataFrame(mode_list_feas)
    plan_feature_data.columns = ['mode_feas_{}'.format(i) for i in range(12)]

    plan_feature_data['max_distance'] = max_distance
    plan_feature_data['min_distance'] = min_distance
    plan_feature_data['mean_distance'] = mean_distance
    plan_feature_data['std_distance'] = std_distance

    plan_feature_data['max_price'] = max_price
    plan_feature_data['min_price'] = min_price
    plan_feature_data['mean_price'] = mean_price
    plan_feature_data['std_price'] = std_price

    plan_feature_data['max_eta'] = max_eta
    plan_feature_data['min_eta'] = min_eta
    plan_feature_data['mean_eta'] = mean_eta
    plan_feature_data['std_eta'] = std_eta

    plan_feature_data['max_dis_mode'] = max_dis_mode
    plan_feature_data['min_dis_mode'] = min_dis_mode
    plan_feature_data['max_price_mode'] = max_price_mode
    plan_feature_data['min_price_mode'] = min_price_mode
    plan_feature_data['max_eta_mode'] = max_eta_mode
    plan_feature_data['min_eta_mode'] = min_eta_mode

    plan_feature_data['first_mode'] = first_mode

    # tiidf提取特征
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf.fit_transform(mode_texts)
    svd = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    mode_svd = svd.fit_transform(tfidf_vec)
    
    # 转换成dataframe
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = ['svd_mode_{}'.format(i) for i in range(10)]

    all_data = pd.concat([all_data, mode_svd, plan_feature_data], axis=1)
    all_data = all_data.drop(['plans'], axis=1)
    
    return  all_data

# 4、抽取profiles数据集特征

In [5]:
def gen_profiles_feature(all_data):
    profiles = pd.read_csv("data_set_phase1//profiles.csv")

    # 用于填充没有pid的用户
    profiles_na = np.zeros(67)
    profiles_na[0] = -1
    profiles_na = pd.DataFrame(profiles_na.reshape(1, -1))
    profiles_na.columns = profiles.columns
    profiles = profiles.append(profiles_na)
    
#     # 对特征进行奇异值分解，实现降维
#     pi = profiles.drop(['pid'], axis=1).values
#     svd = TruncatedSVD(n_components=60, n_iter=20, random_state=2019)
#     profiles_svd = svd.fit_transform(pi)
    
#     # 转换成dataframe
#     profiles_svd = pd.DataFrame(profiles_svd)
#     profiles_svd.columns = ['svd_profiles_{}'.format(i) for i in range(60)]
#     profiles_svd['pid'] = profiles['pid'].values

    # 合并数据集
    all_data['pid'] = all_data['pid'].fillna(-1)
    all_data = all_data.merge(profiles, on='pid', how='left')
    return all_data

# 5、抽取时间特征（req_time）

### 距离国庆节的天数、月份、一年中第几天、周几、小时、小时cat、是否是假期、是否是周末

In [7]:
def gen_time_feature(all_data):
    all_data['req_time'] = pd.to_datetime(all_data['req_time'])
    all_data['dayofweek'] = all_data['req_time'].dt.dayofweek
    all_data['hourofday'] = all_data['req_time'].dt.hour
    
    return all_data

# 6、提取pid特征

### 根据lgbm的特征重要度发现，pid是个强特，所以对pid进行特征提取
### 1、统计每个pid出现的次数，将次数作为特征
### 2、统计每个pid在每个类别中出现的次数（这个有问题）
### 3、pid与时间特征的组合出现的次数
### 4、pid与时间特征的组合在每个类别中出现的次数（这个有问题）

In [6]:
def gen_pid_feature(all_data):
    
    # 统计每个pid出现的次数
    pid_counts = pd.DataFrame()
    counts = all_data['pid'].value_counts()
    index = counts.index
    pid_counts['pid'] = index
    pid_counts['pid_counts'] = list(counts)
    
    # pid与o组合出现的次数
    grouped = all_data['o'].groupby(all_data['pid'])
    pidCombineO = grouped.value_counts()
    pidCombineO.to_csv('tidy//pidCombineO.csv')
    pidCombineO = pd.read_csv('tidy//pidCombineO.csv')
    pidCombineO.columns = ['pid', 'o', 'pid_o_counts']
    all_data = all_data.merge(pidCombineO, on=['pid', 'o'], how='left')
    
    # pid与d组合出现的次数
    grouped = all_data['d'].groupby(all_data['pid'])
    pidCombineD = grouped.value_counts()
    pidCombineD.to_csv('tidy//pidCombineD.csv')
    pidCombineD = pd.read_csv('tidy//pidCombineD.csv')
    pidCombineD.columns = ['pid', 'd', 'pid_d_counts']
    all_data = all_data.merge(pidCombineD, on=['pid', 'd'], how='left')
    
    # pid与od组合出现的次数
    grouped = all_data['o_d'].groupby(all_data['pid'])
    pidCombineOD = grouped.value_counts()
    pidCombineOD.to_csv('tidy//pidCombineOD.csv')
    pidCombineOD = pd.read_csv('tidy//pidCombineOD.csv')
    pidCombineOD.columns = ['pid', 'o_d', 'pid_od_counts']
    all_data = all_data.merge(pidCombineOD, on=['pid', 'o_d'], how='left') 
    
    # 填充缺失值
    all_data['pid_o_counts'] = all_data['pid_o_counts'].fillna(0)
    all_data['pid_d_counts'] = all_data['pid_d_counts'].fillna(0)
    all_data['pid_od_counts'] = all_data['pid_od_counts'].fillna(0)
    
    return all_data

# 7、特征组合

In [8]:
def combine_feature(all_data):
    le = LabelEncoder()
    
    # 组合pid与od
    all_data['combine_pid_od'] = all_data['pid'].astype(str) + all_data['od']
    all_data['combine_pid_od'] = le.fit_transform(all_data['combine_pid_od'])
    
    # 组合pid与first_mode
    all_data['combine_pid_fm'] = all_data['pid'].astype(str) + all_data['first_mode'].astype(str)
    all_data['combine_pid_fm'] = le.fit_transform(all_data['combine_pid_fm'])
    
    # 组合od与first_mode
    all_data['combine_od_fm'] = all_data['od'].astype(str) + all_data['first_mode'].astype(str)
    all_data['combine_od_fm'] = le.fit_transform(all_data['combine_od_fm'])
    
    # 组合oisd与first_mode
    all_data['oisd_fm'] = all_data['oisd'].astype(str) + all_data['first_mode'].astype(str)
    all_data['oisd_fm'] = le.fit_transform(all_data['oisd_fm'])
    
    # 组合pid与max_dis_mode
    all_data['combine_pid_maxdm'] = all_data['pid'].astype(str) + all_data['max_dis_mode'].astype(str)
    all_data['combine_pid_maxdm'] = le.fit_transform(all_data['combine_pid_maxdm'])
    
    # 组合pid与min_dis_mode
    all_data['combine_pid_mindm'] = all_data['pid'].astype(str) + all_data['min_dis_mode'].astype(str)
    all_data['combine_pid_mindm'] = le.fit_transform(all_data['combine_pid_mindm'])
    
    # 组合pid与max_price_mode
    all_data['combine_pid_maxpm'] = all_data['pid'].astype(str) + all_data['max_price_mode'].astype(str)
    all_data['combine_pid_maxpm'] = le.fit_transform(all_data['combine_pid_maxpm'])
    
    # 组合pid与min_price_mode
    all_data['combine_pid_minpm'] = all_data['pid'].astype(str) + all_data['min_price_mode'].astype(str)
    all_data['combine_pid_minpm'] = le.fit_transform(all_data['combine_pid_minpm'])
    
    # 组合pid与max_eta_mode
    all_data['combine_pid_maxem'] = all_data['pid'].astype(str) + all_data['max_eta_mode'].astype(str)
    all_data['combine_pid_maxem'] = le.fit_transform(all_data['combine_pid_maxem'])
    
    # 组合pid与min_eta_mode
    all_data['combine_pid_minem'] = all_data['pid'].astype(str) + all_data['min_eta_mode'].astype(str)
    all_data['combine_pid_minem'] = le.fit_transform(all_data['combine_pid_minem'])
    
    all_data = all_data.drop(['o', 'd', 'oisd', 'od'], axis=1)
    
    return all_data

In [42]:
all_data.head()

Unnamed: 0,sid,pid,req_time,click_mode,o1,o2,d1,d2,o_d,svd_mode_0,...,combine_pid_od,combine_pid_fm,combine_od_fm,oisd_fm,combine_pid_maxdm,combine_pid_mindm,combine_pid_maxpm,combine_pid_minpm,combine_pid_maxem,combine_pid_minem
0,3000821,-1.0,2018-11-02 17:54:30,9.0,116.29,39.97,116.32,39.96,50736,0.371088,...,17718,11,60125,11,4,7,6,5,7,5
1,3085857,210736.0,2018-11-16 10:53:10,1.0,116.39,39.84,116.33,39.79,132718,0.509334,...,451840,118061,173093,9,123757,118189,67614,72885,109045,79448
2,2944522,-1.0,2018-10-06 10:33:58,9.0,116.31,39.93,116.27,40.0,67656,0.376407,...,25028,11,82715,11,9,8,6,5,9,5
3,559931,202427.0,2018-11-23 14:54:11,1.0,116.27,39.88,116.39,39.9,35206,0.427644,...,425368,109628,40252,2,114878,109716,62741,67633,101150,73694
4,2819352,172251.0,2018-10-30 11:48:41,7.0,116.34,39.96,116.37,39.86,97471,0.513487,...,330980,76989,124090,9,80702,77115,44051,47400,71005,51705


# 7、切分数据集

In [9]:
def train_test_split(all_data):
    train_data = all_data[all_data['click_mode']!=-1]
    test_data = all_data[all_data['click_mode']==-1]
    test_data = test_data.drop(['click_mode'], axis=1)
    submit = test_data[['sid']].copy()
    
    train_data = train_data.drop(['sid', 'pid'], axis=1)
    train_y = train_data[['req_time','click_mode']]
    train_x = train_data.drop(['click_mode'], axis=1)
    test_x = test_data.drop(['sid','req_time','pid'], axis=1)
    
    return train_x, train_y, test_x, submit

In [10]:
all_data = merge_data()
all_data = gen_od_feature(all_data)
all_data = gen_plan_feature(all_data)
all_data = gen_profiles_feature(all_data)
all_data = gen_time_feature(all_data)
all_data = gen_pid_feature(all_data)
all_data = combine_feature(all_data)
train_x, train_y, test_x, submit = train_test_split(all_data)

594358it [01:33, 6385.55it/s]


# 8、国庆嫁接

In [None]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from time import gmtime, strftime
from sklearn.model_selection import StratifiedKFold

# 模型评估，采用f1-score
def f1_weighted(y_true, y_pred):
    y_pred = y_pred.reshape(12, -1).T
    y_pred = np.argmax(y_pred, axis=1)
    score = f1_score(y_true, y_pred, average='weighted')
    return 'weighted-f1-score', score, True

# 以国庆节期间的数据做训练集
tr_x = train_x[train_x.req_time < '2018-10-08']
tr_y = train_y[train_y.req_time < '2018-10-08']
tr_x = tr_x.drop(['req_time'], axis=1)
tr_y = tr_y.drop(['req_time'], axis=1)

# 以其他数据做测试集
te_x = train_x[train_x.req_time >= '2018-10-08']
te_x = te_x.drop(['req_time'], axis=1)
te_x = pd.concat([te_x, test_x], axis=0)

categorical_feature = ['pid', 'max_dis_mode', 'min_dis_mode', 'max_price_mode', 'min_price_mode',
                       'max_eta_mode', 'min_eta_mode', 'first_mode']

lgb = LGBMClassifier(boosting_type='gbdt', num_leaves=61, objective='multiclass', reg_alpha=0, reg_lambda=0.01, max_depth=1, 
                    n_estimators=2000, subsample=0.8, colsample_bytree=0.8, subsample_freq=1, min_child_samples=50,
                    learning_rate=0.05, random_state=2019, metric='multiclass', n_jobs=-1)
lgb.fit(tr_x, tr_y, categorical_feature=categorical_feature)

y_hat = lgb.predict(te_x)
y_pred = lgb.predict_proba(te_x)

In [None]:
y_hat

In [None]:
y_pred

# 8、模型训练&验证&提交

In [49]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from time import gmtime, strftime
from sklearn.model_selection import StratifiedKFold

# 模型评估，采用f1-score
def f1_weighted(y_true, y_pred):
    y_pred = y_pred.reshape(12, -1).T
    y_pred = np.argmax(y_pred, axis=1)
    score = f1_score(y_true, y_pred, average='weighted')
    return 'weighted-f1-score', score, True

# 切分训练集，以后七天的数据做验证集
tra_x = train_x[train_x.req_time < '2018-11-24']
tra_y = train_y[train_y.req_time < '2018-11-24']
valid_x = train_x[train_x.req_time >= '2018-11-24']
valid_y = train_y[train_y.req_time >= '2018-11-24']

tra_x = tra_x.drop(['req_time'], axis=1)
tra_y = tra_y.drop(['req_time'], axis=1)
valid_x = valid_x.drop(['req_time'], axis=1)
valid_y = valid_y.drop(['req_time'], axis=1)

categorical_feature = ['max_dis_mode', 'min_dis_mode', 'max_price_mode', 'min_price_mode',
                       'max_eta_mode', 'min_eta_mode', 'first_mode', 'o_d', 'pid_od_counts']

lgb = LGBMClassifier(boosting_type='gbdt', num_leaves=61, objective='multiclass', reg_alpha=0, reg_lambda=0.01, max_depth=1, 
                    n_estimators=2000, subsample=0.8, colsample_bytree=0.8, subsample_freq=1, min_child_samples=50,
                    learning_rate=0.05, random_state=2019, metric='multiclass', n_jobs=-1)
eval_set = [(valid_x, valid_y)]
lgb.fit(tra_x, tra_y, eval_set=eval_set, eval_metric=f1_weighted, categorical_feature=categorical_feature, verbose=10, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds.
[10]	valid_0's multi_logloss: 1.55541	valid_0's weighted-f1-score: 0.60418
[20]	valid_0's multi_logloss: 1.33314	valid_0's weighted-f1-score: 0.635889
[30]	valid_0's multi_logloss: 1.20069	valid_0's weighted-f1-score: 0.664579
[40]	valid_0's multi_logloss: 1.11522	valid_0's weighted-f1-score: 0.667863
[50]	valid_0's multi_logloss: 1.05645	valid_0's weighted-f1-score: 0.671195
[60]	valid_0's multi_logloss: 1.01521	valid_0's weighted-f1-score: 0.672498
[70]	valid_0's multi_logloss: 0.984733	valid_0's weighted-f1-score: 0.672988
[80]	valid_0's multi_logloss: 0.962333	valid_0's weighted-f1-score: 0.673742
[90]	valid_0's multi_logloss: 0.945165	valid_0's weighted-f1-score: 0.674552
[100]	valid_0's multi_logloss: 0.93212	valid_0's weighted-f1-score: 0.676264
[110]	valid_0's multi_logloss: 0.921795	valid_0's weighted-f1-score: 0.676446
[120]	valid_0's multi_logloss: 0.913815	valid_0's weighted-f1-score: 0.676815
[130]	valid_0's mu

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        importance_type='split', learning_rate=0.05, max_depth=1,
        metric='multiclass', min_child_samples=50, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=61,
        objective='multiclass', random_state=2019, reg_alpha=0,
        reg_lambda=0.01, silent=True, subsample=0.8,
        subsample_for_bin=200000, subsample_freq=1)

## 特征重要度

In [50]:
imp = pd.DataFrame()
imp['feature'] = tra_x.columns
imp['imp'] = lgb.feature_importances_
imp = imp.sort_values('imp', ascending = False)
imp

Unnamed: 0,feature,imp
4,o_d,4258
45,first_mode,720
115,pid_d_counts,248
17,mode_feas_2,243
25,mode_feas_10,237
24,mode_feas_9,236
26,mode_feas_11,230
16,mode_feas_1,225
22,mode_feas_7,221
23,mode_feas_8,196


In [None]:
pre = lgb.predict(valid_x)
f1_score(valid_y, pre, average='weighted')

## 提交结果

In [51]:
x = train_x.drop(['req_time'], axis=1)
y = train_y.drop(['req_time'], axis=1)

lgb.n_estimators = lgb.best_iteration_
lgb.fit(x, y, categorical_feature=categorical_feature)
pred_test = lgb.predict(test_x)

# 提交结果
now_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
submit['recommend_mode'] = pred_test
submit.to_csv('submission_{}.csv'.format(now_time), index=False)

# 五折

In [52]:
def train_test_split(all_data):
    train_data = all_data[all_data['click_mode']!=-1]
    test_data = all_data[all_data['click_mode']==-1]
    test_data = test_data.drop(['click_mode'], axis=1)
    submit = test_data[['sid']].copy()
    
    train_data = train_data.drop(['sid', 'pid', 'req_time'], axis=1)
    train_y = train_data['click_mode']
    train_x = train_data.drop(['click_mode'], axis=1)
    test_x = test_data.drop(['sid','pid','req_time'], axis=1)
    
    return train_x, train_y, test_x, submit

In [54]:
train_x, train_y, test_x, submit = train_test_split(all_data)

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from time import gmtime, strftime
from sklearn.model_selection import StratifiedKFold

def f1_weighted(y_pred, train_data):
    y_true = train_data.label
    y_pred = y_pred.reshape(12, -1).T
    y_pred = np.argmax(y_pred, axis=1)
    f1 = f1_score(y_true, y_pred, average='weighted')
    return 'weighted-f1-score', f1, True

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)

lgb_paras = {
    'objective': 'multiclass',
    'metrics': 'multiclass',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'lambda_l1': 0.01,
    'lambda_l2': 10,
    'num_class': 12,
    'seed': 2019,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4
}

categorical_feature = ['max_dis_mode', 'min_dis_mode', 'max_price_mode', 'min_price_mode',
                       'max_eta_mode', 'min_eta_mode', 'first_mode', 'o_d', 'pid_od_counts']
scores = []
result_proba = []
for tra_idx, val_idx in kfold.split(train_x, train_y):
    tra_x, tra_y, val_x, val_y = train_x.iloc[tra_idx], train_y[tra_idx], train_x.iloc[val_idx], train_y[val_idx]
    train_set = lgb.Dataset(tra_x, tra_y, categorical_feature=categorical_feature)
    val_set = lgb.Dataset(val_x, val_y, categorical_feature=categorical_feature)
    lgb_model = lgb.train(lgb_paras, train_set, valid_sets=[val_set], early_stopping_rounds=50, num_boost_round=40000, verbose_eval=50, feval=f1_weighted)
    val_pred = np.argmax(lgb_model.predict(val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = f1_score(val_y, val_pred, average='weighted')
    result_proba.append(lgb_model.predict(test_x, num_iteration=lgb_model.best_iteration))
    scores.append(val_score)
print('cv f1_score:', np.mean(scores))
pred_test = np.argmax(np.mean(result_proba, axis=0), axis=1)

# 提交结果
now_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
submit['recommend_mode'] = pred_test
submit.to_csv('submission_{}.csv'.format(now_time), index=False)

  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 1.0114	valid_0's weighted-f1-score: 0.670678
[100]	valid_0's multi_logloss: 0.918214	valid_0's weighted-f1-score: 0.67451
[150]	valid_0's multi_logloss: 0.898114	valid_0's weighted-f1-score: 0.675553
[200]	valid_0's multi_logloss: 0.893153	valid_0's weighted-f1-score: 0.675829
[250]	valid_0's multi_logloss: 0.892047	valid_0's weighted-f1-score: 0.676257
[300]	valid_0's multi_logloss: 0.892111	valid_0's weighted-f1-score: 0.676608
Early stopping, best iteration is:
[260]	valid_0's multi_logloss: 0.892	valid_0's weighted-f1-score: 0.676427
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 1.00734	valid_0's weighted-f1-score: 0.672151
[100]	valid_0's multi_logloss: 0.913239	valid_0's weighted-f1-score: 0.675563
[150]	valid_0's multi_logloss: 0.893202	valid_0's weighted-f1-score: 0.676754
[200]	valid_0's multi_logloss: 0.888276	valid_0's weighted-f1-score: 0