In [14]:
import json
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
from geopy.distance import geodesic
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# 一、特征工程

## 1、合并训练集和测试集

In [15]:
def merge_data():
    """Load the phase-1 CSV files and stack train and test queries in one frame.

    Training queries are left-joined with their plans and clicks on ``sid``;
    ``click_time`` is discarded and a missing ``click_mode`` becomes 0.
    Test queries are left-joined with their plans and get ``click_mode`` -1,
    which marks them as test rows. ``plan_time`` is removed from the result.

    Returns
    -------
    pandas.DataFrame
        Training rows followed by test rows, with a fresh 0..n-1 index.
    """
    # Read every table up front, in a fixed order.
    table_names = ("train_clicks", "train_plans", "train_queries",
                   "test_plans", "test_queries")
    tables = {
        name: pd.read_csv("data_set_phase1//{}.csv".format(name))
        for name in table_names
    }

    # Training part: queries + plans + click labels.
    train_part = (
        tables["train_queries"]
        .merge(tables["train_plans"], on='sid', how='left')
        .merge(tables["train_clicks"], on='sid', how='left')
        .drop(columns=['click_time'])
    )
    train_part['click_mode'] = train_part['click_mode'].fillna(0)

    # Test part: queries + plans, labelled with the -1 sentinel.
    test_part = tables["test_queries"].merge(tables["test_plans"], on='sid', how='left')
    test_part['click_mode'] = -1

    stacked = pd.concat([train_part, test_part], axis=0)
    return stacked.drop(columns=['plan_time']).reset_index(drop=True)

## 2、抽取o、d的特征

### 将o、d分离，添加POI数据

In [16]:
def gen_od_feature(all_data):
    """Derive origin/destination features from the ``o`` and ``d`` columns.

    ``o`` and ``d`` hold two comma-separated numbers as text. The function
    adds, in this order:

    * ``o1``, ``o2``, ``d1``, ``d2``: first and second component as floats.
    * ``o_d``: integer label of the concatenated ``o`` + ``d`` string.
    * ``o_d_distance``: geodesic distance in metres between the two points.
      The second component is passed to ``geodesic`` first, i.e. the text is
      treated as "longitude,latitude".

    The frame is modified in place and also returned. ``o`` and ``d`` are
    kept here; ``gen_pid_feature`` drops them.
    """
    def _component(position):
        # Parser for one side of the "a,b" text.
        return lambda text: float(text.split(',')[position])

    for source, targets in (('o', ('o1', 'o2')), ('d', ('d1', 'd2'))):
        for position, target in enumerate(targets):
            all_data[target] = all_data[source].apply(_component(position))

    # Label-encode the origin/destination pair.
    pair_text = all_data['o'] + all_data['d']
    all_data['o_d'] = LabelEncoder().fit_transform(pair_text)

    # Distance between origin and destination, in metres.
    def _metres(row):
        return geodesic((row.o2, row.o1), (row.d2, row.d1)).m

    all_data['o_d_distance'] = all_data.apply(_metres, axis=1)

    return all_data

In [17]:
# 'ANSI' is only an alias of the Windows-only 'mbcs' codec, so the original
# call raises LookupError on Linux/macOS. The file content shown below is
# Simplified Chinese, so the ANSI code page was presumably cp936 (GBK).
# NOTE(review): confirm the file really is GBK-encoded.
POI_data = pd.read_csv("data_set_phase1//POIs.csv", encoding='gbk')
POI_data.head()

Unnamed: 0,lng_lat,addr,cp,direction,distance,name,parent_poi,poiType,point,tag,tel,uid,zip
0,"116.29,39.97",北京市海淀区蓝靛厂居住区世纪城3期春荫园6号楼,,北,102,中国建设银行(北京远大中路支行),"{'name': '', 'tag': '', 'addr': '', 'point': {...",金融,"{'x': 116.28983435050277, 'y': 39.96930388943203}",金融;银行,,d4034eb38a6c2ff6c0364441,
1,"116.29,39.97",蓝靛厂中路19号,,西南,203,蓝靛厂清真寺,"{'name': '', 'tag': '', 'addr': '', 'point': {...",旅游景点,"{'x': 116.29121774098813, 'y': 39.971045708291...",旅游景点;教堂,,5191bb9a6696551b1ce99987,
2,"116.29,39.97",北京市海淀区蓝靛厂春荫园小区5号楼,,东北,233,蓝靛厂春荫园,"{'name': '', 'tag': '', 'addr': '', 'point': {...",房地产,"{'x': 116.2883162141909, 'y': 39.969034318229845}",房地产;住宅区,,34f68ab038f0530a8e55ecac,
3,"116.29,39.97",北京市海淀区蓝靛厂翠叠园小区9号楼,,西北,250,蓝靛厂翠叠园,"{'name': '', 'tag': '', 'addr': '', 'point': {...",房地产,"{'x': 116.29195435150632, 'y': 39.96913799958826}",房地产;住宅区,,e93761f6bb9d9572223bf270,
4,"116.29,39.97",蓝晴路与蓝靛厂中路交叉口西北150米,,东南,161,曙光街道温馨家园,"{'name': '', 'tag': '', 'addr': '', 'point': {...",房地产,"{'x': 116.28907977387439, 'y': 39.970859086982...",房地产;住宅区,,7bcfc17f6fa9933e077ce8aa,


In [18]:
# How many POIs of each type are recorded for the grid cell "116.29,39.97".
POI_data.loc[POI_data['lng_lat'] == '116.29,39.97', 'poiType'].value_counts()

房地产     4
金融      2
购物      1
教育培训    1
汽车服务    1
旅游景点    1
Name: poiType, dtype: int64

## 3、抽取plans的特征

### 提取plans特征
### 1、max_distance、min_distance、mean_distance、std_distance
### 2、max_price、min_price、mean_price、std_price
### 3、max_eta、min_eta、mean_eta、std_eta
### 4、max_dis_mode、min_dis_mode、max_price_mode、min_price_mode、max_eta_mode、min_eta_mode
### 5、first_mode

In [19]:
def gen_plan_feature(all_data):
    """Turn the JSON ``plans`` column into numeric per-query features.

    Every entry of ``all_data['plans']`` is parsed with ``json.loads`` and is
    expected to be a list of dicts with the keys ``distance``, ``price``,
    ``eta`` and ``transport_mode``. An entry that cannot be parsed (for
    example a missing value) or that parses to an empty list is treated as
    "no plans recommended".

    Columns appended, in this order:

    * ``svd_mode_0`` .. ``svd_mode_9``: 10-component truncated SVD of the
      TF-IDF (unigrams and bigrams) of the recommended mode sequence.
    * ``mode_feas_0`` .. ``mode_feas_11``: 1.0 when that transport mode occurs
      among the plans; ``mode_feas_0`` is set when there are no plans.
    * ``max_/min_/mean_/std_`` of ``distance``, ``price`` and ``eta`` over
      the plans (``np.std`` default); an empty price string counts as 0.
    * ``max_dis_mode``, ``min_dis_mode``, ``max_price_mode``,
      ``min_price_mode``, ``max_eta_mode``, ``min_eta_mode``: transport mode
      of the plan at the respective extreme.
    * ``first_mode``: mode of the first listed plan (0 when no plans).

    All statistic and ``*_mode`` columns except ``first_mode`` are -1 for
    queries without plans.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain a ``plans`` column. Transport modes must be valid column
        positions of the 12-wide indicator matrix.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``all_data`` without ``plans`` plus the columns above.
        The new columns reuse ``all_data.index``, so rows stay aligned even
        when that index is not 0..n-1.
    """
    n = all_data.shape[0]

    # Output order of the per-query statistics; the row tuple built in the
    # loop below must follow exactly this order.
    stat_names = [
        'max_distance', 'min_distance', 'mean_distance', 'std_distance',
        'max_price', 'min_price', 'mean_price', 'std_price',
        'max_eta', 'min_eta', 'mean_eta', 'std_eta',
        'max_dis_mode', 'min_dis_mode', 'max_price_mode', 'min_price_mode',
        'max_eta_mode', 'min_eta_mode',
        'first_mode',
    ]
    first_mode_col = stat_names.index('first_mode')

    # Multi-hot indicator of the recommended modes (column 0 = no plans).
    mode_list_feas = np.zeros((n, 12))
    # One row of statistics per query.
    stats = np.zeros((n, len(stat_names)))
    # Recommended modes in display order, as "word_<mode>" tokens for TF-IDF.
    mode_texts = []

    for i, plan in tqdm(enumerate(all_data['plans'].values)):
        try:
            user_plan_list = json.loads(plan)
        except (TypeError, ValueError):
            # TypeError: non-string entry such as NaN; ValueError: malformed
            # JSON. Anything else (e.g. KeyboardInterrupt) must propagate.
            user_plan_list = []

        if len(user_plan_list) == 0:
            mode_list_feas[i, 0] = 1
            stats[i, :] = -1
            stats[i, first_mode_col] = 0
            mode_texts.append('word_null')
            continue

        # Pull the four fields out of every plan of this query.
        distance_list = np.array([int(p['distance']) for p in user_plan_list])
        price_list = np.array(
            [0 if p['price'] == '' else int(p['price']) for p in user_plan_list]
        )
        eta_list = np.array([int(p['eta']) for p in user_plan_list])
        mode_list = np.array(
            [int(p['transport_mode']) for p in user_plan_list], dtype='int'
        )

        mode_texts.append(' '.join('word_{}'.format(mode) for mode in mode_list))

        # Flag every mode that was recommended.
        mode_list_feas[i, mode_list] = 1

        # Sort orders; position 0 is the smallest value, -1 the largest.
        distance_sort_idx = np.argsort(distance_list)
        price_sort_idx = np.argsort(price_list)
        eta_sort_idx = np.argsort(eta_list)

        stats[i] = (
            distance_list[distance_sort_idx[-1]],
            distance_list[distance_sort_idx[0]],
            np.mean(distance_list),
            np.std(distance_list),
            price_list[price_sort_idx[-1]],
            price_list[price_sort_idx[0]],
            np.mean(price_list),
            np.std(price_list),
            eta_list[eta_sort_idx[-1]],
            eta_list[eta_sort_idx[0]],
            np.mean(eta_list),
            np.std(eta_list),
            mode_list[distance_sort_idx[-1]],
            mode_list[distance_sort_idx[0]],
            mode_list[price_sort_idx[-1]],
            mode_list[price_sort_idx[0]],
            mode_list[eta_sort_idx[-1]],
            mode_list[eta_sort_idx[0]],
            mode_list[0],
        )

    # Collect the features in a frame that shares the caller's index.
    row_index = all_data.index
    plan_feature_data = pd.DataFrame(
        mode_list_feas,
        index=row_index,
        columns=['mode_feas_{}'.format(k) for k in range(12)],
    )
    for col, name in enumerate(stat_names):
        plan_feature_data[name] = stats[:, col]

    # TF-IDF of the mode sequence, reduced to 10 dimensions.
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf.fit_transform(mode_texts)
    svd = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    mode_svd = pd.DataFrame(
        svd.fit_transform(tfidf_vec),
        index=row_index,
        columns=['svd_mode_{}'.format(k) for k in range(10)],
    )

    all_data = pd.concat([all_data, mode_svd, plan_feature_data], axis=1)
    all_data = all_data.drop(['plans'], axis=1)

    return all_data

## 4、抽取profiles数据集特征

In [20]:
def gen_profiles_feature(all_data):
    """Attach the user profile columns from ``profiles.csv`` to every query.

    Queries with a missing ``pid`` get ``pid`` -1 and are matched against an
    extra all-zero placeholder profile whose ``pid`` is -1.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain a ``pid`` column. Its missing values are replaced by -1
        in place before the merge.

    Returns
    -------
    pandas.DataFrame
        ``all_data`` left-joined with the profile table on ``pid``.

    Notes
    -----
    A TruncatedSVD reduction of the profile columns was sketched here
    earlier and is disabled; the raw profile columns are merged as they are.
    """
    profiles = pd.read_csv("data_set_phase1//profiles.csv")

    # Placeholder profile for queries without a pid. Its width follows the
    # file instead of a fixed column count, and pid is set by name rather
    # than by position. The float row keeps the merged columns float, as
    # the former DataFrame.append call did.
    profiles_na = pd.DataFrame(
        np.zeros((1, profiles.shape[1])), columns=profiles.columns
    )
    profiles_na['pid'] = -1.0
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    profiles = pd.concat([profiles, profiles_na], axis=0)

    # Merge the profile columns onto the queries.
    all_data['pid'] = all_data['pid'].fillna(-1)
    all_data = all_data.merge(profiles, on='pid', how='left')
    return all_data

## 5、抽取时间特征（req_time）

### 距离国庆节的天数、月份、一年中第几天、周几、小时、小时cat、是否是假期、是否是周末

In [21]:
def gen_time_feature(all_data):
    """Derive calendar features from the ``req_time`` column.

    ``req_time`` is converted to datetime in place and these columns are
    added to ``all_data``:

    * ``NatioinalDay``: signed number of days from 2018-10-01 to the request
      date (the misspelled name is kept because it is the column name).
    * ``isholiday``: 1 when the month-day lies in 10-01 .. 10-07, else 0.
    * ``monthofyear``, ``dayofyear``, ``dayofweek`` (Monday = 0), ``hour``.
    * ``isweekend``: 1 when ``dayofweek`` is greater than 4, else 0.
    * ``cat_hour``: 0 for hour <= 6, 1 for <= 12, 2 for <= 18, otherwise 3.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain ``req_time`` as datetime values or parseable text.

    Returns
    -------
    pandas.DataFrame
        The frame with the columns above; the temporary ``req_date`` column
        is not part of the result.
    """
    req_time = pd.to_datetime(all_data['req_time'])

    # Days relative to National Day, computed on calendar dates only.
    # Vectorised, so it does not depend on a 0..n-1 index.
    national_day = pd.Timestamp(2018, 10, 1)
    all_data['NatioinalDay'] = (req_time.dt.normalize() - national_day).dt.days

    all_data['req_time'] = req_time

    # Month-day text used only for the holiday lookup.
    all_data['req_date'] = all_data['req_time'].dt.strftime("%m-%d")

    # Holiday flag.
    golden_week = ['10-01', '10-02', '10-03', '10-04', '10-05', '10-06', '10-07']
    all_data['isholiday'] = all_data['req_date'].isin(golden_week).astype(int)

    all_data['monthofyear'] = all_data['req_time'].dt.month
    all_data['dayofyear'] = all_data['req_time'].dt.dayofyear
    all_data['dayofweek'] = all_data['req_time'].dt.dayofweek

    # Weekend flag, assigned as a whole column. The former chained
    # assignment raised SettingWithCopyWarning and has no effect under
    # pandas copy-on-write.
    all_data['isweekend'] = (all_data['dayofweek'] > 4).astype('int64')

    all_data['hour'] = all_data['req_time'].dt.hour

    # Coarse part-of-day bucket.
    all_data['cat_hour'] = all_data.hour.apply(
        lambda x: 0 if x <= 6 else 1 if x <= 12 else 2 if x <= 18 else 3
    )

    all_data = all_data.drop(['req_date'], axis=1)

    return all_data

## 6、提取pid特征

### 根据lgbm的特征重要度发现，pid是个强特，所以对pid进行特征提取
### 1、统计每个pid出现的次数，将次数作为特征
### 2、统计每个pid在每个类别中出现的次数（这个有问题）
### 3、pid与时间特征的组合出现的次数
### 4、pid与时间特征的组合在每个类别中出现的次数（这个有问题）

In [30]:
def _merge_pid_pair_counts(all_data, column, count_name):
    """Left-join, as ``count_name``, how many rows share each (pid, column) pair."""
    pair_counts = all_data.groupby(['pid', column]).size().reset_index(name=count_name)
    return all_data.merge(pair_counts, on=['pid', column], how='left')


def _pid_counts_where(all_data, mask):
    """For every row, count the rows of the same pid among those selected by ``mask``."""
    pid = all_data['pid']
    return pid.map(pid[mask].value_counts())


def gen_pid_feature(all_data):
    """Add frequency features describing how often each ``pid`` occurs.

    Columns appended, in this order:

    * ``pid_o_counts``, ``pid_d_counts``, ``pid_od_counts``: rows sharing the
      same (pid, o), (pid, d) and (pid, o_d) pair.
    * ``pid_F0_counts`` .. ``pid_F11_counts``: rows of the same pid whose
      ``first_mode`` equals 0 .. 11.
    * ``pid_counts``: rows of the same pid.
    * ``pid_H0_counts``, ``pid_H1_counts``: rows of the same pid with
      ``isholiday`` equal to 0 / 1.
    * ``pid_W0_counts``, ``pid_W1_counts``: rows of the same pid with
      ``isweekend`` equal to 0 / 1.

    Missing counts are filled with 0 for every column except
    ``pid_counts``. The counts are taken over all rows of ``all_data``.

    The pair counts are computed in memory. The earlier implementation
    wrote them to ``tidy//pidCombine*.csv`` and read them back only to
    flatten the index; those files are no longer produced.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain ``pid``, ``o``, ``d``, ``o_d``, ``first_mode``,
        ``isholiday`` and ``isweekend``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the count columns added and ``o`` / ``d`` dropped.
    """
    pair_specs = (
        ('o', 'pid_o_counts'),
        ('d', 'pid_d_counts'),
        ('o_d', 'pid_od_counts'),
    )

    # pid combined with origin, destination and the encoded pair.
    for column, count_name in pair_specs:
        all_data = _merge_pid_pair_counts(all_data, column, count_name)

    # pid combined with the first recommended mode.
    for mode in range(12):
        selected = all_data['first_mode'] == mode
        all_data['pid_F{}_counts'.format(mode)] = (
            _pid_counts_where(all_data, selected).fillna(0)
        )

    # Overall number of rows per pid.
    all_data['pid_counts'] = all_data['pid'].map(all_data['pid'].value_counts())

    # pid combined with the holiday and weekend flags.
    for flag_column, tag in (('isholiday', 'H'), ('isweekend', 'W')):
        for flag_value in (0, 1):
            selected = all_data[flag_column] == flag_value
            all_data['pid_{}{}_counts'.format(tag, flag_value)] = (
                _pid_counts_where(all_data, selected).fillna(0)
            )

    # Pairs that could not be counted (missing pid or value) become 0.
    for _, count_name in pair_specs:
        all_data[count_name] = all_data[count_name].fillna(0)

    all_data = all_data.drop(['o', 'd'], axis=1)

    return all_data

## 7、切分数据集

In [26]:
def train_test_split(all_data):
    """Separate the combined frame into training and test pieces.

    Rows whose ``click_mode`` equals -1 are test rows; all others are
    training rows.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, submit)`` where

        * ``train_x``: training rows without ``sid``, ``pid`` and
          ``click_mode`` (``req_time`` is kept),
        * ``train_y``: the ``req_time`` and ``click_mode`` columns of the
          training rows,
        * ``test_x``: test rows without ``click_mode``, ``sid``, ``req_time``
          and ``pid``,
        * ``submit``: an independent copy of the test rows' ``sid`` column.
    """
    is_test = all_data['click_mode'] == -1

    # Test side.
    test_part = all_data[is_test].drop(columns=['click_mode'])
    submit = test_part[['sid']].copy()
    test_x = test_part.drop(columns=['sid', 'req_time', 'pid'])

    # Training side.
    train_part = all_data[~is_test].drop(columns=['sid', 'pid'])
    train_y = train_part[['req_time', 'click_mode']]
    train_x = train_part.drop(columns=['click_mode'])

    return train_x, train_y, test_x, submit

In [31]:
# Build the feature table stage by stage, then split it into model inputs.
all_data = merge_data()
for stage in (gen_od_feature, gen_plan_feature, gen_profiles_feature,
              gen_time_feature, gen_pid_feature):
    all_data = stage(all_data)
train_x, train_y, test_x, submit = train_test_split(all_data)

594358it [01:39, 5970.64it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
# Training rows requested on or after 2018-10-08, stacked with the test rows;
# the cell only displays the resulting shape.
t_x = train_x.loc[train_x['req_time'] >= '2018-10-08']
t_y = train_y.loc[train_y['req_time'] >= '2018-10-08']
x = pd.concat([t_x, test_x], axis=0)
x.shape

# 8、国庆嫁接

In [None]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from time import gmtime, strftime
from sklearn.model_selection import StratifiedKFold

# Model evaluation: weighted F1 score, usable as a LightGBM custom eval metric.
def f1_weighted(y_true, y_pred):
    """Weighted F1 of the arg-max class for multiclass score arrays.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels; the arg-max column position is compared against
        them, so labels are expected to equal the class positions.
    y_pred : array-like
        Per-class scores, either 2-D with shape (n_samples, n_classes) or
        flattened 1-D in class-major order (all samples of class 0, then
        class 1, ...). Which form LightGBM passes depends on its version.

    Returns
    -------
    tuple
        ``('weighted-f1-score', score, True)``; the trailing ``True`` tells
        LightGBM that a higher value is better.
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 1:
        # Class-major flat layout -> (n_samples, n_classes). The class count
        # is derived from the sample count instead of being fixed at 12.
        y_pred = y_pred.reshape(-1, len(y_true)).T
    y_pred = np.argmax(y_pred, axis=1)
    score = f1_score(y_true, y_pred, average='weighted')
    return 'weighted-f1-score', score, True

# Train on the National Day holiday window: requests before 2018-10-08.
tr_x = train_x[train_x.req_time < '2018-10-08']
tr_y = train_y[train_y.req_time < '2018-10-08']
tr_x = tr_x.drop(['req_time'], axis=1)
tr_y = tr_y.drop(['req_time'], axis=1)

# Predict for everything else: the remaining training rows plus the test rows.
te_x = train_x[train_x.req_time >= '2018-10-08']
te_x = te_x.drop(['req_time'], axis=1)
te_x = pd.concat([te_x, test_x], axis=0)

# 'pid' is deliberately absent: train_test_split drops it from both train_x
# and test_x, and naming a column that does not exist makes the fit fail.
categorical_feature = ['max_dis_mode', 'min_dis_mode', 'max_price_mode', 'min_price_mode',
                       'max_eta_mode', 'min_eta_mode', 'first_mode']

# NOTE(review): max_depth=1 limits every tree to a single split, so
# num_leaves=61 has no effect. Confirm this is intended.
lgb = LGBMClassifier(boosting_type='gbdt', num_leaves=61, objective='multiclass', reg_alpha=0, reg_lambda=0.01, max_depth=1,
                    n_estimators=2000, subsample=0.8, colsample_bytree=0.8, subsample_freq=1, min_child_samples=50,
                    learning_rate=0.05, random_state=2019, metric='multiclass', n_jobs=-1)
# tr_y is a one-column frame; hand the labels over as a 1-D array.
lgb.fit(tr_x, tr_y.values.ravel(), categorical_feature=categorical_feature)

y_hat = lgb.predict(te_x)
y_pred = lgb.predict_proba(te_x)

In [None]:
# Show the class labels that lgb.predict(te_x) returned in the previous cell.
y_hat

In [None]:
# Show the per-class probabilities that lgb.predict_proba(te_x) returned above.
y_pred

# 9、模型训练&验证&提交

In [32]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from time import gmtime, strftime
from sklearn.model_selection import StratifiedKFold

# Model evaluation: weighted F1 score, usable as a LightGBM custom eval metric.
def f1_weighted(y_true, y_pred):
    """Weighted F1 of the arg-max class for multiclass score arrays.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels; the arg-max column position is compared against
        them, so labels are expected to equal the class positions.
    y_pred : array-like
        Per-class scores, either 2-D with shape (n_samples, n_classes) or
        flattened 1-D in class-major order (all samples of class 0, then
        class 1, ...). Which form LightGBM passes depends on its version.

    Returns
    -------
    tuple
        ``('weighted-f1-score', score, True)``; the trailing ``True`` tells
        LightGBM that a higher value is better.
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 1:
        # Class-major flat layout -> (n_samples, n_classes). The class count
        # is derived from the sample count instead of being fixed at 12.
        y_pred = y_pred.reshape(-1, len(y_true)).T
    y_pred = np.argmax(y_pred, axis=1)
    score = f1_score(y_true, y_pred, average='weighted')
    return 'weighted-f1-score', score, True

# Split the training rows by request time: rows before 2018-11-24 are used
# for fitting, rows on or after that date for validation (described by the
# original author as the last seven days). req_time itself is not a feature.
tra_x = train_x.loc[train_x['req_time'] < '2018-11-24'].drop(columns=['req_time'])
tra_y = train_y.loc[train_y['req_time'] < '2018-11-24'].drop(columns=['req_time'])
valid_x = train_x.loc[train_x['req_time'] >= '2018-11-24'].drop(columns=['req_time'])
valid_y = train_y.loc[train_y['req_time'] >= '2018-11-24'].drop(columns=['req_time'])

# Columns handed to LightGBM as categorical features.
categorical_feature = [
    'max_dis_mode', 'min_dis_mode',
    'max_price_mode', 'min_price_mode',
    'max_eta_mode', 'min_eta_mode',
    'first_mode',
    'pid_o_counts', 'pid_d_counts',
    'o_d',
    'pid_od_counts',
]

lgb = LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    metric='multiclass',
    num_leaves=61,
    max_depth=1,
    n_estimators=2000,
    learning_rate=0.05,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    min_child_samples=50,
    reg_alpha=0,
    reg_lambda=0.01,
    random_state=2019,
    n_jobs=-1,
)
eval_set = [(valid_x, valid_y)]
# Log every 10 rounds; stop once the validation scores have not improved
# for 100 rounds.
lgb.fit(
    tra_x,
    tra_y,
    eval_set=eval_set,
    eval_metric=f1_weighted,
    categorical_feature=categorical_feature,
    verbose=10,
    early_stopping_rounds=100,
)

  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 100 rounds.
[10]	valid_0's multi_logloss: 1.55656	valid_0's weighted-f1-score: 0.601874
[20]	valid_0's multi_logloss: 1.33319	valid_0's weighted-f1-score: 0.635709
[30]	valid_0's multi_logloss: 1.20043	valid_0's weighted-f1-score: 0.665018
[40]	valid_0's multi_logloss: 1.11322	valid_0's weighted-f1-score: 0.66732
[50]	valid_0's multi_logloss: 1.05558	valid_0's weighted-f1-score: 0.669266
[60]	valid_0's multi_logloss: 1.01372	valid_0's weighted-f1-score: 0.670729
[70]	valid_0's multi_logloss: 0.983093	valid_0's weighted-f1-score: 0.672913
[80]	valid_0's multi_logloss: 0.9601	valid_0's weighted-f1-score: 0.673723
[90]	valid_0's multi_logloss: 0.942849	valid_0's weighted-f1-score: 0.673751
[100]	valid_0's multi_logloss: 0.929783	valid_0's weighted-f1-score: 0.674969
[110]	valid_0's multi_logloss: 0.919789	valid_0's weighted-f1-score: 0.676036
[120]	valid_0's multi_logloss: 0.911683	valid_0's weighted-f1-score: 0.67621
[130]	valid_0's mult

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        importance_type='split', learning_rate=0.05, max_depth=1,
        metric='multiclass', min_child_samples=50, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=61,
        objective='multiclass', random_state=2019, reg_alpha=0,
        reg_lambda=0.01, silent=True, subsample=0.8,
        subsample_for_bin=200000, subsample_freq=1)

## 特征重要度

In [33]:
# Rank the features by the fitted model's importance scores, largest first.
imp = pd.DataFrame({'feature': tra_x.columns, 'imp': lgb.feature_importances_})
imp = imp.sort_values('imp', ascending=False)
imp

Unnamed: 0,feature,imp
4,o_d,2610
121,pid_o_counts,1403
122,pid_d_counts,1268
46,first_mode,579
18,mode_feas_2,225
17,mode_feas_1,224
25,mode_feas_9,221
26,mode_feas_10,219
23,mode_feas_7,213
27,mode_feas_11,191


In [34]:
# Weighted F1 of the hard class predictions on the validation rows.
pre = lgb.predict(valid_x)
f1_score(y_true=valid_y, y_pred=pre, average='weighted')

0.6861672939883036

## 提交结果

In [35]:
# Refit on every training row, using the iteration count chosen by early
# stopping in the validation run above.
x = train_x.drop(['req_time'], axis=1)
y = train_y.drop(['req_time'], axis=1)

lgb.n_estimators = lgb.best_iteration_
lgb.fit(x, y, categorical_feature=categorical_feature)

# click_mode is typically float64 here, because the left merge in merge_data
# leaves NaN before fillna(0). The predicted labels inherit that dtype, so
# cast them back to integer mode ids to avoid writing "1.0" instead of "1".
pred_test = lgb.predict(test_x).astype(int)

# Write the submission file, stamped with the current UTC time.
now_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
submit['recommend_mode'] = pred_test
submit.to_csv('submission_{}.csv'.format(now_time), index=False)