In [1]:
import json
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# 一、特征工程

## 1、合并训练集和测试集

In [2]:
def merge_data():
    # 标签
    train_clicks = pd.read_csv("data_set_phase1//train_clicks.csv")

    # 特征数据集
    train_plans = pd.read_csv("data_set_phase1//train_plans.csv")
    train_queries = pd.read_csv("data_set_phase1//train_queries.csv")
    test_plans = pd.read_csv("data_set_phase1//test_plans.csv")
    test_queries = pd.read_csv("data_set_phase1//test_queries.csv")

    # merge训练集
    tra_data = train_queries.merge(train_plans, on='sid', how='left')
    tra_data = tra_data.merge(train_clicks, on='sid', how='left')
    tra_data = tra_data.drop(['click_time'], axis=1)
    tra_data['click_mode'] = tra_data['click_mode'].fillna(0)

    # merge测试集
    tes_data = test_queries.merge(test_plans, on='sid', how='left')
    tes_data['click_mode'] = -1

    # concat训练集和测试集
    all_data = pd.concat([tra_data, tes_data], axis=0)
    all_data = all_data.drop(['plan_time'], axis=1)
    all_data = all_data.reset_index(drop=True)
    
    return all_data

## 2、抽取o、d的特征

In [3]:
def gen_od_feature(all_data):
    all_data['o1'] = all_data['o'].apply(lambda x : float(x.split(',')[0]))
    all_data['o2'] = all_data['o'].apply(lambda x : float(x.split(',')[1]))
    all_data['d1'] = all_data['d'].apply(lambda x : float(x.split(',')[0]))
    all_data['d2'] = all_data['d'].apply(lambda x : float(x.split(',')[1]))
    all_data = all_data.drop(['o', 'd'], axis=1)
    return all_data

## 3、抽取plans的特征

### 提取plans特征
### 1、max_distance、min_distance、mean_distance、std_distance
### 2、max_price、min_price、mean_price、std_price
### 3、max_eta、min_eta、mean_eta、std_eta
### 4、max_dis_mode、min_dis_mode、max_price_mode、min_price_mode、max_eta_mode、min_eta_mode
### 5、first_mode

In [4]:
def gen_plan_feature(all_data):
    n = all_data.shape[0]
    
    # 初始化推荐给用户的plans，类似于one-hot编码，推荐了哪一个mode，就置为1
    mode_list_feas = np.zeros((n, 12))

    # 初始化最大距离、最小距离、平均距离、距离标准差
    max_distance, min_distance, mean_distance, std_distance = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化最大价格、最小价格、平均价格、价格标准差
    max_price, min_price, mean_price, std_price = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化最大用时、最小用时、平均用时、用时标准差
    max_eta, min_eta, mean_eta, std_eta = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化最大距离mode、最小距离mode、最大价格mode、最小价格mode、最大用时mode、最小用时mode、第一推荐mode
    max_dis_mode, min_dis_mode, max_price_mode, min_price_mode, max_eta_mode, min_eta_mode, first_mode = np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,)),np.zeros((n,))

    # 初始化推荐mode的顺序
    mode_texts=[]
    
    # 遍历每个用户的plan
    for i, plan in tqdm(enumerate(all_data['plans'].values)):
        try:
            user_plan_list = json.loads(plan)
        except:
            user_plan_list = []
        if len(user_plan_list)==0:
            mode_list_feas[i, 0] = 1

            first_mode[i] = 0

            max_distance[i] = -1
            min_distance[i] = -1
            mean_distance[i] = -1
            std_distance[i] = -1

            max_price[i] = -1
            min_price[i] = -1
            mean_price[i] = -1
            std_price[i] = -1

            max_eta[i] = -1
            min_eta[i] = -1
            mean_eta[i] = -1
            std_eta[i] = -1

            max_dis_mode[i] = -1
            min_dis_mode[i] = -1
            max_price_mode[i] = -1
            min_price_mode[i] = -1
            max_eta_mode[i] = -1
            min_eta_mode[i] = -1

            mode_texts.append('word_null')
        else:
            distance_list = []
            price_list = []
            eta_list = []
            mode_list = []

            # 抽取每个用户的每个plan
            for tmp_dict in user_plan_list:
                distance_list.append(int(tmp_dict['distance']))
                if tmp_dict['price']=='':
                    price_list.append(0)
                else:
                    price_list.append(int(tmp_dict['price']))
                eta_list.append(int(tmp_dict['eta']))
                mode_list.append(int(tmp_dict['transport_mode']))

            # 将每个用户的推荐模型按顺序添加
            mode_texts.append(' '.join(['word_{}'.format(mode) for mode in mode_list]))

            # 将list转换成ndarray
            distance_list = np.array(distance_list)
            price_list = np.array(price_list)
            eta_list = np.array(eta_list)
            mode_list = np.array(mode_list, dtype='int')
            
            # 将有plans推荐的用户的mode置为1
            mode_list_feas[i, mode_list] = 1

            # 获取索引
            distance_sort_idx = np.argsort(distance_list)
            price_sort_idx = np.argsort(price_list)
            eta_sort_idx = np.argsort(eta_list)

            # 构建特征
            max_distance[i] = distance_list[distance_sort_idx[-1]]
            min_distance[i] = distance_list[distance_sort_idx[0]]
            mean_distance[i] = np.mean(distance_list)
            std_distance[i] = np.std(distance_list)

            max_price[i] = price_list[price_sort_idx[-1]]
            min_price[i] = price_list[price_sort_idx[0]]
            mean_price[i] = np.mean(price_list)
            std_price[i] = np.std(price_list)

            max_eta[i] = eta_list[eta_sort_idx[-1]]
            min_eta[i] = eta_list[eta_sort_idx[0]]
            mean_eta[i] = np.mean(eta_list)
            std_eta[i] = np.std(eta_list)

            first_mode[i] = mode_list[0]

            max_dis_mode[i] = mode_list[distance_sort_idx[-1]]
            min_dis_mode[i] = mode_list[distance_sort_idx[0]]

            max_price_mode[i] = mode_list[price_sort_idx[-1]]
            min_price_mode[i] = mode_list[price_sort_idx[0]]

            max_eta_mode[i] = mode_list[eta_sort_idx[-1]]
            min_eta_mode[i] = mode_list[eta_sort_idx[0]]

    # 将特征存储进DataFrame中
    plan_feature_data = pd.DataFrame(mode_list_feas)
    plan_feature_data.columns = ['mode_feas_{}'.format(i) for i in range(12)]

    plan_feature_data['max_distance'] = max_distance
    plan_feature_data['min_distance'] = min_distance
    plan_feature_data['mean_distance'] = mean_distance
    plan_feature_data['std_distance'] = std_distance

    plan_feature_data['max_price'] = max_price
    plan_feature_data['min_price'] = min_price
    plan_feature_data['mean_price'] = mean_price
    plan_feature_data['std_price'] = std_price

    plan_feature_data['max_eta'] = max_eta
    plan_feature_data['min_eta'] = min_eta
    plan_feature_data['mean_eta'] = mean_eta
    plan_feature_data['std_eta'] = std_eta

    plan_feature_data['max_dis_mode'] = max_dis_mode
    plan_feature_data['min_dis_mode'] = min_dis_mode
    plan_feature_data['max_price_mode'] = max_price_mode
    plan_feature_data['min_price_mode'] = min_price_mode
    plan_feature_data['max_eta_mode'] = max_eta_mode
    plan_feature_data['min_eta_mode'] = min_eta_mode

    plan_feature_data['first_mode'] = first_mode

    # tiidf提取特征
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf.fit_transform(mode_texts)
    svd = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    mode_svd = svd.fit_transform(tfidf_vec)
    
    # 转换成dataframe
    mode_svd = pd.DataFrame(mode_svd)
    mode_svd.columns = ['svd_mode_{}'.format(i) for i in range(10)]

    all_data = pd.concat([all_data, mode_svd, plan_feature_data], axis=1)
    all_data = all_data.drop(['plans'], axis=1)
    
    return  all_data

# 4、抽取profiles数据集特征

In [5]:
def gen_profiles_feature(all_data):
    profiles = pd.read_csv("data_set_phase1//profiles.csv")

    # 用于填充没有pid的用户
    profiles_na = np.zeros(67)
    profiles_na[0] = -1
    profiles_na = pd.DataFrame(profiles_na.reshape(1, -1))
    profiles_na.columns = profiles.columns
    profiles = profiles.append(profiles_na)
    
    # 对特征进行奇异值分解，实现降维
#     pi = profiles.drop(['pid'], axis=1).values
#     svd = TruncatedSVD(n_components=60, n_iter=20, random_state=2019)
#     profiles_svd = svd.fit_transform(pi)
    
    # 转换成dataframe
#     profiles_svd = pd.DataFrame(profiles_svd)
#     profiles_svd.columns = ['svd_profiles_{}'.format(i) for i in range(60)]
#     profiles_svd['pid'] = profiles['pid'].values

    # 合并数据集
    all_data['pid'] = all_data['pid'].fillna(-1)
    all_data = all_data.merge(profiles, on='pid', how='left')
    return all_data

# 5、抽取时间特征（req_time）

### 距离国庆节的天数、查询的日期、周几、小时、分钟、是否是假期、是否是周末

In [6]:
def gen_time_feature(all_data):
    
    # 国庆嫁接
    NatioinalDay = []
    d1 = datetime.datetime(2018,10,1)
    for i in range(all_data.shape[0]):
        d2 = datetime.datetime.strptime(all_data.req_time[i].split(' ')[0], "%Y-%m-%d")
        s = d2 - d1
        NatioinalDay.append(s.days)
    all_data['NatioinalDay'] = NatioinalDay
    
    all_data['req_time'] = pd.to_datetime(all_data['req_time'])
    
    # 查询日期
    all_data['req_date'] = all_data['req_time'].dt.strftime("%m-%d")
    
    # 是否是假期
    all_data['isholiday'] = all_data['req_date'].isin(['10-01','10-02','10-03','10-04','10-05','10-06','10-07',]).astype(int)
    
    # 周几
    all_data['dayofweek'] = all_data['req_time'].dt.dayofweek
    
    # 小时
    all_data['req_hour'] = all_data['req_time'].dt.hour
    
    # 分钟
    all_data['req_minute'] = all_data['req_time'].dt.minute
    
    # 是否是周末
    all_data['isweekend']=0
    all_data.isweekend[all_data['dayofweek']>4]=1
    
    all_data = all_data.drop(['req_date'], axis=1)
    
    return all_data

# 6、提取pid特征

### 根据lgbm的特征重要度发现，pid是个强特，所以对pid进行特征提取
### 1、统计每个pid出现的次数，将次数作为特征
### 2、统计每个pid在每个类别中出现的次数（这个有问题）
### 3、pid与时间特征的组合出现的次数
### 4、pid与时间特征的组合在每个类别中出现的次数（这个有问题）

In [7]:
def gen_pid_feature(all_data):
    
    # 统计每个pid出现的次数
    pid_counts = pd.DataFrame()
    counts = all_data['pid'].value_counts()
    index = counts.index
    pid_counts['pid'] = index
    pid_counts['pid_counts'] = list(counts)
    
    # pid与first_mode组合出现次数
    pidCombineFirstM0 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==0].value_counts()
    index = counts.index
    pidCombineFirstM0['pid'] = index
    pidCombineFirstM0['pid_F0_counts'] = list(counts)
    
    pidCombineFirstM1 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==1].value_counts()
    index = counts.index
    pidCombineFirstM1['pid'] = index
    pidCombineFirstM1['pid_F1_counts'] = list(counts)
    
    pidCombineFirstM2 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==2].value_counts()
    index = counts.index
    pidCombineFirstM2['pid'] = index
    pidCombineFirstM2['pid_F2_counts'] = list(counts)
    
    pidCombineFirstM3 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==3].value_counts()
    index = counts.index
    pidCombineFirstM3['pid'] = index
    pidCombineFirstM3['pid_F3_counts'] = list(counts)
    
    pidCombineFirstM4 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==4].value_counts()
    index = counts.index
    pidCombineFirstM4['pid'] = index
    pidCombineFirstM4['pid_F4_counts'] = list(counts)
    
    pidCombineFirstM5 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==5].value_counts()
    index = counts.index
    pidCombineFirstM5['pid'] = index
    pidCombineFirstM5['pid_F5_counts'] = list(counts)
    
    pidCombineFirstM6 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==6].value_counts()
    index = counts.index
    pidCombineFirstM6['pid'] = index
    pidCombineFirstM6['pid_F6_counts'] = list(counts)
    
    pidCombineFirstM7 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==7].value_counts()
    index = counts.index
    pidCombineFirstM7['pid'] = index
    pidCombineFirstM7['pid_F7_counts'] = list(counts)
    
    pidCombineFirstM8 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==8].value_counts()
    index = counts.index
    pidCombineFirstM8['pid'] = index
    pidCombineFirstM8['pid_F8_counts'] = list(counts)
    
    pidCombineFirstM9 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==9].value_counts()
    index = counts.index
    pidCombineFirstM9['pid'] = index
    pidCombineFirstM9['pid_F9_counts'] = list(counts)
    
    pidCombineFirstM10 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==10].value_counts()
    index = counts.index
    pidCombineFirstM10['pid'] = index
    pidCombineFirstM10['pid_F10_counts'] = list(counts)
    
    pidCombineFirstM11 = pd.DataFrame()
    counts = all_data.pid[all_data['first_mode']==11].value_counts()
    index = counts.index
    pidCombineFirstM11['pid'] = index
    pidCombineFirstM11['pid_F11_counts'] = list(counts)
    
    # pid与时间特征的组合出现次数，按是否是节假日进行组合
    pidCombineHoliday0 = pd.DataFrame()
    counts_0 = all_data.pid[all_data['isholiday']==0].value_counts()
    index_0 = counts_0.index
    pidCombineHoliday0['pid'] = index_0
    pidCombineHoliday0['pid_H0_counts'] = list(counts_0)
    
    pidCombineHoliday1 = pd.DataFrame()
    counts_1 = all_data.pid[all_data['isholiday']==1].value_counts()
    index_1 = counts_1.index
    pidCombineHoliday1['pid'] = index_1
    pidCombineHoliday1['pid_H1_counts'] = list(counts_1)
    
    # pid与时间特征的组合出现次数，按是否是周末进行组合
    pidCombineWeekend0 = pd.DataFrame()
    counts_0 = all_data.pid[all_data['isweekend']==0].value_counts()
    index_0 = counts_0.index
    pidCombineWeekend0['pid'] = index_0
    pidCombineWeekend0['pid_W0_counts'] = list(counts_0)
    
    pidCombineWeekend1 = pd.DataFrame()
    counts_1 = all_data.pid[all_data['isweekend']==1].value_counts()
    index_1 = counts_1.index
    pidCombineWeekend1['pid'] = index_1
    pidCombineWeekend1['pid_W1_counts'] = list(counts_1)
    
    #合并dataframe
    all_data = all_data.merge(pidCombineFirstM0, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM1, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM2, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM3, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM4, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM5, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM6, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM7, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM8, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM9, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM10, on='pid', how='left')
    all_data = all_data.merge(pidCombineFirstM11, on='pid', how='left')
    all_data = all_data.merge(pid_counts, on='pid', how='left')
    all_data = all_data.merge(pidCombineHoliday0, on='pid', how='left')
    all_data = all_data.merge(pidCombineHoliday1, on='pid', how='left')
    all_data = all_data.merge(pidCombineWeekend0, on='pid', how='left')
    all_data = all_data.merge(pidCombineWeekend1, on='pid', how='left')
    
    # 填充缺失值
    all_data['pid_F0_counts'] = all_data['pid_F0_counts'].fillna(0)
    all_data['pid_F1_counts'] = all_data['pid_F1_counts'].fillna(0)
    all_data['pid_F2_counts'] = all_data['pid_F2_counts'].fillna(0)
    all_data['pid_F3_counts'] = all_data['pid_F3_counts'].fillna(0)
    all_data['pid_F4_counts'] = all_data['pid_F4_counts'].fillna(0)
    all_data['pid_F5_counts'] = all_data['pid_F5_counts'].fillna(0)
    all_data['pid_F6_counts'] = all_data['pid_F6_counts'].fillna(0)
    all_data['pid_F7_counts'] = all_data['pid_F7_counts'].fillna(0)
    all_data['pid_F8_counts'] = all_data['pid_F8_counts'].fillna(0)
    all_data['pid_F9_counts'] = all_data['pid_F9_counts'].fillna(0)
    all_data['pid_F10_counts'] = all_data['pid_F10_counts'].fillna(0)
    all_data['pid_F11_counts'] = all_data['pid_F11_counts'].fillna(0)
    all_data['pid_H0_counts'] = all_data['pid_H0_counts'].fillna(0)
    all_data['pid_H1_counts'] = all_data['pid_H1_counts'].fillna(0)
    all_data['pid_W0_counts'] = all_data['pid_W0_counts'].fillna(0)
    all_data['pid_W1_counts'] = all_data['pid_W1_counts'].fillna(0)
    
    return all_data

# 7、切分数据集

In [8]:
def train_test_split(all_data):
    train_data = all_data[all_data['click_mode']!=-1]
    test_data = all_data[all_data['click_mode']==-1]
    test_data = test_data.drop(['click_mode'], axis=1)
    submit = test_data[['sid']].copy()
    
    train_data = train_data.drop(['sid', 'req_time'], axis=1)
    train_y = train_data['click_mode']
    train_x = train_data.drop(['click_mode'], axis=1)
    test_x = test_data.drop(['sid', 'req_time'], axis=1)
    
    return train_x, train_y, test_x, submit

In [9]:
all_data = merge_data()
all_data = gen_od_feature(all_data)
all_data = gen_plan_feature(all_data)
all_data = gen_profiles_feature(all_data)
all_data = gen_time_feature(all_data)
all_data = gen_pid_feature(all_data)
train_x, train_y, test_x, submit = train_test_split(all_data)

594358it [01:31, 6484.03it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# 8、模型训练&验证&提交

In [10]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from time import gmtime, strftime
from sklearn.model_selection import StratifiedKFold

def f1_weighted(y_pred, train_data):
    y_true = train_data.label
    y_pred = y_pred.reshape(12, -1).T
    y_pred = np.argmax(y_pred, axis=1)
    f1 = f1_score(y_true, y_pred, average='weighted')
    return 'weighted-f1-score', f1, True

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)

lgb_paras = {
    'objective': 'multiclass',
    'metrics': 'multiclass',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'lambda_l1': 0.01,
    'lambda_l2': 10,
    'num_class': 12,
    'seed': 2019,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4
}

categorical_feature = ['pid', 'max_dis_mode', 'min_dis_mode', 'max_price_mode', 'min_price_mode',
                       'max_eta_mode', 'min_eta_mode', 'first_mode']
scores = []
result_proba = []
for tra_idx, val_idx in kfold.split(train_x, train_y):
    tra_x, tra_y, val_x, val_y = train_x.iloc[tra_idx], train_y[tra_idx], train_x.iloc[val_idx], train_y[val_idx]
    train_set = lgb.Dataset(tra_x, tra_y, categorical_feature=categorical_feature)
    val_set = lgb.Dataset(val_x, val_y, categorical_feature=categorical_feature)
    lgb_model = lgb.train(lgb_paras, train_set, valid_sets=[val_set], early_stopping_rounds=50, num_boost_round=40000, verbose_eval=50, feval=f1_weighted)
    val_pred = np.argmax(lgb_model.predict(val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = f1_score(val_y, val_pred, average='weighted')
    result_proba.append(lgb_model.predict(test_x, num_iteration=lgb_model.best_iteration))
    scores.append(val_score)
print('cv f1_score:', np.mean(scores))
pred_test = np.argmax(np.mean(result_proba, axis=0), axis=1)

# 提交结果
now_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
submit['recommend_mode'] = pred_test
submit.to_csv('submission_{}.csv'.format(now_time), index=False)

  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 1.01235	valid_0's weighted-f1-score: 0.671802
[100]	valid_0's multi_logloss: 0.917836	valid_0's weighted-f1-score: 0.677159
[150]	valid_0's multi_logloss: 0.897418	valid_0's weighted-f1-score: 0.679313
[200]	valid_0's multi_logloss: 0.892203	valid_0's weighted-f1-score: 0.679891
[250]	valid_0's multi_logloss: 0.891986	valid_0's weighted-f1-score: 0.679858
Early stopping, best iteration is:
[204]	valid_0's multi_logloss: 0.892125	valid_0's weighted-f1-score: 0.680097
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 1.00907	valid_0's weighted-f1-score: 0.673015
[100]	valid_0's multi_logloss: 0.91363	valid_0's weighted-f1-score: 0.678086
[150]	valid_0's multi_logloss: 0.892745	valid_0's weighted-f1-score: 0.68014
[200]	valid_0's multi_logloss: 0.887679	valid_0's weighted-f1-score: 0.680696
[250]	valid_0's multi_logloss: 0.887403	valid_0's weighted-f1-score

## 特征重要度

In [11]:
imp = pd.DataFrame()
imp['feature'] = tra_x.columns
imp['imp'] = lgb.feature_importances_
imp = imp.sort_values('imp', ascending = False)
imp

AttributeError: module 'lightgbm' has no attribute 'feature_importances_'

In [71]:
pre = lgb.predict(valid_x)
f1_score(valid_y, pre, average='weighted')

0.6833997071807066

## 提交结果

In [None]:
x = train_x.drop(['req_time'], axis=1)
y = train_y.drop(['req_time'], axis=1)

lgb.n_estimators = lgb.best_iteration_
lgb.fit(x, y, categorical_feature=categorical_feature)
pred_test = lgb.predict(test_x)

# 提交结果
now_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
submit['recommend_mode'] = pred_test
submit.to_csv('submission_{}.csv'.format(now_time), index=False)