In [2]:
import numpy as np
import pandas as pd
import os
import datetime
from tqdm import tqdm
from collections import Counter
import warnings
warnings.filterwarnings("ignore")


def get_info(x):
    """Split ``x`` on spaces and keep the text after the last ':' of each piece."""
    payloads = []
    for token in x.split(" "):
        # rpartition yields the whole token when it contains no ':'.
        payloads.append(token.rpartition(":")[2])
    return payloads
def get_speed(x):
    """Return the first comma-separated field of every record as a float16 array."""
    speeds = []
    for record in x:
        speeds.append(record.split(",")[0])
    return np.array(speeds, dtype='float16')
def get_eta(x):
    """Return the second comma-separated field of every record as a float16 array."""
    eta_fields = map(lambda record: record.split(",")[1], x)
    return np.array(list(eta_fields), dtype="float16")
def get_state(x):
    """Return the third comma-separated field of every record as a string array."""
    split_records = (record.split(",") for record in x)
    return np.array([fields[2] for fields in split_records])
def get_cnt(x):
    """Return the fourth comma-separated field of every record as an int16 array."""
    counts = []
    for record in x:
        fields = record.split(",")
        counts.append(fields[3])
    return np.array(counts, dtype="int16")


def get_feature(input_file_path_his, input_file_path_attr,input_file_path_topo, mode):
    """Turn a raw ';'-separated traffic file into a flat feature table and save it as CSV.

    Parameters
    ----------
    input_file_path_his : str
        Path of the raw file. It is read with ``sep=";"`` and no header. Column 0
        holds four space-separated integers (link, label, current slice id,
        future slice id); columns 1-5 hold space-separated ``key:a,b,c,d`` records.
    input_file_path_attr : str
        Path of a tab-separated attribute file without header; it is left-joined
        onto the features on ``link``.
    input_file_path_topo : str
        Accepted but not used by this function.
    mode : str
        ``"is_train"`` keeps the label (values >= 3 are set to 3, then every label
        is shifted down by 1) and prefixes the output file name with the mode.
        Any other value drops the label column.

    Returns
    -------
    None
        The table is written to ``./data/<name>.csv``, where ``<name>`` is the
        input file name up to its first '.'.
    """
    # his
    df = pd.read_csv(input_file_path_his, sep=";", header=None)

    # Column 0 is "link label current_slice_id future_slice_id".
    header_columns = ("link", "label", "current_slice_id", "future_slice_id")
    for position, column in enumerate(header_columns):
        df[column] = df[0].apply(lambda raw: raw.split(" ")[position]).astype(int)
    df["time_diff"] = df["future_slice_id"] - df["current_slice_id"]
    df = df.drop(columns=[0])

    if mode == "is_train":
        # Merge every label >= 3 into 3, then make the classes start at 0.
        capped = df["label"].map(lambda value: 3 if value >= 3 else value)
        df["label"] = capped - 1
    else:
        df = df.drop(columns=["label"])

    # Last record of column 1, text after its last ':'. Its fields are:
    # road-condition speed, ETA speed, road-condition state, and the number of
    # vehicles that took part in the road-condition computation.
    latest_record = df[1].apply(lambda raw: raw.split(" ")[-1].split(":")[-1])
    current_columns = ("current_speed", "current_eat_speed", "current_state", "current_count")
    for position, column in enumerate(current_columns):
        # Values are kept as the raw strings found in the file.
        df[column] = latest_record.apply(lambda record: record.split(",")[position])

    for column_id in tqdm(range(1, 6, 1)):
        prefix = f"his_{(6-column_id)*7}"
        records = df[column_id].apply(get_info)

        # speed
        speeds = records.apply(get_speed)
        df[f'{prefix}_speed_mean'] = speeds.apply(lambda values: values.mean())

        # eta
        etas = records.apply(get_eta)
        df[f"{prefix}_eta_mean"] = etas.apply(lambda values: values.mean())

        # state: most common and least common state among the records
        ranked_states = records.apply(get_state).apply(lambda values: Counter(values).most_common())
        df[f"{prefix}_state_max"] = ranked_states.apply(lambda ranked: ranked[0][0])
        df[f"{prefix}_state_min"] = ranked_states.apply(lambda ranked: ranked[-1][0])

        # cnt
        counts = records.apply(get_cnt)
        df[f"{prefix}_cnt_mean"] = counts.apply(lambda values: values.mean())

        # The raw column is no longer needed once its aggregates exist.
        df = df.drop(columns=[column_id])

    link_attributes = pd.read_csv(
        input_file_path_attr,
        sep='\t',
        header=None,
        names=['link', 'length', 'direction', 'path_class', 'speed_class',
               'LaneNum', 'speed_limit', 'level', 'width'],
    )
    df = df.merge(link_attributes, on='link', how='left')

    # Only the training output carries the mode as a file-name prefix.
    file_stem = input_file_path_his.split('/')[-1].split('.')[0]
    name_prefix = f"{mode}_" if mode == "is_train" else ""
    output_file_path = f"./data/{name_prefix}{file_stem}" + ".csv"
    df.to_csv(output_file_path, index=False, mode='w', header=True)

In [3]:
if __name__ =="__main__":
    # Timestamps before and after show how long feature extraction takes.
    print(datetime.datetime.now())

    # Training set
    get_feature(
        input_file_path_his="data/traffic/20190701.txt",
        input_file_path_attr="data/attr.txt",
        input_file_path_topo="data/topo.txt",
        mode="is_train",
    )

    # Test set
    get_feature(
        input_file_path_his="data/test/test.txt",
        input_file_path_attr="data/attr.txt",
        input_file_path_topo="data/topo.txt",
        mode="is_test",
    )

    print(datetime.datetime.now())

2020-12-13 23:29:29.426964


100%|██████████| 5/5 [03:26<00:00, 41.25s/it]
100%|██████████| 5/5 [03:24<00:00, 40.93s/it]


2020-12-13 23:37:13.921012


In [4]:
# Version: python 3.7.9  tensorflow 2.0.0 keras 2.3.1
# Name:Model DNN
# =======================================================================

import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
import keras as K
from keras import losses
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import np_utils
from keras_applications import vgg16
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

def Dnn_Model(train=None,label=None, test=None, use_features=None,categorical_feats=None, n_class=3):
    """Train a small dense softmax classifier and predict a label for every test row.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain the ``use_features`` columns and ``label``.
    label : str
        Name of the target column. Its values are one-hot encoded into
        ``n_class`` classes.
    test : pandas.DataFrame
        Rows to predict; must contain ``use_features`` plus ``link``,
        ``current_slice_id`` and ``future_slice_id``. It is not modified.
    use_features : list of str
        Columns fed to the network.
    categorical_feats : any
        Accepted but not used by this function.
    n_class : int
        Number of classes, used for both the one-hot targets and the output layer.

    Returns
    -------
    pandas.DataFrame
        Columns ``link``, ``current_slice_id``, ``future_slice_id`` and ``label``,
        where ``label`` is the arg-max class index plus 1.
    """
    features = train[use_features]
    input_dim = features.shape[1]

    # One-hot width must match the softmax layer, so use n_class rather than a literal.
    Y = np_utils.to_categorical(train[label], num_classes=n_class)
    train_x, test_x, train_y, test_y = train_test_split(features, Y,
                                                        train_size=0.7, test_size=0.3, random_state=0)
    # 2. Define the model
    init = K.initializers.glorot_uniform(seed=1)
    model = K.models.Sequential()
    model.add(K.layers.Dense(units=5, input_shape=(input_dim,), kernel_initializer=init, activation='relu'))
    model.add(K.layers.Dense(units=6, kernel_initializer=init, activation='relu'))
    model.add(K.layers.Dense(units=n_class, kernel_initializer=init, activation='softmax'))
    # rmsprop could be configured explicitly; the default settings are used here
    model.compile(loss="categorical_crossentropy",
                  optimizer="rmsprop",
                  metrics=["accuracy"])

    b_size=128
    max_epochs=10
    model.fit(train_x, train_y, batch_size=b_size, epochs=max_epochs, shuffle=True,
              validation_data=(test_x, test_y),verbose=1)
    loss_and_metrics = model.evaluate(test_x, test_y, batch_size=b_size)
    print(loss_and_metrics)

    predictions = model.predict(test[use_features])

    # Build the result on a copy so the caller's frame does not gain a "label" column.
    submission = test[["link", 'current_slice_id', 'future_slice_id']].copy()
    # +1 maps the class index back to a 1-based label.
    submission["label"] = np.argmax(predictions, axis=1) + 1
    return submission

Using TensorFlow backend.


In [5]:
if __name__ =="__main__":
    # Feature tables for the training day and for the test set.
    train = pd.read_csv('data/is_train_20190701.csv')
    test = pd.read_csv("data/test.csv")

    # Every training column except the identifier and the target is a model input.
    del_feature = ['link','label']
    use_features = [column for column in train.columns if column not in del_feature]
    # Defined for reference only; it is not passed to the model below.
    category = ["direction","pathclass","speedclass","LaneNum","level"]

    print(datetime.datetime.now())
    submit = Dnn_Model(
        train=train,
        label="label",
        test=test,
        use_features=use_features,
        categorical_feats=None,
        n_class=3,
    )
    submit.to_csv('submit.csv', index=False, encoding='utf8')

    print(datetime.datetime.now())

2020-12-13 23:40:38.098227

Train on 352489 samples, validate on 151067 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.376646004336733, 0.8608365654945374]
2020-12-13 23:41:52.307705


In [None]:
# DNN model

def DNN_Model(dataset=None,label=None,use_features=None,categorical_feats=None, n_class=3):
    """Train a small dense classifier fold by fold and score it on a 30% hold-out.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the ``use_features`` columns and the ``label`` column.
    label : str
        Name of the target column; values are one-hot encoded into ``n_class`` classes.
    use_features : list of str
        Columns fed to the network.
    categorical_feats : any
        Accepted but not used by this function.
    n_class : int
        Number of classes for the one-hot targets and the output layer. The class
        weights and the F1 weighting below are written for classes 0, 1 and 2.

    Returns
    -------
    tuple
        ``(final_f1, history)``. ``final_f1`` is the weighted per-class F1
        (0.2 / 0.2 / 0.6 for classes 0 / 1 / 2) on the hold-out after the last
        fold. ``history`` is a list with the object returned by ``model.fit``
        for each fold, in fold order.
    """
    input_dim = dataset[use_features].shape[1]

    # One-hot width must match the softmax layer, so use n_class rather than a literal.
    Y = np_utils.to_categorical(dataset[label], num_classes=n_class)
    x_train, x_test, y_train, y_test = train_test_split(dataset[use_features], Y,
                                                        train_size=0.7, test_size=0.3, random_state=0)
    # f1_score compares class indices, so undo the one-hot encoding of the hold-out once.
    y_test_labels = np.argmax(y_test, axis=1)

    # define the model
    init = K.initializers.glorot_uniform(seed=1)
    simple_adam = K.optimizers.Adam()
    model = K.models.Sequential()
    model.add(K.layers.Dense(units=5, input_shape=(input_dim,), kernel_initializer=init, activation='relu'))
    model.add(K.layers.Dense(units=6, kernel_initializer=init, activation='relu'))
    model.add(K.layers.Dense(units=n_class, kernel_initializer=init, activation='softmax'))
    model.compile(loss="categorical_crossentropy",
                  optimizer=simple_adam,
                  metrics=["accuracy"])

    # train the model
    print ("Start training \n")

    weights = {0:1,1:1,2:2}
    history = []
    final_f1 = None
    folds = KFold(n_splits=3, shuffle=True, random_state=0)
    # NOTE(review): the same model keeps training across folds, so from the second
    # fold on the validation rows have already been seen during training. Only the
    # hold-out F1 below is computed on data the model was never fitted on.
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(x_train), start=1):
        print('----------------------\n'
              'fold '+str(n_fold)+'开始了\n'+'本轮共有'+str(len(train_idx))+'条数据参与训练')
        # KFold yields positional row indices: the feature frame needs .iloc,
        # while the one-hot target array can be indexed directly.
        train_x2, train_y2 = x_train.iloc[train_idx], y_train[train_idx]
        valid_x2, valid_y2 = x_train.iloc[valid_idx], y_train[valid_idx]
        fold_history = model.fit(train_x2, train_y2,batch_size = 100, epochs=3,
                                 validation_data=(valid_x2,valid_y2),
                                 validation_freq=1,class_weight = weights)
        history.append(fold_history)

        y_preds = np.argmax(model.predict(x_test), axis=1).reshape(-1)
        # Explicit labels keep the result at three entries even when a class is
        # missing from both the hold-out targets and the predictions.
        f1_scores = f1_score(y_test_labels, y_preds, labels=[0, 1, 2], average=None)
        final_f1 = f1_scores[0] * 0.2 + f1_scores[1] * 0.2 + f1_scores[2] * 0.6

    print ("Training completed \n")

    return final_f1,history

In [None]:
# run the model

# Only the first 10000 rows of the merged dataset are used for this run.
dataset = pd.read_csv('input/dataset_merged_2d.csv').iloc[:10000]

# Every column except the identifier and the target is a model input.
del_feature = ['link','label']
use_features = [column for column in dataset.columns if column not in del_feature]
# Defined for reference only; it is not passed to the model below.
category = ["direction","pathclass","speedclass","LaneNum","level"]
print(datetime.datetime.now())

DNN = DNN_Model(
    dataset=dataset,
    label="label",
    use_features=use_features,
    categorical_feats=None,
    n_class=3,
)
final_f1, history = DNN[0], DNN[1]

print("Final F1 on test set = ", final_f1)

print(datetime.datetime.now())