In [1]:
import pandas as pd

from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
import numpy as np

# Model / training configuration shared by the cells below.
CLASS_DIM = 2      # number of output classes (size of the final softmax layer)
EMB_DIM = 128      # width of the word-embedding vectors
HID_DIM = 512      # size passed to the fc / LSTM layers
STACKED_NUM = 3    # number of stacked LSTM layers (stacked_num of stacked_lstm_net)
BATCH_SIZE = 128   # samples per mini-batch produced by the training reader
USE_GPU = False    # intended device switch: True for GPU, False for CPU




In [2]:
# Stream the raw bus log in 100k-row chunks so the whole file never has to fit
# in memory; only the seven columns selected by position are loaded.
reader = pd.read_csv('busdata.csv', chunksize=100000, usecols=[1, 3, 5, 6, 7, 8, 9])

# Keep only the records of bus 80187 that were actually moving (speed > 0).
frames = []
for chunk in reader:
    frames.append(chunk[(chunk.O_BUSNAME == 80187) & (chunk.O_SPEED > 0)])
result = pd.concat(frames)

# O_TIME holds a time of day only, so the parsed timestamps all carry the
# default date 1900-01-01; sorting therefore orders rows by time of day.
result['O_TIME'] = pd.to_datetime(result['O_TIME'], format="%H:%M:%S")
result = result.sort_values(by='O_TIME')

# NOTE: a bare `result.set_index('O_TIME')` used to sit here. set_index returns
# a new frame and its result was discarded, so the call did nothing. O_TIME is
# deliberately kept as a regular column because a later cell reads it by
# position (`result.iloc[0, 3]`).
result.to_csv("238night_80187_result_sort.csv")

In [5]:
# Display the first row of `result` as a quick sanity check.
result.iloc[0]

O_LINENO                      2390
O_BUSNAME                    80187
O_DATE                  2016-01-01
O_TIME         1900-01-01 00:00:02
O_LONGITUDE                7404.54
O_LATITUDE                 2506.38
O_SPEED                         31
Name: 1965, dtype: object

In [34]:
# Take the first row's value at column position 3 (O_TIME in the row printed
# above) and run it through to_datetime, then show the value and its type.
time = pd.to_datetime(result.iloc[0, 3], format="%H:%M:%S")
print(time)
print(type(time))

1900-01-01 00:01:37
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [None]:
def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
    """Build a stacked LSTM classifier on top of an embedding layer.

    Args:
        data: input variable fed to the embedding layer.
        input_dim: first dimension of the embedding table (vocabulary size).
        class_dim: size of the final softmax layer (number of classes).
        emb_dim: width of each embedding vector.
        hid_dim: size passed to every fc and dynamic_lstm layer.
        stacked_num: total number of fc + LSTM stages, including the first.

    Returns:
        The output of the final softmax fc layer.
    """
    emb = fluid.layers.embedding(
        input=data, size=[input_dim, emb_dim], is_sparse=True)

    # First fc + LSTM stage; the LSTM cell state is not used.
    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
    lstm1, _ = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)

    inputs = [fc1, lstm1]

    # Remaining stages: each consumes the previous (fc, lstm) pair, and every
    # even-numbered stage runs its LSTM in reverse.
    for i in range(2, stacked_num + 1):
        fc = fluid.layers.fc(input=inputs, size=hid_dim)
        lstm, _ = fluid.layers.dynamic_lstm(
            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
        inputs = [fc, lstm]

    # Max-pool the last fc and LSTM outputs over each sequence.
    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')

    prediction = fluid.layers.fc(
        input=[fc_last, lstm_last], size=class_dim, act='softmax')
    return prediction

In [None]:
# Next we define the inference program. It runs the network on the input
# declared with fluid.layers.data.
def inference_program(word_dict):
    """Declare the "words" input and build the prediction network on it.

    Args:
        word_dict: word-to-id dictionary; only its length is used here, as
            the vocabulary size of the embedding layer.

    Returns:
        The prediction variable returned by stacked_lstm_net.
    """
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    dict_dim = len(word_dict)

    # convolution_net is not defined anywhere in this notebook, so build the
    # stacked LSTM network that is defined above.
    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
    return net

In [None]:
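#Here we define train_program. It uses the result returned by inference_program to compute the loss. We also define the optimizer function, optimizer_func.

#Because this is supervised learning, the training labels are also declared through fluid.layers.data. During training, cross-entropy (fluid.layers.cross_entropy) is used as the loss function.

#During testing, the classifier computes the probability of each output class. The first value returned is defined to be the cost.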

In [None]:
def train_program(word_dict):
    """Build the training graph and return ``[avg_cost, accuracy]``.

    The prediction comes from inference_program(word_dict); the first element
    of the returned list is the mean cross-entropy cost.
    """
    predicted = inference_program(word_dict)

    # Supervised-learning target: fed under the name "label", shape [1], int64.
    target = fluid.layers.data(name="label", shape=[1], dtype="int64")

    # Loss is the cross-entropy between prediction and label, averaged.
    avg_cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=predicted, label=target))

    accuracy = fluid.layers.accuracy(input=predicted, label=target)

    return [avg_cost, accuracy]


def optimizer_func():
    """Return the optimizer used for training: Adagrad, learning rate 0.002."""
    optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
    return optimizer


In [None]:
# Define the training environment: choose whether training runs on CPU or GPU.
# The flag comes from the USE_GPU switch in the configuration cell so there is
# a single place to change the device.
use_cuda = USE_GPU
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()


In [None]:
# Define the data provider.
# The reader below yields mini-batches of BATCH_SIZE samples. The IMDB training
# set is shuffled through a buffer of buf_size samples before batching.
# Note: loading the IMDB data can take a few minutes.

print("Loading IMDB word dict....")
word_dict = paddle.dataset.imdb.word_dict()

print("Reading training data....")
shuffled_train = paddle.reader.shuffle(
    paddle.dataset.imdb.train(word_dict), buf_size=25000)
train_reader = paddle.batch(shuffled_train, batch_size=BATCH_SIZE)


In [None]:
# Build the trainer.
# It needs a training program and an optimizer function; train_program is
# bound to word_dict so the trainer can call it without arguments.
bound_train_program = partial(train_program, word_dict)
trainer = fluid.Trainer(
    train_func=bound_train_program,
    optimizer_func=optimizer_func,
    place=place)

In [None]:
# Feeding data.
# feed_order maps each column produced by the reader to an input declared with
# fluid.layers.data: the first column goes to "words", the second to "label".
feed_order = ["words", "label"]

In [None]:
# Directory in which the trained parameters are saved.
# Event handler: the callback event_handler is invoked whenever a predefined
# training event fires, e.g. to inspect the metrics after every step.
params_dirname = "understand_sentiment_conv.inference.model"

def event_handler(event):
    """Print step metrics and stop training after step 10.

    Only fluid.EndStepEvent instances are handled; any other event is ignored.
    When event.step == 10 the parameters are saved to params_dirname and the
    trainer is stopped.
    """
    if isinstance(event, fluid.EndStepEvent):
        # Materialise the metrics: on Python 3 a bare map() would print as
        # "<map object at ...>" instead of the values.
        metrics = list(map(np.array, event.metrics))
        print("Step {0}, Epoch {1} Metrics {2}".format(
                event.step, event.epoch, metrics))

        if event.step == 10:
            trainer.save_params(params_dirname)
            trainer.stop()

In [None]:
# Start training.
# Pass the number of epochs (num_epochs) together with the reader, the feed
# order and the event handler to trainer.train.
trainer.train(
    reader=train_reader,
    feed_order=feed_order,
    num_epochs=1,
    event_handler=event_handler)

In [None]:
# Build the inferencer.
# It is initialised from inference_program (bound to word_dict) and
# params_dirname, the directory the training parameters were saved to.
infer_func = partial(inference_program, word_dict)
inferencer = fluid.Inferencer(
    infer_func=infer_func,
    param_path=params_dirname,
    place=place)

In [None]:
# Sample reviews to classify.
reviews_str = [
    'read the book forget the movie',
    'this is a great movie',
    'this is very bad',
]
reviews = [sentence.split() for sentence in reviews_str]

# Map every token to its dictionary id; unknown words fall back to '<unk>'.
UNK = word_dict['<unk>']
lod = [[word_dict.get(token, UNK) for token in tokens] for tokens in reviews]

# Per-review sequence lengths, wrapped in one outer list.
base_shape = [[len(ids) for ids in lod]]

tensor_words = fluid.create_lod_tensor(lod, base_shape, place)

In [None]:
# Run inference on the encoded reviews and report both class probabilities.
results = inferencer.infer({'words': tensor_words})

for i, r in enumerate(results[0]):
    positive_prob, negative_prob = r[0], r[1]
    print("Predict probability of ", positive_prob,
          " to be positive and ", negative_prob,
          " to be negative for review '", reviews_str[i], "'")