In [4]:
# This cell requests PeMS traffic data.

### Algorithm ###
# login() with your PeMS username and password
# prepare_response reads station_ids.csv in folder dataset/'city'
# prepare_response goes through each station in station_ids.csv
# prepare_response reads 8 weeks of vehicle 'flow' via get_response()

from datetime import datetime
from datetime import timedelta
from xml.etree import ElementTree
from networkx.readwrite import gpickle
from dateutil.parser import parse
from datetime import *; 
from dateutil.relativedelta import *



import os
import sys
import pandas as pd
import numpy as np
import requests
import pickle

def read_response(file):
    """Return the object stored in the pickle file at path ``file``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files produced by this notebook.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle)

def prepare_response(session):
    """Download eight weeks of flow data for every station and save it.

    Reads the ``ID`` column of ``station_ids.csv`` in the city folder, and for
    each station requests eight consecutive one-week windows starting at
    2016-01-01 through ``get_response``. The first line of every weekly
    response is dropped and the remainders are concatenated into
    ``raw_data_2_months\\<id>.txt``. Stations whose file already exists are
    skipped. A station for which any week comes back without data is reported
    and not written, so a later run retries it.

    Parameters
    ----------
    session : object
        Logged-in session forwarded to ``get_response``.

    Returns
    -------
    str
        The text assembled for the last station processed, or ``''`` if that
        station was skipped or there were no stations.
    """
    # root = "dataset\\anaheim\\"
    root = "dataset\\oakland\\"
    df = pd.read_csv(root + 'station_ids.csv', header=0)
    ids = df['ID'].values

    weeks_to_read = 8 # weeks
    first_day = parse('2016-01-01')
    # Bound before the loop so the final return works with an empty id list.
    station_response = ''

    for i in ids:
        station_response = ''
        destination = root + 'raw_data_2_months\\' + str(i) + '.txt'

        # check if station response was written before
        if os.path.isfile(destination):
            print(i, '.txt exists in destination, skipping..')
            continue

        # get 8 weeks of response
        weekly_rows = []
        complete = True
        for w in range(weeks_to_read):
            start_time = first_day + relativedelta(weeks=+w)
            start_time = start_time + relativedelta(hours=+3) # synching with server
            end_time = start_time + relativedelta(weeks=+1, minutes=-1)

            s_time_id = str(int(start_time.timestamp()))
            s_time_id_f = (start_time.strftime('%m') + '%2F' + start_time.strftime('%d')
                           + '%2F' + start_time.strftime('%Y') + '+00%3A00')
            e_time_id = str(int(end_time.timestamp()))
            e_time_id_f = (end_time.strftime('%m') + '%2F' + end_time.strftime('%d')
                           + '%2F' + end_time.strftime('%Y') + '+23%3A59')

            # Use the session handed to this function, not a notebook global.
            string = get_response(session=session, station_id=str(i), s_time_id=s_time_id, s_time_id_f=s_time_id_f, e_time_id=e_time_id, e_time_id_f=e_time_id_f)

            # get_response returns '' on failure; without a newline there is
            # no header/data split to make.
            if '\n' not in string:
                print('station', i, 'week', w, 'returned no data, not writing a file for it')
                complete = False
                break

            # The first line of each weekly response is dropped. The previous
            # guard (`i is not 0`) compared the station id by identity and was
            # effectively always true, so this keeps the files in the same
            # header-less format.
            weekly_rows.append(string.split('\n', maxsplit=1)[1])

        if not complete:
            continue

        station_response = ''.join(weekly_rows)

        # write response
        with open(destination, "w") as text_file:
            print("{}".format(station_response), file=text_file)

    print('Done.')
    return station_response

    
def get_response(session, station_id, s_time_id, s_time_id_f, e_time_id, e_time_id_f):
    """Request one time window of a station's time-series export as text.

    Builds the PeMS ``det_timeseries`` export URL (``q=flow``, ``gn=5min``,
    ``agg=on``, ``lane1=on``) from the arguments and posts it on ``session``.

    Parameters
    ----------
    session : object
        Object with a ``post(url)`` method, e.g. the session from ``login``.
    station_id : str
        Station identifier inserted into the query string.
    s_time_id, e_time_id : str
        Start and end of the window as epoch-second strings.
    s_time_id_f, e_time_id_f : str
        The same instants as already URL-encoded date strings.

    Returns
    -------
    str
        The response body decoded as UTF-8, or ``''`` if the request fails,
        the server answers with an HTTP error status, or decoding fails.
    """
    try:
        print('station_id:', station_id)
        url = 'http://pems.dot.ca.gov/?report_form=1&dnode=VDS&content=loops&tab=det_timeseries&export=text' \
            + '&station_id=' + station_id \
            + '&s_time_id=' + s_time_id \
            + '&s_time_id_f=' + s_time_id_f \
            + '&e_time_id=' + e_time_id \
            + '&e_time_id_f=' + e_time_id_f \
            + '&tod=all&tod_from=0&tod_to=0&dow_0=on&dow_1=on&dow_2=on&dow_3=on&dow_4=on&dow_5=on&dow_6=on' \
            + '&holidays=on' \
            + '&q=flow' \
            + '&q2=&gn=5min' \
            + '&agg=on' \
            + '&lane1=on'

        print(url)
        response = session.post(url)
        # Must be *called*; the bare attribute access never checked the status,
        # so HTTP error pages were returned as if they were data.
        response.raise_for_status()

        return response.content.decode('utf-8')
    except Exception as error:
        # Best effort: report which station failed and why, then let the
        # caller carry on with an empty result.
        print('request failed for station_id:', station_id, '-', repr(error))
        return ''
    
def login(username=None, password=None):
    """Open a session and post the PeMS login form.

    Credentials are taken from the arguments, falling back to the
    ``PEMS_USERNAME`` and ``PEMS_PASSWORD`` environment variables (and to an
    empty string if those are unset), so they do not have to be written into
    the notebook.

    Parameters
    ----------
    username : str, optional
        E-mail address registered with PeMS.
    password : str, optional
        Password for that account.

    Returns
    -------
    requests.Session
        The session on which the login form was posted.

    Raises
    ------
    requests.HTTPError
        If the server answers the login request with an HTTP error status.
    """
    if username is None:
        username = os.environ.get('PEMS_USERNAME', '')
    if password is None:
        password = os.environ.get('PEMS_PASSWORD', '')

    s = requests.Session()
    data = {"username": username, "password": password, 'login': 'Login'}
    # NOTE(review): the form is posted over plain http, so the credentials
    # travel unencrypted - confirm whether the https endpoint can be used.
    url = "http://pems.dot.ca.gov/"
    r = s.post(url, data=data)
    r.raise_for_status()
    return s

# Log in, download every station listed in station_ids.csv, then display the
# session cookies as the cell output.
# The former trailing `s.user.is_authenticated()` was removed: a requests
# Session has no `user` attribute, so that line raised AttributeError.
s = login()
string = prepare_response(s)
s.cookies

In [5]:
# This cell splits raw data into train, val, and test data that acts as an input to a Neural Network Model


import numpy as np
import pandas as pd
import sympy as sp
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import os
import random
import math
import matplotlib.pyplot as plt

from math import floor
from keras import optimizers
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, LSTM
from keras.layers.advanced_activations import PReLU
from keras.optimizers import RMSprop, SGD
from keras import backend as K
from numpy import genfromtxt
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.model_selection import TimeSeriesSplit
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.base import clone

def df_split(df, val_size=0.10, test_size=0.20):
    """Split a 5-minute series into train/validation/test parts by whole days.

    A day is 288 rows. The train part gets ``floor(days * (1 - val_size -
    test_size))`` days, the validation part ``floor(days * val_size)`` days,
    and the test part everything that is left (including any rows beyond the
    last whole day). With the default fractions this is the 70/10/20 split.

    Parameters
    ----------
    df : sliceable with ``.shape``
        Chronologically ordered data, e.g. a 1-D numpy array.
    val_size : float
        Fraction of the days used for validation.
    test_size : float
        Fraction of the days reserved for testing. It determines the train
        fraction; the test part itself is the remainder.

    Returns
    -------
    list
        ``[df_train, df_val, df_test]``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if val_size < 0 or test_size < 0 or val_size + test_size > 1:
        raise ValueError('val_size and test_size must be >= 0 and sum to at most 1')

    time_interval = 5
    N = (24 * 60) // time_interval
    days = df.shape[0] // N

    # Previously hard-coded to 0.7, which silently ignored both parameters.
    # Rounding removes float noise such as 1 - 0.1 - 0.2 == 0.7000000000000001.
    train_size = round(1.0 - val_size - test_size, 10)
    days_train = floor(days * train_size)
    days_validate = floor(days * val_size)
    days_test = days - days_train - days_validate

    train_end = days_train * N
    val_end = (days_train + days_validate) * N

    df_train = df[0:train_end]
    df_val = df[train_end:val_end]
    df_test = df[val_end:]

    print('days_train:', days_train)
    print('days_validate:', days_validate)
    print('days_test:', days_test)

    return [df_train, df_val, df_test]

# fills missing values with average of the same hour and minute
def fillaverage(df):
    """Fill missing ``vehicleCount`` values with the floored slot average.

    For every 5-minute slot of the day (hour ``h``, minute ``m``), missing
    values are replaced by the floor of the mean of the values observed at
    that same hour and minute. A slot with no observed values at all is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``vehicleCount`` column and an index exposing ``.hour``
        and ``.minute``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    hours = df.index.hour
    minutes = df.index.minute

    for h in range(24):
        for m in range(0, 60, 5):
            slot = (hours == h) & (minutes == m)
            hourminaverage = df.loc[slot, 'vehicleCount'].mean()

            # No observation for this slot: nothing to fill with, and
            # math.floor(nan) would raise.
            if math.isnan(hourminaverage):
                continue

            hourminaverage = math.floor(hourminaverage)
            # Single .loc assignment instead of chained indexing, which may
            # write to a temporary copy.
            df.loc[slot, 'vehicleCount'] = df.loc[slot, 'vehicleCount'].fillna(hourminaverage)
    return df

def get_samples(input_dim, output_dim, predict_range, df):
    """Cut a 5-minute series into sliding-window samples, day by day.

    Each day is 288 consecutive entries of ``df``. Within a day, every window
    of ``input_dim`` consecutive values becomes one input row, and the value
    ``predict_range`` steps after the window's last element becomes its
    target. Windows never cross a day boundary.

    Returns ``(inputdata, outputdata)`` with shapes
    ``(n_samples, input_dim)`` and ``(n_samples, output_dim)``.
    """
    slots_per_day = 1440 // 5
    day_count = df.shape[0] // slots_per_day
    windows_per_day = slots_per_day - input_dim - predict_range + 1
    total_windows = windows_per_day * day_count

    inputdata = np.zeros((total_windows, input_dim))
    outputdata = np.zeros((total_windows, output_dim))

    for row in range(total_windows):
        # row -> (day, offset inside that day's windows)
        day, offset = divmod(row, windows_per_day)
        start = offset + (day * slots_per_day)
        target = start + input_dim + predict_range - 1

        inputdata[row, :] = df[start:start + input_dim]
        outputdata[row, :] = [df[target]]

    return inputdata, outputdata

def clean_data(weekdays=False, weekends=False):
    """Turn the raw per-station files into train/val/test sample archives.

    For every file in ``dataset\\<city>\\raw_data_month\\`` the tab-separated
    data is loaded (first column parsed as the datetime index), files that do
    not hold exactly four weeks of 5-minute rows are skipped, the rows are
    optionally filtered by day of week, and column ``2`` is used as the
    vehicle count. The counts are split with ``df_split``, windowed with
    ``get_samples`` and written with ``np.savez`` to the ``train``, ``val``
    and ``test`` folders. Files that already have a train archive are skipped.

    Parameters
    ----------
    weekdays : bool
        Keep only Monday-Friday rows.
    weekends : bool
        Keep only Saturday/Sunday rows.

    NOTE(review): the destination folder name always starts with
    ``weekdays_`` regardless of the flags - confirm this is intended before
    running with ``weekends=True``.
    """
    # outputs = ['t24', 't36']
    # predict_range = [24, 36]
    # input_dim = [24, 36]

    outputs = ['t1']
    predict_range = [1]
    input_dim = [12]
    cities = ['oakland']
    expected_rows = 288 * 7 * 4  # four weeks of 5-minute rows

    for o, p, idim in zip(outputs, predict_range, input_dim):
        output_dim = 1

        root = "dataset\\"+ cities[0] + "\\raw_data_month\\"
        clean_root = "dataset\\"+ cities[0] + "\\clean_data\\weekdays_" + str(idim) + 'i_' + o
        train_destination = clean_root + "\\train\\"
        val_destination = clean_root + "\\val\\"
        test_destination = clean_root + "\\test\\"
        files = os.listdir(root)

        for i in files:
            print(root + i)

            if(os.path.exists(train_destination + i + '.npz')):
                print(i, 'exists in destination, skipping..')
                continue

            try:
                df = pd.read_csv(root + i, parse_dates=[0], header=None, index_col=0, sep='\t')
            except Exception:
                # Narrowed from a bare except so Ctrl-C still stops the run.
                print('skipped empty df:', i)
                continue

            if(df.empty):
                print('skipped empty df:', i)
                continue

            print(len(df))
            if(len(df) != expected_rows):
                print('skipped short df:', i)
                continue

            if(weekdays):
                df = df[df.index.dayofweek < 5]
            if(weekends):
                df = df[df.index.dayofweek >= 5]

            # Work on the column itself rather than assigning back into a
            # filtered slice of the frame.
            counts = df[2]
            if(counts.dtype != np.float64):
                # Values such as "1,234" carry thousands separators.
                counts = counts.map(lambda x: x.replace(',', ''))

            vehicleCount = counts.astype(np.float64).values

            [df_train, df_val, df_test] = df_split(vehicleCount)

            if((df_train.size == 0) or (df_val.size == 0) or (df_test.size == 0)):
                print('skipped empty df after split:', i)
                # This `continue` was missing, so empty archives were still
                # written for files reported as skipped.
                continue

            x_train, y_train = get_samples(idim, output_dim, p, df_train)
            x_val, y_val = get_samples(idim, output_dim, p, df_val)
            x_test, y_test = get_samples(idim, output_dim, p, df_test)

            np.savez(train_destination + i, x_train, y_train)
            np.savez(val_destination + i, x_val, y_val)
            np.savez(test_destination + i, x_test, y_test)

    print('done')

def normalize(x):
    """Standardise ``x`` to zero mean and unit standard deviation.

    When the standard deviation is zero (constant input) or NaN, there is no
    meaningful scale, so the centred values ``x - x.mean()`` are returned
    unscaled: zeros for constant input, NaN where the mean is NaN.

    The previous guards divided by ``np.finfo(np.float64).min`` - which is the
    most negative float, not a small positive one, and produced ``-0.0`` - and
    by ``np.finfo(np.float64).max``, which had no effect on NaN/inf values.
    """
    centered = x - x.mean()
    std = x.std()
    if std == 0 or np.isnan(std):
        return centered
    return centered / std
    # return (x - min(x)) / (max(x) - min(x))
    
# Build the sample archives from weekday (Monday-Friday) rows only.
clean_data(weekdays=True)

Using TensorFlow backend.


In [1]:
# This cell prepares the input for a clustering algorithm

def prepare_clusters(city, weekdays=False, weekends=False):
    """Compute the mean vehicle count of every raw file of a city.

    Each tab-separated file in ``dataset\\<city>\\raw_data_month\\`` is loaded
    with its first column parsed as the datetime index. Files that cannot be
    read, are empty, do not hold exactly four weeks of 5-minute rows, or have
    no rows left after the day-of-week filter are skipped.

    Parameters
    ----------
    city : str
        Name of the city folder under ``dataset``.
    weekdays : bool
        Keep only Monday-Friday rows.
    weekends : bool
        Keep only Saturday/Sunday rows.

    Returns
    -------
    list
        One ``[file_name, [mean_count]]`` entry per accepted file, where the
        mean is taken over column ``2``.
    """
    x = []
    root = "dataset\\" + city + "\\raw_data_month\\"
    files = os.listdir(root)
    expected_rows = 288 * 7 * 4  # four weeks of 5-minute rows

    for i in files:
        print(root + i)

        try:
            df = pd.read_csv(root + i, header=None, parse_dates=[0], index_col=0, sep='\t')
        except Exception:
            # Narrowed from a bare except so Ctrl-C still stops the run.
            print('skipped empty df:', i)
            continue

        if(df.empty):
            print('skipped empty df:', i)
            continue

        print(len(df))
        if(len(df) != expected_rows):
            print('skipped short df:', i)
            continue

        if(weekdays):
            df = df[df.index.dayofweek < 5]
        if(weekends):
            df = df[df.index.dayofweek >= 5]

        # With both filters set nothing is left; the mean would be NaN.
        if(df.empty):
            print('skipped empty df after filter:', i)
            continue

        # Work on the column itself rather than assigning back into a
        # filtered slice of the frame.
        counts = df[2]
        if(counts.dtype != np.float64):
            # Values such as "1,234" carry thousands separators.
            counts = counts.map(lambda value: value.replace(',', ''))

        t = np.mean(counts.astype(np.float64).values)
        x.append([i, [t]])

    return x

# Per-file mean weekday flow, separated into the list of file names and the
# one-column feature matrix used as the clustering input.
x = prepare_clusters(city="anaheim", weekdays=True)
ordered = [entry[0] for entry in x]
kmeans_input = np.vstack(np.array([entry[1] for entry in x]))
print(kmeans_input.shape)

In [14]:
# This cell uses the output of the previous cell as an input to the clustering algorithm
# The clustering algorithm segregates the raw data into 3 groups (low, medium, high); the numeric K-means labels 0, 1, 2 are translated to these names in the code below

from sklearn.cluster import KMeans

# K-means
# City whose dataset folder receives file_labels.csv at the end of this cell.
city = 'anaheim'

def map_traffic(x):
    """Translate a cluster label into a traffic-level name.

    0 -> 'medium', 1 -> 'low', 2 -> 'high'; any other value gives ``None``.

    NOTE(review): the assignment is fixed, so it is only right when the
    clustering happens to number its clusters in this order - verify against
    the cluster centres.
    """
    for code, level in ((0, 'medium'), (1, 'low'), (2, 'high')):
        if x == code:
            return level
    return None
    
# def get_clusters():
#     filepath = "AarhusClusters.csv"
#     df = pd.read_csv(filepath, index_col=0)
#     df.index = df.index - 1

#     high = df[df['TrafficLevelAvg'] == 'high'].index
#     medium = df[df['TrafficLevelAvg'] == 'medium'].index
#     low = df[df['TrafficLevelAvg'] == 'low'].index
        
#     return low, medium, high

# Cluster the per-file mean flows into three groups and show the raw result.
kmeans = KMeans(n_clusters=3, random_state=0).fit(kmeans_input)
print(kmeans.labels_)
print(kmeans.cluster_centers_)
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, kmeans_input)
print(closest)

# K-means numbers its clusters arbitrarily, so a fixed id -> name table can
# call the lowest-flow cluster 'medium' and the highest-flow one 'low'.
# Name each cluster by the rank of its centre instead: smallest mean flow is
# 'low', largest is 'high'.
traffic_levels = np.array(['low', 'medium', 'high'])
center_rank = np.argsort(np.argsort(kmeans.cluster_centers_.ravel()))
labels = traffic_levels[center_rank[kmeans.labels_]]
file_labels = np.column_stack((ordered, labels))

output = pd.DataFrame(file_labels)
output.to_csv('dataset\\'+ city +'\\file_labels.csv')



[0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 2 0 0 2 2 0 2 1 1 0 0 0 2 0 1 0 0
 2 0 0 2 1 1 0 0 1 0 0 2 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 2 0
 0 1 0 2 0 0 2 0 2 2 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 1 1
 0 1 0 0 1 0 1 1 1 0 1 2 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 1 0 0 2 1 0 0 1 1 0
 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 0
 1 0 2 0 0 0 0 0 2 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 2 0 1 0 1 0 2 0 0 0 0 0 0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[[  41.98705552]
 [ 376.4614945 ]
 [ 259.17594763]]
[255 201  24]


In [2]:
# This cell prints the number of elements in low, medium, and high clusters

def get_clusters(city, random=False):
    """Return the members of the low, medium and high traffic groups.

    Reads ``dataset\\<city>\\file_labels.csv`` with columns named
    ``file_name`` and ``TrafficLevelAvg``.

    With ``random`` false, each group is the index of the rows whose
    ``TrafficLevelAvg`` equals that level. With ``random`` true, the row
    positions are shuffled with ``np.random.shuffle`` and cut into three
    parts at one third and two thirds of the length.

    Returns ``(low, medium, high)``.
    """
    labels_csv = "dataset\\" + city + "\\file_labels.csv"
    df = pd.read_csv(labels_csv, index_col=0, header=None, names=['file_name', 'TrafficLevelAvg'])

    if not random:
        levels = df['TrafficLevelAvg']
        low = df[levels == 'low'].index
        medium = df[levels == 'medium'].index
        high = df[levels == 'high'].index
        return low, medium, high

    print('--USING RANDOM CLUSTERS--')
    positions = np.arange(len(df.index))
    np.random.shuffle(positions)
    third = len(positions) // 3
    low, medium, high = np.split(positions, [third, third * 2])
    print(len(low), len(medium), len(high))
    return low, medium, high

# Report how many files fall into each traffic level for Oakland.
low, medium, high = get_clusters(city='oakland')
for _caption, _members in (('low:', low), ('medium:', medium), ('high:', high)):
    print(_caption, len(_members))