From metroData_ts_v3.csv, extract:

- In/out passenger flow of each station for every 10-minute time slot


In [1]:
import numpy as np
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import os
import datetime
from tqdm import tqdm
import pickle
import copy

***

In [2]:
def get_tripLabel(UserID, inStation, outStation, station_lightLabels):
    """Classify a trip relative to the user's home and work stations.

    The user's entry in ``station_lightLabels`` supplies the home station
    under key ``'h'`` and the work station under key ``'w'``.

    Returns:
        0 -- the trip runs between the home and work stations (either way);
        1 -- otherwise, one end of the trip is the home station;
        2 -- otherwise, one end of the trip is the work station;
        3 -- neither end is the home or the work station.

    Raises:
        KeyError: if ``UserID`` (or its ``'h'`` / ``'w'`` entry) is missing.
    """
    user_labels = station_lightLabels[UserID]
    home = user_labels['h']
    work = user_labels['w']

    home_to_work = inStation == home and outStation == work
    work_to_home = inStation == work and outStation == home
    if home_to_work or work_to_home:
        return 0

    # Home takes precedence over work when only one end matches.
    if inStation == home or outStation == home:
        return 1
    if inStation == work or outStation == work:
        return 2
    return 3

In [3]:
# Flow counters kept per time slot and station, in the column order of the
# output CSV: inFlow, outFlow, CinFlow, HO_inFlow, WO_inFlow, CoutFlow,
# HO_outFlow, WO_outFlow.
_FLOW_COLUMNS = ('in', 'out', 'inC', 'inHO', 'inWO', 'outC', 'outHO', 'outWO')

# Trip label (as returned by get_tripLabel) -> (entry counter, exit counter).
# Label 3 (neither home nor work) only contributes to the total flow.
_TRIP_LABEL_COLUMNS = {
    0: ('inC', 'outC'),
    1: ('inHO', 'outHO'),
    2: ('inWO', 'outWO'),
}


def _build_day_index(year=2017, firstMonth=5, lastMonth=8):
    """Map each calendar day of the period to a consecutive index.

    Returns a dict ``{'YYYYMMDD': index}`` covering ``firstMonth`` through
    ``lastMonth`` of ``year`` (inclusive), with index 0 for the first day.
    """
    dayIndex = {}
    day = datetime.date(year, firstMonth, 1)
    while day.year == year and day.month <= lastMonth:
        dayIndex[day.strftime('%Y%m%d')] = len(dayIndex)
        day += datetime.timedelta(days=1)
    return dayIndex


def _slot_bounds(startHour, interval, timeslotsPerDay):
    """Return ``[(startTime, endTime), ...]`` as 'HHMMSS' strings per slot."""
    dayStart = datetime.datetime(1900, 1, 1, startHour)
    step = datetime.timedelta(minutes=interval)
    return [
        ((dayStart + t * step).strftime("%H%M%S"),
         (dayStart + (t + 1) * step).strftime("%H%M%S"))
        for t in range(timeslotsPerDay)
    ]


def _new_day_flows(stations, timeslotsPerDay):
    """Return zeroed counters: ``{column: [ {station: 0}, ... per slot ]}``."""
    return {
        column: [dict.fromkeys(stations, 0) for _ in range(timeslotsPerDay)]
        for column in _FLOW_COLUMNS
    }


def _write_day(outData, day, firstSlot, slotBounds, stations, flows):
    """Append one day's rows (every slot x every station) to ``outData``."""
    for t, (slotStart, slotEnd) in enumerate(slotBounds):
        prefix = [day, str(firstSlot + t), slotStart, slotEnd]
        outData.writelines(
            ','.join(
                prefix
                + [str(station)]
                + [str(flows[column][t][station]) for column in _FLOW_COLUMNS]
            ) + '\n'
            for station in stations
        )


def travelDemand(interval):
    """Extract per-station travel demand (in/out flow) every ``interval`` minutes.

    Reads the trip records in ``../Metro/metroData_ts_v3.csv`` (columns used:
    user id, day 'YYYYMMDD', entry time, exit time, entry station, exit
    station) and writes one row per (day, time slot, station) to
    ``/data6/peiyan/SH-METR/results/metroData_Demandflow_<interval>.csv``.
    Besides the total in/out flow, each row carries the flow split by trip
    label (see ``get_tripLabel``): commuting (C), home-other (HO) and
    work-other (WO).

    Only trips entering between 06:00 and 23:00 on a day between 2017-05-01
    and 2017-08-31 are counted. Rows are assumed to be grouped by day in the
    input (TODO confirm); a day is written out when the first row of the next
    day is seen.

    Differences from the earlier implementation:
      * the global timeslot id is derived from the day being written, so it
        stays correct when the data skips days and no longer fails when no
        row qualifies;
      * an exit at or after 23:00 is no longer wrapped into an early-morning
        slot of the same day -- the entry is still counted, the exit is not;
      * the input is streamed instead of being loaded into a list, so the
        progress bar shows a running count without a total;
      * the unused ``transferStations.pkl`` is no longer loaded. Station ids
        are used as they appear in the trip file (no transfer-station
        remapping).

    Args:
        interval: slot length in minutes.

    Raises:
        KeyError: if a counted trip references a station id that is not
            listed in ``station_transInfo_cleaned.csv``.
    """
    # metro schedule  6:00-23:00
    startHour = 6
    endHour = 23
    timeslotsPerDay = int((endHour - startHour) * 60 / interval)

    dayIndex = _build_day_index()
    slotBounds = _slot_bounds(startHour, interval, timeslotsPerDay)

    ##################
    # Load Station Info
    with open("../Metro/station_transInfo_cleaned.csv", 'r') as stationsData:
        next(stationsData, None)  # skip the header line
        stations = sorted(
            int(line.rstrip().split(',')[1]) for line in stationsData
        )

    ##################
    # Load User Info
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    with open('/data6/peiyan/SH-METR/results/station_lightLabels.pkl', 'rb') as infile:
        station_lightLabels = pickle.load(infile)

    ##################
    # Extract Demand
    inFile = "../Metro/metroData_ts_v3.csv"
    outFile = "/data6/peiyan/SH-METR/results/metroData_Demandflow_" + str(interval) + ".csv"

    flows = _new_day_flows(stations, timeslotsPerDay)
    currentDay = ''

    with open(inFile, 'r') as inData, open(outFile, 'w') as outData:
        outData.write("date, timeslot, startTime, endTime, station, inFlow, outFlow, CinFlow, HO_inFlow, WO_inFlow, CoutFlow, HO_outFlow, WO_outFlow\n")
        next(inData, None)  # skip the header line

        for row in tqdm(inData):
            row = row.rstrip().split(',')
            userID = int(row[0])
            transDay = row[1]
            InTime = row[2]
            outTime = row[3]

            transHour = int(InTime[:2])
            if transHour < startHour or transHour >= endHour:
                continue  # entry outside the metro schedule
            if transDay not in dayIndex:
                continue  # day outside the study period

            if currentDay == '':
                currentDay = transDay

            if InTime > outTime:
                continue  # inconsistent record: exit before entry

            if transDay != currentDay:
                # A new day starts: save the finished day and reset counters.
                _write_day(outData, currentDay,
                           dayIndex[currentDay] * timeslotsPerDay,
                           slotBounds, stations, flows)
                flows = _new_day_flows(stations, timeslotsPerDay)
                currentDay = transDay

            inSlot = int(((transHour - startHour) * 60 + int(InTime[2:4])) // interval)
            if inSlot >= timeslotsPerDay:
                # Only possible when interval does not divide the service
                # day evenly: the trailing partial slot is not reported.
                continue
            outSlot = int(((int(outTime[:2]) - startHour) * 60 + int(outTime[2:4])) // interval)
            # Exits after the last slot have no row to be counted in.
            countExit = 0 <= outSlot < timeslotsPerDay

            inStation = int(row[4])
            outStation = int(row[5])

            flows['in'][inSlot][inStation] += 1
            if countExit:
                flows['out'][outSlot][outStation] += 1

            try:
                tripLabel = get_tripLabel(userID, inStation, outStation, station_lightLabels)
            except (LookupError, TypeError):
                continue  # no usable home/work label for this user

            labelColumns = _TRIP_LABEL_COLUMNS.get(tripLabel)
            if labelColumns is None:
                continue
            inColumn, outColumn = labelColumns
            flows[inColumn][inSlot][inStation] += 1
            if countExit:
                flows[outColumn][outSlot][outStation] += 1

        # Save the last day (nothing to write if no row qualified).
        if currentDay != '':
            _write_day(outData, currentDay,
                       dayIndex[currentDay] * timeslotsPerDay,
                       slotBounds, stations, flows)

    

In [4]:
# Build the demand table with 10-minute time slots; the result is written to
# metroData_Demandflow_10.csv. The captured progress bar below shows the run
# over 736,847,403 input rows taking about 1h11m.
travelDemand(interval=10)

100%|██████████| 736847403/736847403 [1:11:02<00:00, 172871.69it/s]
