In [1]:
import os
import pandas as pd 
from datetime import datetime
from key import API_LINKS, CONSUMER_KEY
from api import get_citycodes_by_pref

In [2]:
def filter_by_transport(df, transport_type, citycodes):
    """Filter floating population data by transportation type.

    Keyword arguments:
    df - DataFrame holding the raw records; must contain every column in `targetcols`
    transport_type - One of the keys of `transcodes` ('stop', 'walk', 'bicycle', 'train', 'car')
    citycodes - List-like of city codes to keep (matched against the `citycode` column)

    Returns a new, independent DataFrame restricted to `targetcols`, to rows whose
    `citycode` is in `citycodes`, and to rows whose `transportation_type` equals the
    numeric code of `transport_type`. The input frame is never modified.

    Raises ValueError when an argument is None or `transport_type` is unknown.
    """
    targetcols = ['dailyid', 'year', 'month', 'day', 'dayofweek', 'hour', 'minute', 'latitude', 'longitude', 'citycode', 'transportation_type']
    transcodes = { 'stop': 1,'walk': 2,'bicycle': 5,'train': 7,'car': 8 } # refer to Agoop data document

    # ValueError is a subclass of Exception, so existing `except Exception` handlers still work.
    if df is None:
        raise ValueError("No dataframe was passed to function!")
    if citycodes is None:
        raise ValueError("Please specify citycodes!")
    if transport_type is None:
        raise ValueError("Please specify transport type!")
    if transport_type not in transcodes:
        # Build the list from the table so the message cannot drift from the supported codes.
        raise ValueError("Invalid transport type. Available: ({})".format(", ".join(transcodes)))

    wanted = df['citycode'].isin(citycodes) & (df['transportation_type'] == transcodes[transport_type])

    # .copy() hands back an independent frame, so callers can add columns to the
    # result without triggering pandas' SettingWithCopyWarning.
    return df.loc[wanted, targetcols].copy()

In [3]:
def _save_day(frames, path, datestring):
    """Concatenate one day's hourly frames and write them to '<path>/<datestring>.csv'.

    Returns the concatenated frame, or None when `frames` is empty.
    A failed write is reported with a warning instead of being silently dropped.
    """
    if not frames:
        return None

    day = pd.concat(frames)
    try:
        day.to_csv(os.path.join(path, '{}.csv'.format(datestring)))
    except OSError as err:
        import warnings
        warnings.warn('Could not write data for {}: {}'.format(datestring, err))
    return day


def get_data(path: str, start: str, days: int, prefname: str, transport: str, verbose=True):
    """Get Agoop floating population data

    Keyword arguments:
    path - Path to directory in which the data will be stored (created if missing)
    start - Starting date in 'yyyy-mm-dd' format
    days - Number of days of data we want to obtain
    prefname - Prefecture name, passed to get_citycodes_by_pref to look up the city codes to keep
    transport - Which mode of transportation
    verbose - Show progress?

    One CSV file is written per day, named after the last hourly timestamp
    ('yyyymmdd_HH') of that day. Hours that cannot be downloaded or filtered
    are skipped.

    Returns a DataFrame with all rows that were collected (empty if none were).
    """

    citycodes = get_citycodes_by_pref(prefname=prefname)

    # Without the target directory every to_csv call would fail and the data would be lost.
    os.makedirs(path, exist_ok=True)

    url_template = API_LINKS['floatpoint']['url']
    # An explicit offset object avoids the deprecated 'H' frequency alias.
    hourly = pd.date_range(start=start, periods=days * 24, freq=pd.offsets.Hour())

    collected = []   # one concatenated frame per saved day
    pending = []     # hourly frames of the day currently being collected
    datestring = None
    for timestamp in hourly:
        if timestamp.hour > 9: # No data available before 10 a.m.
            datestring = timestamp.strftime("%Y%m%d_%H")
            if verbose: print('Obtaining data from {}'.format(timestamp))
            try:
                raw = pd.read_csv(url_template.format(datestring, CONSUMER_KEY))
                hour_df = filter_by_transport(raw, transport, citycodes)
            except Exception as err:
                # Best effort: skip hours that fail, but say so. Only the exception
                # type is printed because the request URL embeds CONSUMER_KEY and
                # must not end up in notebook output.
                if verbose: print('Skipping {} ({})'.format(timestamp, type(err).__name__))
            else:
                # assign() builds a new frame instead of writing into a filtered slice.
                pending.append(hour_df.assign(datetime=timestamp))

        if timestamp.hour == 23: # end of day
            day = _save_day(pending, path, datestring)
            if day is not None:
                collected.append(day)
            pending = [] # empty list

    # A range that does not end at 23:00 (e.g. `start` carries a time of day)
    # leaves a partial day behind; write it too instead of discarding it.
    day = _save_day(pending, path, datestring)
    if day is not None:
        collected.append(day)

    print('Done!')

    if not collected:
        return pd.DataFrame()
    return pd.concat(collected, ignore_index=True)

In [4]:
# Fetch seven days of Tokyo-prefecture train records starting 2019-06-01;
# get_data writes one CSV per day under ../../data/floatpop.
df = get_data(
    path='../../data/floatpop',
    start='2019-06-01',
    days=7,
    prefname='東京都',
    transport='train',
)

Obtaining data from 2019-06-01 10:00:00
Obtaining data from 2019-06-01 11:00:00
Obtaining data from 2019-06-01 12:00:00
Obtaining data from 2019-06-01 13:00:00
Obtaining data from 2019-06-01 14:00:00
Obtaining data from 2019-06-01 15:00:00
Obtaining data from 2019-06-01 16:00:00
Obtaining data from 2019-06-01 17:00:00
Obtaining data from 2019-06-01 18:00:00
Obtaining data from 2019-06-01 19:00:00
Obtaining data from 2019-06-01 20:00:00
Obtaining data from 2019-06-01 21:00:00
Obtaining data from 2019-06-01 22:00:00
Obtaining data from 2019-06-01 23:00:00
Obtaining data from 2019-06-02 10:00:00
Obtaining data from 2019-06-02 11:00:00
Obtaining data from 2019-06-02 12:00:00
Obtaining data from 2019-06-02 13:00:00
Obtaining data from 2019-06-02 14:00:00
Obtaining data from 2019-06-02 15:00:00
Obtaining data from 2019-06-02 16:00:00
Obtaining data from 2019-06-02 17:00:00
Obtaining data from 2019-06-02 18:00:00
Obtaining data from 2019-06-02 19:00:00
Obtaining data from 2019-06-02 20:00:00


In [38]:
# Ratio of distinct dailyid values to the total number of rows.
# dropna=False counts a missing id once, exactly like len(Series.unique()).
df['dailyid'].nunique(dropna=False) / len(df)

0.10880362272970735