In [1]:
!pip install statsforecast=='1.0.0'

Collecting statsforecast==1.0.0
  Downloading statsforecast-1.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m713.8 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: statsforecast
Successfully installed statsforecast-1.0.0
[0m

In [2]:
"""
Note from http://web.mta.info/developers/fare.html:
These files show the number of MetroCard swipes made each week by customers entering each station 
of the New York City Subway, PATH, AirTrain JFK and the Roosevelt Island Tram, broken out to show 
the relative popularity of the various types of MetroCards. MTA New York City Transit posts the 
latest data every Saturday by 1 a.m., and the dates listed in the links reference the date the 
data is posted. The data in the files covers seven-day periods beginning on the Saturday two 
weeks prior to the posting date and ending on the following Friday. Thus, as an example, the file 
labeled Saturday, January 15, 2011, has data covering the period from Saturday, January 1, 2011, 
through Friday, January 7. The file labeled January 22 has data covering the period from Saturday, 
January 8, through Friday, January 14. And so on and so forth.
"""

import os, sys
import pandas as pd
from tqdm import tqdm
from datetime import timedelta, datetime

from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA

In [3]:
def download_files(begin_week, num_weeks, save_dir):
    """   
    Parameters
    ----------
    begin_week : str
        Ending date of the week (in yymmdd format) to begin the download.
    num_weeks : int
        Number of weeks to download starting from begin_week.
    save_dir : str
        Directory to save the files.

    Returns
    -------
    None.
    """
    begin_week = datetime.strptime(begin_week, '%y%m%d')
    for i in range(num_weeks):
        print('Downloading Week ' + str(i+1) + '/' + str(num_weeks) + '...')
        report_week = begin_week + timedelta(days= i * 7)
        data_week =  report_week - timedelta(days=7)
        file_name = '{:%y%m%d}'.format(report_week) + '.csv'
        data_name = '{:%y%m%d}'.format(data_week) + '.csv'
        link = 'http://web.mta.info/developers/data/nyct/fares/fares_' + file_name
        df = pd.read_csv(link, skiprows=2, index_col=False)
        df = df.drop(columns=[column for column in df.columns.tolist() if column.isspace()])
        df.to_csv(os.path.join(save_dir, data_name), index=False)
        
def add_data(df, data_path):
    """
    Parameters
    ----------
    df : str or pandas.DataFrame
        Path to the existing main data file, or existing pandas.DataFrame.
    data_dir : str
        Path to the new data file.

    Returns
    -------
    df : pandas.DataFrame
        Updated data.
    df : boolean
        Signal whether the new data is successfully added.    
    """
    if isinstance(df, str) :
        df = pd.read_csv(df)
    if df is None:
        df = pd.DataFrame()    
    if 'web.mta.info' in data_path:
        new_data = pd.read_csv(data_path, skiprows=2, index_col=False)
        file_date = datetime.strptime(data_path[-10:-4], '%y%m%d')
        data_date = file_date - timedelta(days=7)
        new_data['WEEK'] = '{:%Y-%m-%d}'.format(data_date)
    else:
        new_data = pd.read_csv(data_path)
        new_data['WEEK'] = '{:%Y-%m-%d}'.format(datetime.strptime(data_path[-10:-4], '%y%m%d'))
    new_data = new_data.drop(columns=[column for column in new_data.columns.tolist() if column.isspace()])
    new_data.columns = [column.strip() for column in new_data.columns.tolist()]
    new_data['STATION'] = new_data['STATION'].apply(lambda x: x.strip())
    if new_data['WEEK'].unique()[0] not in df['WEEK'].unique().tolist():
        df = df.append(new_data)
        df.index = range(len(df))
        added = True
        print('New data added.')
    else:
        added = False
        print('Data already in existing data frame. No new data added.')
    return df, added


In [4]:
def add_data(df, data_path):
    """
    Parameters
    ----------
    df : str or pandas.DataFrame
        Path to the existing main data file, or existing pandas.DataFrame.
    data_dir : str
        Path to the new data file.

    Returns
    -------
    df : pandas.DataFrame
        Updated data.
    df : boolean
        Signal whether the new data is successfully added.    
    """
    if isinstance(df, str) :
        df = pd.read_csv(df)
    if df is None:
        df = pd.DataFrame()    
    if 'web.mta.info' in data_path:
        new_data = pd.read_csv(data_path, skiprows=2, index_col=False)
        file_date = datetime.strptime(data_path[-10:-4], '%y%m%d')
        data_date = file_date - timedelta(days=7)
        new_data['WEEK'] = '{:%Y-%m-%d}'.format(data_date)
    else:
        new_data = pd.read_csv(data_path)
        new_data['WEEK'] = '{:%Y-%m-%d}'.format(datetime.strptime(data_path[-10:-4], '%y%m%d'))
    new_data = new_data.drop(columns=[column for column in new_data.columns.tolist() if column.isspace()])
    new_data.columns = [column.strip() for column in new_data.columns.tolist()]
    new_data['STATION'] = new_data['STATION'].apply(lambda x: x.strip())
    if new_data['WEEK'].unique()[0] not in df['WEEK'].unique().tolist():
        df = df.append(new_data)
        df.index = range(len(df))
        added = True
        print('New data added.')
    else:
        added = False
        print('Data already in existing data frame. No new data added.')
    return df, added

def combine_all(load_dir):
    """
    Parameters
    ----------
    load_dir : str
        Directory storing all data files.

    Returns
    -------
    df : pandas.DataFrame
        Combined data frame.
    """
    file_names = os.listdir(load_dir)
    for idx, file in tqdm(enumerate(file_names)):
        data_dir = os.path.join(load_dir, file)
        if idx == 0:
            df = add_data(None, data_dir)
        else:
            df = add_data(df, data_dir)
    return df

def read_data(df_file='main.csv', files_dir='data', save_df='main.csv'):
    """
    Parameters
    ----------
    df_file : str, optional
        Path to the main data frame file. The default is 'main.csv'.
    files_dir : str, optional
        Directory to the files. The default is 'data'.
    save_df : str, optional
        Path to where the data to save. The default is 'main.csv'.

    Returns
    -------
    df : pandas.DataFrame
        DESCRIPTION.
    """
    if os.path.exists(df_file):
        df = pd.read_csv(df_file)
    else:
        df = combine_all(files_dir)
    if save_df is not None:
        df.to_csv(save_df, index=False)
        print('Saving main data frame as', save_df+'.')
    return df

In [5]:
df = pd.read_csv('../input/mta-subway-fare-data/main.csv')
df

Unnamed: 0,WEEK,REMOTE,STATION,FF,SEN/DIS,7-D AFAS UNL,30-D AFAS/RMF UNL,JOINT RR TKT,7-D UNL,30-D UNL,...,AIRTRAIN 30-D,AIRTRAIN 10-T,AIRTRAIN MTHLY,STUDENTS,NICE 2-T,CUNY-120,CUNY-60,FF VALUE,FF 7-DAY,FF 30-DAY
0,2019-01-05,R001,WHITEHALL STREET,70771,2408,411,934,198,41443,24743,...,0,0,0,2092,0,31,0,,,
1,2019-01-05,R003,CYPRESS HILLS,2463,167,9,44,0,1864,874,...,0,0,0,467,0,1,0,,,
2,2019-01-05,R004,75TH STREET & ELDERTS LANE,6121,353,46,131,1,3991,2845,...,0,0,0,1687,0,11,0,,,
3,2019-01-05,R005,85TH STREET & FOREST PKWAY,6927,488,57,190,0,3567,3244,...,0,0,0,900,0,14,0,,,
4,2019-01-05,R006,WOODHAVEN BOULEVARD,6652,421,74,153,0,5431,3696,...,0,0,0,1182,0,13,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98192,2022-11-26,R571,86TH STREET - 2 AVENUE,16012,3481,158,306,3,4798,8205,...,0,0,0,1784,0,65,0,631.0,208.0,299.0
98193,2022-11-26,R572,96TH STREET - 2 AVENUE,10725,2293,135,484,1,4050,6526,...,0,0,0,2648,0,138,0,1094.0,300.0,561.0
98194,2022-11-26,R573,SBS-Q52/53 @ LIVINGSTON PLAZA,0,0,0,0,0,0,0,...,0,0,0,0,0,535,0,0.0,0.0,0.0
98195,2022-11-26,R574,SBS-B82 @ LIVINGSTON PLAZA,0,0,0,0,0,0,0,...,0,0,0,0,0,272,0,0.0,0.0,0.0


In [6]:
last_date = datetime.strptime(df['WEEK'].max(), '%Y-%m-%d')
new_date = last_date + timedelta(days=7)
file_date = new_date + timedelta(days=7)
file_name = 'fares_{:%y%m%d}.csv'.format(file_date)
new_data_url = 'http://web.mta.info/developers/data/nyct/fares/' + file_name
print('Last date: ', last_date)
print('New date: ', new_date)
print('File date: ', file_date)
print('File name: ', file_name)
print('URL: ', new_data_url)

Last date:  2022-11-26 00:00:00
New date:  2022-12-03 00:00:00
File date:  2022-12-10 00:00:00
File name:  fares_221210.csv
URL:  http://web.mta.info/developers/data/nyct/fares/fares_221210.csv


In [7]:
try:
    df, new_data_added = add_data(df, new_data_url)
except:
    print('Unexpected error:', sys.exc_info()[0])
    print('The data link generated is: ', new_data_url)
    new_data_added = False

Unexpected error: <class 'urllib.error.HTTPError'>
The data link generated is:  http://web.mta.info/developers/data/nyct/fares/fares_221210.csv


In [8]:
geo_df = pd.read_csv('../input/mta-subway-fare-data/station_gis.csv')
geo_df.to_csv('station_gis.csv', index=False)
df.to_csv('main.csv', index=False)

In [9]:
start_date = '2020-06-01'
mask = (df.WEEK > start_date) & (df.STATION.isin(geo_df.STATION.unique().tolist()))
df = df[mask].drop(columns=['REMOTE', '14-D RFM UNL', '1-D UNL', '14-D UNL'])
card_types = [itm for itm in df.columns if itm not in ['WEEK', 'STATION']]
df['y'] = df[card_types].sum(axis=1) / 7

In [10]:
df = df.drop(columns=card_types).rename(columns={'WEEK':'ds', 'STATION':'unique_id'})
df.ds = pd.to_datetime(df.ds)
df = df[['unique_id', 'ds', 'y']]
df = df.sort_values(['unique_id', 'ds']).reset_index(drop=True)

In [11]:
df

Unnamed: 0,unique_id,ds,y
0,103RD ST-CENTRAL PARK WEST,2020-06-06,487.142857
1,103RD ST-CENTRAL PARK WEST,2020-06-13,589.714286
2,103RD ST-CENTRAL PARK WEST,2020-06-20,654.285714
3,103RD ST-CENTRAL PARK WEST,2020-06-27,720.285714
4,103RD ST-CENTRAL PARK WEST,2020-07-04,742.142857
...,...,...,...
60885,ZEREGA AVE-WESTCHESTER AVE,2022-10-29,1028.571429
60886,ZEREGA AVE-WESTCHESTER AVE,2022-11-05,1042.142857
60887,ZEREGA AVE-WESTCHESTER AVE,2022-11-12,973.285714
60888,ZEREGA AVE-WESTCHESTER AVE,2022-11-19,1039.000000


In [12]:
models = [
    AutoARIMA(season_length=52),
]

sf = StatsForecast(
    df=df, 
    models=models,
    freq='W', 
    n_jobs=-1
)

In [13]:
%%time
forecasts_df = sf.forecast(h=52, level=[90, 95])

CPU times: user 3.45 s, sys: 1.35 s, total: 4.8 s
Wall time: 1h 39s


In [14]:
forecasts_df['interval-95-square'] = (forecasts_df['AutoARIMA-hi-95'] - forecasts_df['AutoARIMA']) ** 2
forecasts_df['interval-90-square'] = (forecasts_df['AutoARIMA-hi-90'] - forecasts_df['AutoARIMA']) ** 2
forecasts_df = forecasts_df.rename(columns={'ds':'WEEK'})
forecasts_df.index.name = 'STATION'
forecasts_df.to_csv('forecast.csv')

In [15]:
forecasts_df

Unnamed: 0_level_0,WEEK,AutoARIMA,AutoARIMA-lo-95,AutoARIMA-lo-90,AutoARIMA-hi-90,AutoARIMA-hi-95,interval-95-square,interval-90-square
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
103RD ST-CENTRAL PARK WEST,2022-11-27,1695.693115,1430.199585,1472.883911,1918.502319,1961.186646,70486.812500,49643.941406
103RD ST-CENTRAL PARK WEST,2022-12-04,1800.639771,1471.448853,1524.374146,2076.905518,2129.830811,108366.742188,76322.765625
103RD ST-CENTRAL PARK WEST,2022-12-11,1706.251221,1335.597290,1395.188721,2017.313721,2076.905029,137384.250000,96759.875000
103RD ST-CENTRAL PARK WEST,2022-12-18,1276.234009,871.713745,936.749878,1615.718018,1680.754150,163636.546875,115249.390625
103RD ST-CENTRAL PARK WEST,2022-12-25,1024.534912,589.824341,659.714294,1389.355469,1459.245361,188973.171875,133094.031250
...,...,...,...,...,...,...,...,...
ZEREGA AVE-WESTCHESTER AVE,2023-10-22,928.413086,353.375702,445.826508,1410.999634,1503.450439,330667.968750,232889.781250
ZEREGA AVE-WESTCHESTER AVE,2023-10-29,941.984497,361.799805,455.078156,1428.890869,1522.169189,336614.281250,237077.812500
ZEREGA AVE-WESTCHESTER AVE,2023-11-05,873.127380,287.840637,381.939270,1364.315552,1458.414185,342560.656250,241265.812500
ZEREGA AVE-WESTCHESTER AVE,2023-11-12,938.841675,348.496979,443.408783,1434.274536,1529.186401,348506.906250,245453.718750
