# Getting and cleaning the data

## Step 1: Import Libraries
We'll use the following libraries in this notebook:

In [3]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from scipy.ndimage.interpolation import shift
from collections import defaultdict

## Step 2: Getting the right data 
We pulled in weekly turnstile data from the MTA portal: http://web.mta.info/developers/turnstile.html

First, we create a list of the weeks we're interested in fetching data for

In [8]:
# Define list of weeks we want to pull from the MTA portal

def datelist(startdate):
    """
    For a given Saturday, make a list of dates for the 14 previous Saturdays
    """
    week_list = [startdate + ((timedelta(days=-7))*i) for i in range(14)]
    clean_weeks = [i.strftime('%y%m%d') for i in week_list]
    return clean_weeks


# Define the last Saturday we're interested in for 2016 and 2017
start17 = datetime(2017, 7, 1)
start16 = datetime(2016, 7, 2)

# We'll import data for the 14 weeks preceeding July 1st for both 2016 and 2017
weeks_to_import = datelist(start17) + datelist(start16)

We then iterate through our list of dates to pull weekly files from the MTA portal

In [11]:
def loadturndata(date):
    # Build the filename
    strdate = str(date)
    filename = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_'+strdate+'.txt'

    # Read in the csv
    df = pd.read_csv(filename)
    return df


def loadturnlist(dates):
    """
    We'll iterarte through the list of weeks to create dataframes using loadturndata and then concat together into one dataframe 
    """
    data = pd.DataFrame()
    x = []
    for i in dates:
        df = (loadturndata(i))
        x.append(df)
    data = pd.concat(x)
    return data

In [17]:
# Note: This takes a few minutes to run - go treat yourself to a cup of tea!
raw = loadturnlist(weeks_to_import)

In [37]:
# Pickle the raw data in case things go south in the cleaning process and you need to start over from here
raw.to_pickle('raw_turnstile_data.pkl')

## Step 3: Cleaning the data
All the fun stuff (jk!)

Rename columns and add datetime columns

In [50]:
# Rename columns
df = raw.rename(columns=lambda x: x.strip().lower())

# Concat date and time and convert to datetime object
df['datetime'] = df['date'] + ' ' + df['time']
df['datetime_clean'] = [datetime.strptime(x, '%m/%d/%Y %H:%M:%S') for x in df['datetime']]

In [51]:
# Add some helpful date-part columns
df['year'] = [x.year for x in df['datetime_clean']]
df['weekday'] = df[['datetime_clean']].apply(lambda x: datetime.strftime(x['datetime_clean'], '%A'), axis=1)

Create group ID for partioning off counts for distinct turnstiles when ordered by datetime

In [52]:
# Create group ID for distinct turnstiles
df['group'] = df['c/a'].astype(str) + \
                df['unit'].astype(str) + \
                df['scp'].astype(str) + \
                df['station'].astype(str)  + \
                df['linename'].astype(str) + \
                df['division'].astype(str) + \
                df['year'].astype(str)
                
# Map 'group' string to integer id     
groups = set(df['group'])


def groups_dict(groups):
    group_dict = defaultdict(int)
    for i in enumerate(list(groups)):
        group_dict[i[1]]= i[0]

    return group_dict

group_id_dict = groups_dict(groups)

df['group_id'] = [group_id_dict[x] for x in df['group']]

Create station ID for later grouping on distinct stations

In [53]:
# Create station ID for distinct stations
df['station_id'] = df['station'].astype(str) + \
                df['linename'].astype(str) + \
                df['division'].astype(str)

Sort values in dataframe by group id and datatime to find diff in counts from prev row

In [54]:
# Sort values by group id and date to find diff in turnstile counts from prev row
df.sort_values(['group_id','datetime_clean'], inplace=True)
df.reset_index(drop=True)

def find_diff_prev_row(df_series_col):
    col_array = np.array(df_series_col)
    col_array_shifted = shift(col_array, 1, cval=np.NaN)
    col_diff = abs(col_array - col_array_shifted)

    return col_diff


df['entries_diff'] = find_diff_prev_row(df['entries'])
df['exit_diff'] = find_diff_prev_row(df['exits'])

Set invalid diff values to nan (first row of turnstile partitions and negative values from reboots)

In [57]:
# Identify first rows for each group partition to use as mask when setting invalid values to nan
def find_first_rows_groups(df_series_col):
    col_array = np.array(df_series_col)
    col_array_shifted = shift(col_array, 1, cval=np.NaN)
    first_row_mask = col_array != col_array_shifted

    return first_row_mask


df['first_row_group'] = find_first_rows_groups(df['group_id'])

# Make entries_diff and exit_diff nan when first row in group or negative value
df.loc[df['first_row_group'], 'entries_diff'] = None
df.loc[df['entries_diff'] < 0, 'entries_diff'] = None

df.loc[df['first_row_group'], 'exit_diff'] = None
df.loc[df['exit_diff'] < 0, 'exit_diff'] = None

In [58]:
df.head()

Unnamed: 0,c/a,unit,scp,station,linename,division,date,time,desc,entries,...,datetime_clean,year,weekday,group,group_id,station_id,entries_diff,exit_diff,first_row_group,entires_diff
114963,PTH07,R550,00-02-01,CITY / BUS,1,PTH,03/25/2017,02:58:33,REGULAR,22298,...,2017-03-25 02:58:33,2017,Saturday,PTH07R55000-02-01CITY / BUS1PTH2017,0,CITY / BUS1PTH,,,True,
114964,PTH07,R550,00-02-01,CITY / BUS,1,PTH,03/25/2017,07:10:33,REGULAR,22298,...,2017-03-25 07:10:33,2017,Saturday,PTH07R55000-02-01CITY / BUS1PTH2017,0,CITY / BUS1PTH,0.0,0.0,False,
114965,PTH07,R550,00-02-01,CITY / BUS,1,PTH,03/25/2017,11:22:33,REGULAR,22298,...,2017-03-25 11:22:33,2017,Saturday,PTH07R55000-02-01CITY / BUS1PTH2017,0,CITY / BUS1PTH,0.0,0.0,False,
114966,PTH07,R550,00-02-01,CITY / BUS,1,PTH,03/25/2017,15:34:33,REGULAR,22298,...,2017-03-25 15:34:33,2017,Saturday,PTH07R55000-02-01CITY / BUS1PTH2017,0,CITY / BUS1PTH,0.0,3.0,False,
114967,PTH07,R550,00-02-01,CITY / BUS,1,PTH,03/25/2017,19:46:33,REGULAR,22298,...,2017-03-25 19:46:33,2017,Saturday,PTH07R55000-02-01CITY / BUS1PTH2017,0,CITY / BUS1PTH,0.0,1.0,False,


Pickle dataframe for further exploration!! 

In [None]:
df.to_pickle('cleaned_turnstile_data.pkl')