## Instructions: ##
  
1) Place this file at the root of the folder you started jupyter notebook in.  
2) Create a "data/" folder containing all turnstile data you want to parse.  

_Example Structure:_  
~~~~
/MTA_data_parser.ipynb  
/data  
/data/turnstile_180922.txt
~~~~

In [45]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Put filepath here
fp = 'data/turnstile_180922.txt'
df = pd.read_csv(fp)

In [46]:
# Use to do preliminary inspection of data

def data_inspect(df):
    print("Summarize NAs:")
    print(df.isna().sum())
    
    u_station = df['STATION'].unique()
    print("\nNum Unique Stations:", len(u_station))
    print(u_station)
    
    u_desc = df['DESC'].unique()
    print("\n Unique Descriptions")
    print(u_desc)
    
    print("\nUnique Time Vals per station")
    for s in u_station:
        u_time = df[df['STATION'] == s]['TIME'].unique()
        print('Station:', s)
        print(u_time, '\n')
        
# data_inspect(df)

In [78]:
# Organize data before using diff() to find absolute diff in entries/exits.
# Zero out counter rollovers (discard)
# Aggregate times to ensure 4 hour interval in-between

def clean_data(df):
    u_station = df['STATION'].unique()
    
    group_order = ['C/A', 'STATION', 'UNIT', 'SCP', 'DATE', 'TIME']
    # Sort by Control Area (station), Unit (Remote unit), SCP (turnstile), Date, Time
    df = df.sort_values(by=group_order)
    # Substract next row by prev row for magnitude of change
    df['DIFF'] = df['ENTRIES'].diff()
    
    # Set time entries less than 04:00:00 to 0
    # Omit rollover entries
    print('Rollover entries ommitted:', len(df[df['DIFF'] <0]))
    df.loc[df['DIFF'] < 0, 'DIFF'] = 0
    
#     for s in u_station:
#         group_station = df[df['STATION'] == s]
#         group_station = group_station.groupby(group_order).sum()
#         print(group_station['DIFF'])
    
clean_data(df)

Rollover entries ommitted: 4028
