# Cleaning and Merging Both Tables

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime, math

In [None]:
%%capture
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='warn'` is the
# equivalent of the old `error_bad_lines=False` (skip malformed rows, emit a warning).
mta = pd.read_csv('../../data/mta_1708.csv', on_bad_lines='warn')

# Keep only northbound M100 pings (destination Inwood 220 St).
is_m100 = mta['PublishedLineName'] == 'M100'
to_inwood = mta['DestinationName'] == 'INWOOD 220 ST via AMSTERDAM via BWAY'
m100 = mta.loc[is_m100 & to_inwood]

In [None]:
# Rows where the bus is reported as being at a stop.
at_stop = m100['ArrivalProximityText'].eq('at stop')
arrivals = m100.loc[at_stop]
arrivals.head(n=20)

In [None]:
# Narrow to timestamp + vehicle id, ordered by vehicle and then by time.
ping_columns = ['RecordedAtTime', 'VehicleRef']
sort_keys = ['VehicleRef', 'RecordedAtTime']
new = arrivals.loc[:, ping_columns].sort_values(by=sort_keys, ascending=True)
uniqueRef = new['VehicleRef'].unique()
# Parse the timestamp strings so they can be subtracted later.
new['RecordedAtTime'] = pd.to_datetime(new['RecordedAtTime'])

## Computing time deltas only within the same vehicle ref
Reference: https://stackoverflow.com/questions/20648346/computing-diffs-within-groups-of-a-dataframe


In [None]:
# Order each vehicle's pings chronologically so consecutive rows are consecutive pings.
new = new.sort_values(['VehicleRef', 'RecordedAtTime'])
# Per-vehicle difference: the first ping of every vehicle gets NaT instead of a
# delta against the previous vehicle's last ping. This replaces the chained
# assignment `new['diffs'][mask] = np.nan`, which triggers SettingWithCopyWarning
# and does not write through under copy-on-write.
new['diffs'] = new.groupby('VehicleRef')['RecordedAtTime'].diff()
new

In [None]:
# dropna() returns a new frame; the original call discarded its result.
new = new.dropna(subset=['diffs'])
# Keep only gaps longer than two hours, compared against an explicit Timedelta
# rather than a string that pandas has to parse implicitly.
new = new[new['diffs'] > pd.Timedelta(hours=2)]
new

mta.describe()

# Saving our Progress/Merging Weather Data

In [None]:
%%capture
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='warn'` keeps the
# old behaviour of skipping malformed rows with a warning.
weather = pd.read_csv('../../data/1401011_weather_data.csv', on_bad_lines='warn')

In [None]:
# weather.tail()

In [None]:
%%capture
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='warn'` keeps the
# old behaviour of skipping malformed rows with a warning.
# NOTE: this rebinds `m100` from the filtered MTA pings to the bus-boarding table.
m100 = pd.read_csv('../../data/busBoarding.csv', on_bad_lines='warn')

In [None]:
# Peek at the last five rows of the bus-boarding table.
m100.tail(n=5)

In [None]:
# Keep only the columns used downstream. The explicit copy makes this an
# independent frame, so later column writes do not raise SettingWithCopyWarning.
m100 = m100[['passengerArrivalTime', 'numPassengersPerBus', 'BusDepartureTime']].copy()

In [None]:
# Peek at the first five rows after the column selection.
m100.head(n=5)

## Data Cleaning
Credit: Angelika

In [None]:
# Work on an independent copy of the four columns we need, so the cleaning
# steps below can assign into it without SettingWithCopyWarning.
newWeather = weather[['DATE', 'HOURLYVISIBILITY', 'HOURLYWindSpeed', 'HOURLYPrecip']].copy()

In [None]:
# Drop incomplete rows, then drop rows whose precipitation text contains a
# "T" or an "s" (presumably trace / suspect flags -- confirm against the data dictionary).
newWeather = newWeather.dropna()
for flag in ("T", "s"):
    newWeather = newWeather[~newWeather['HOURLYPrecip'].str.contains(flag)]

In [None]:
# Remove visibility readings that contain a "V", then convert the column to numbers.
has_v_flag = newWeather['HOURLYVISIBILITY'].str.contains("V")
newWeather = newWeather[~has_v_flag]
newWeather['HOURLYVISIBILITY'] = pd.to_numeric(newWeather['HOURLYVISIBILITY'])

This function combines the bus and weather tables into one by matching rows on the calendar date and the hour of the day. We'll call it after we've cleaned up the weather table.

In [None]:
def mergeOnDateTime(bus, weather):
    """Join bus rows to the weather observations taken in the same calendar hour.

    Parameters
    ----------
    bus : pandas.DataFrame
        Bus table; must contain a ``BusDepartureTime`` column that
        ``pd.to_datetime`` can parse.
    weather : pandas.DataFrame
        Weather table; must contain a ``DATE`` column that ``pd.to_datetime``
        can parse.

    Returns
    -------
    pandas.DataFrame
        Inner join of the two tables on the ``YYYY-MM-DD HH`` prefix of their
        timestamps. Contains every bus column (``BusDepartureTime`` as
        datetimes) followed by every weather column except ``DATE``. A bus row
        appears once per weather observation in its hour; bus rows with no
        weather observation in their hour are dropped.

    Notes
    -----
    The inputs are not modified: the parsed timestamps and the temporary join
    key live only on internal copies.
    """
    hour_format = '%Y-%m-%d %H'

    bus_times = pd.to_datetime(bus['BusDepartureTime'])
    weather_times = pd.to_datetime(weather['DATE'])

    # `assign` returns new frames, so the caller's tables keep their original
    # columns and dtypes.
    bus_keyed = bus.assign(
        BusDepartureTime=bus_times,
        stringTime=bus_times.dt.strftime(hour_format),
    )
    weather_keyed = weather.assign(
        DATE=weather_times,
        stringTime=weather_times.dt.strftime(hour_format),
    )

    merged = pd.merge(left=bus_keyed, right=weather_keyed, how='inner', on=['stringTime'])

    # The join key and the weather timestamp are not needed after the merge.
    return merged.drop(columns=['stringTime', 'DATE'])

In [None]:
# Time gate to August 2017.
# DATE is still text at this point, so parse it first: comparing raw strings is
# lexicographic and only works if every value is in ISO format.
newWeather = newWeather.assign(DATE=pd.to_datetime(newWeather['DATE']))
in_august = (newWeather['DATE'] >= '2017-08-01') & (newWeather['DATE'] < '2017-09-01')
newWeather = newWeather[in_august].reset_index().dropna()

We'll coerce the precipitation and visibility columns to floats.

In [None]:

# Fix some data types: anything that cannot be parsed becomes NaN.
newWeather['HOURLYPrecip'] = pd.to_numeric(newWeather['HOURLYPrecip'], downcast='float', errors='coerce')
newWeather['HOURLYVISIBILITY'] = pd.to_numeric(newWeather['HOURLYVISIBILITY'], downcast='float', errors='coerce')
# Drop the helper 'index' column left by an earlier reset_index() (if present,
# so the cell can be re-run) and renumber the rows. The original bare
# `newWeather.reset_index()` discarded its result and therefore did nothing.
newWeather = newWeather.drop(columns=['index'], errors='ignore').reset_index(drop=True)


In [None]:
# Only the last expression of a cell is rendered, so print the dtypes explicitly.
print(newWeather.dtypes)
newWeather.shape

In [None]:
# Only the last expression of a cell is rendered, so print the dtypes explicitly.
print(m100.dtypes)
m100.shape

Using the function above, we merge both the tables into one

In [None]:
# Attach the hourly weather readings to every bus-boarding row.
df = mergeOnDateTime(bus=m100, weather=newWeather)

In [None]:
# Only the last expression of a cell is rendered; print the dtypes so both the
# schema and the preview are visible.
print(df.dtypes)
df.head()

In [None]:
# Put the merged rows in chronological order of bus departure.
df = df.sort_values(by='BusDepartureTime')

Redoing time deltas

In [None]:
# Preview the first five rows of the sorted, merged table.
df.head(n=5)

In [None]:
# The merged table is built from the three bus-boarding columns plus weather
# columns, so it normally has no 'RecordedAtTime' and the original lookup raised
# KeyError. Use it if present, otherwise fall back to the departure timestamp
# the frame was just sorted by.
# NOTE(review): confirm 'BusDepartureTime' is the intended basis for the delta.
time_column = 'RecordedAtTime' if 'RecordedAtTime' in df.columns else 'BusDepartureTime'
event_times = pd.to_datetime(df[time_column])

# Minutes from each row's timestamp to the next row's timestamp (NaN on the last row).
df['timeTillNext'] = event_times.diff().shift(-1).dt.total_seconds() / 60

In [None]:
%%capture

# Remove the stray index column a CSV round-trip leaves behind, if it is there.
# errors='ignore' keeps the cell from raising KeyError when the column is absent.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# reset_index() returns a new frame; rebind it so the renumbering takes effect.
# drop=True discards the old index instead of adding it as an 'index' column.
df = df.reset_index(drop=True)


In [None]:
%%capture
# Drop a leftover 'index' column if one exists; errors='ignore' avoids a
# KeyError when it does not.
df = df.drop(columns=['index'], errors='ignore')

In [None]:
# Drop a stale 'timeDelta' column if one exists (errors='ignore' avoids a
# KeyError when it does not). The original follow-up line,
# `df.timeDelta.dropna(how='any')`, read the column that had just been removed
# and discarded its result, so it has been removed.
df = df.drop(columns=['timeDelta'], errors='ignore')

In [None]:
# drop_duplicates() returns a new frame; rebind it so duplicates are actually removed.
df = df.drop_duplicates()
df.shape

In [None]:
# Split the passenger arrival timestamp into hour / second / minute features.
arrival_parts = pd.to_datetime(df['passengerArrivalTime']).dt
df['ArrivalHour'] = arrival_parts.hour
df['ArrivalSeconds'] = arrival_parts.second
df['ArrivalMinutes'] = arrival_parts.minute


In [None]:
# Preview the last five rows of the finished table.
df.tail(n=5)

Now we can see that each row has a corresponding `timeTillNext` value: the time, in minutes, until the next bus arrival.

### Saving the Merged Data

In [None]:
# Write the merged bus + weather table to disk. The row index is written too,
# so it comes back as an unnamed first column when the file is re-read.
df.to_csv(path_or_buf='../../data/Merged_Bus_Weather.csv')