In [1]:
import pandas as pd
import pickle as pkl
import datetime as dt
pd.set_option('display.max_columns',None)

In [2]:
with open('model_data.pkl', 'rb') as f:
    df = pkl.load(f)

In [3]:
df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)

In [4]:
delayed_2 = df.loc[df['subject'].str.contains('2 ')]
indices = delayed_2.index

In [5]:
df.insert(loc=0, column='delayed_2',value=0)
df.loc[indices, 'delayed_2'] = 1

In [6]:
causes = ['track', 'customer','passenger','mechanical','maintenance','signal','switch','brake','struck', 'light', 'station',
          'weather','door','police','nypd','fire','fdny','emt','ems','investigation','remov','smoke','medical', 'flood','water']
direction = ['northbound','soundbound','both direction','nb','sb','bd','bothdirections','both way','bothway','n /','n/b',
            'n/ b','s /','s/b','s/ b','b/d','b / d','n / b','s / b','b/ d','b /d','b/ ','manhattan bound', 'brooklyn bound',
            'queens bound', 'bronx bound','manh bound','bk bound','bx bound','qns bound']

In [7]:
df['causes'] = df['message'].str.extract('({0})'.format('|'.join(causes)))

In [8]:
causes = pd.get_dummies(df['causes'])

In [9]:
def become_one(one):
    if one >= 1:
        return 1
    else:
        return 0

In [10]:
# fire/smoke or weather related conditions
causes['causes_conditions'] = (causes['fire'] + causes['smoke'] + causes['fdny'] + causes['weather'] + causes['flood'] + causes['water'])
causes['causes_conditions'].apply(become_one)

# unruly passengers or medical emergency
causes['causes_passenger'] = (causes['nypd'] + causes['investigation'] + causes['police'] + causes['ems']
                      + causes['medical'] + causes['passenger'] + causes['customer'] + causes['struck'])
causes['causes_passenger'].apply(become_one)

# issues at the station including switches, signals, tracks, people/trains/debris needing to be removed from the track
causes['causes_station'] = (causes['signal'] + causes['switch'] + causes['track'] + causes['remov'] + causes['maintenance']
                           + causes['station'])
causes['causes_station'].apply(become_one)

causes['causes_train'] = (causes['brake'] + causes['light'] + causes['door'] + causes['mechanical'])
causes['causes_train'].apply(become_one)

0        1
1        1
2        1
3        1
4        0
        ..
67181    0
67182    0
67183    0
67184    0
67185    0
Name: causes_train, Length: 67186, dtype: int64

In [11]:
causes

Unnamed: 0,brake,customer,door,ems,fdny,fire,flood,investigation,light,maintenance,mechanical,medical,nypd,passenger,police,remov,signal,smoke,station,struck,switch,track,water,weather,causes_conditions,causes_passenger,causes_station,causes_train
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67181,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
67182,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67183,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
67184,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [12]:
causes.drop(['smoke','fdny','nypd','investigation','police','ems','medical', 'struck', 'door','track','flood', 'water',
            'customer','switch','remov','signal','brake','light','weather', 'maintenance','passenger','fire',
            'mechanical','station'],axis=1,inplace=True)

In [13]:
causes.reset_index(inplace=True)
causes.drop('index',axis=1,inplace=True)

In [14]:
df.reset_index(inplace=True)

In [15]:
df.drop('index',axis=1,inplace=True)

In [16]:
df = df.merge(causes, how='left', left_index=True,right_index=True)

In [17]:
df['direction'] = df['message'].str.extract('({0})'.format('|'.join(direction)))

In [18]:
direction = pd.get_dummies(df['direction'])

In [19]:
# including bronx since it's by default north-bound
direction['direction_northbound'] = direction['northbound'] + direction['n /'] + direction['nb'] + direction['bx bound']
direction['direction_northbound'].apply(become_one)

# including brooklyn since it's by default south-bound
direction['direction_southbound'] = direction['sb'] + direction['s /'] + direction['bk bound'] + direction['brooklyn bound']
direction['direction_southbound'].apply(become_one)

# both directions
direction['direction_both'] = direction['b / d'] + direction['both direction'] + direction['bd']
direction['direction_both'].apply(become_one)

# manhattan will be unique since it's either direction (of course it is -_-)
direction['direction_manhattan'] = direction['manhattan bound']
direction['direction_manhattan'].apply(become_one)

0        0
1        0
2        0
3        0
4        0
        ..
67181    0
67182    0
67183    0
67184    0
67185    0
Name: direction_manhattan, Length: 67186, dtype: int64

In [20]:
direction.drop(['n /', 'nb','bx bound','b / d', 'bd','bk bound','sb','s /','brooklyn bound',
                'northbound','both direction','manhattan bound'],axis=1,inplace=True)

In [21]:
df = df.merge(direction, how='left', left_index=True,right_index=True)

In [22]:
df.drop(['causes','direction'],axis=1,inplace=True)

Fixing column names.. then we model!!!!

In [23]:
st_7 = df.loc[df['subject'].str.contains('7 ')]
st_123 = df.loc[df['subject'].str.contains('1 |2 |3 ')]

In [24]:
st_123_st = st_123[['34 st','33 st']]

In [25]:
train_123_st = st_123_st.loc[st_123_st['34 st'] == 1]
indices = train_123_st.index
train_123_st

Unnamed: 0,34 st,33 st
54,1,0
178,1,0
179,1,0
292,1,0
293,1,0
...,...,...
58484,1,0
58485,1,0
58486,1,0
60191,1,0


In [26]:
df.insert(loc=0, column='34 st-penn st (1/2/3)',value=0)
df.loc[indices, '34 st-penn st (1/2/3)'] = 1

In [27]:
st_7_st = st_7[['34 st', '33 st']]

In [28]:
train_7_st = st_7_st.loc[st_7_st['34 st'] == 1]
indices = train_7_st.index
train_7_st

Unnamed: 0,34 st,33 st
20280,1,0
22922,1,0
22923,1,0
22934,1,0
22935,1,0
...,...,...
53183,1,0
53184,1,0
53185,1,0
53336,1,0


In [29]:
df.insert(loc=0, column='34 st-hudson yds (7)',value=0)
df.loc[indices, '34 st-hudson yds (7)'] = 1

In [30]:
df['van cortlandt park (1)'] = df['van cortlandt'] + df['242 st']
df['van cortlandt park (1)'].apply(become_one)

df['33 st-rawson st (7)'] = df['33 st'] + df['33 st']
df['33 st-rawson st (7)'].apply(become_one)



0        0
1        0
2        0
3        0
4        0
        ..
67181    0
67182    0
67183    0
67184    0
67185    0
Name: 33 st-rawson st (7), Length: 67186, dtype: int64

In [31]:
df.rename(columns={'111 st':'111 st (7)','junction b':'junction blvd (7)','mets':'mets willis pt (7)',
                  '90 st':'90 st (7)', '82 st':'82 st (7)', '74 st':'74 st (7)', '69 st': '69 st (7)',
                  '61 st': '61 st (7)', '52 st': '52 st (7)', '46 st': '46 st (7)', '40 st': '40 st (7)',
                   'qnsboro pl': 'qnsboro plza (7)', 'court sq': 'court sq (7)',
                  'hunters p': 'hunters pt (7)','grand central':'42 st-grand central (4/5/6/7)',
                   'times sq':'42 st-times sq (2/3/4/5/6/7)','238 st':'238 st (1)', '231 st': '231 st (1)',
                   '225 st': 'marble hill-225 st (1)','215 st':'215 st (1)', '207 st': '207 st (1)', 'dyckman st':'dyckman st (1)',
                  '191 st':'191 st (1)','181 st':'181 st (1)', '168 st': '168 st (1)','157 st':'157 st (1)',
                  '145 st':'145 st (1)','137 st':'137 st (1)','125 st':'125 st (1)','116 st':'116 st-columbia (1)',
                  'cathedral p':'110 st-cathedral pkwy (1)', '18 st':'18 st (1)',
                  '14 st':'14 st-7 av (1/2/3)','union sq':'14 st-union sq (4/5/6)','christopher st':'christopher st (1)',
                  'houston st':'houston st (1)', 'franklin st':'franklin st (1)', 'chambers st':'chambers st (1/2/3)',
                  'wtc':'wtc cortlandt (1)', 'rector st':'rector st (1)', 'ferry':'south ferry (1)','241 st':'wakefield-241 st (2/5)',
                  'nereid av':'nereid av (2/5)', '233 st':'233 st (2/5)', '225 st': '225 st (2/5)', '219 st': '219 st (2/5)',
                  'burke av': 'burke av (2/5)', 'allerton av':'allerton av (2/5)', 'pelham p':'pelham pkwy (2/5)',
                  'bx park east':'bronx park east (2/5)', 'e 180':'e 180 st (2/5)', 'west farms':'west farm sq (2/5)',
                  '174 st':'174 st (2/5)','freeman st':'freeman st (2/5)','simpson st':'simpson st (2/5)',
                  'intervale av':'interval av (2/5)', 'prospect av':'prospect av (2/5)', 'jackson av':'jackson av (2/5)',
                  '110 st':'central park n-110 st (2/3)','135 st':'135 st (2/3)','138 st-grand concourse':'138 st-grand concourse (4/5)',
                  '148 st':'Harlem-148 st (3)','149 st-grand concourse':'149 st-grand concourse (2/4/5)','161 st':'161 st-yankee (4)',
                  '167 st':'167 st (4)','170 st':'170 st (4)','176 st':'176 st (4)','183 st':'183 st (4)','50 st':'50 st (1)',
                   '51 st':'51 st (6)', '59 st':'59 st (6)','66 st':'66 st-lincoln ctr (1)','68 st':'68 st-hunter clg (6)',
                   '77 st':'77 st (6)','79 st':'79 st (1)','astor':'astor pl (6)','atlantic av':'atlantic av-barclays ctr (2/3/4/5)',
                   'baychester av':'baychester av (5)','bedford p':'bedford pk (4)','bergen st':'bergen st (2/3)','beverly r':'beverly rd (2/5)',
                   'bleeker st':'bleeker st (6)', 'borough hall':'borough hall (2/3/4/5)','bowling green':'bowling green (4/5)',
                   'brook av':'brook av (6)','brooklyn bridge':'brooklyn bridge-city hall (4/5/6)','buhre':'buhre av (6)',
                   'burnside':'burnside av (4)','castle hill':'castle hill av (6)','church av':'church av (2/5)','clark st':'clark st (2/3)',
                   'crown heights':'crown hts-utica av (3/4)','cypress av':'cypress av (6)','e 143':'e 143 st (6)', 'e 149':'e 149 st (6)',
                   'eastchester':'eastchester dyre av (5)','eastern p':'eastern pkwy (2/3)','elder av':'elder av (6)',
                   'flatbush av':'flatbush av-bk clg (2/5)','flushing':'flushing main st (7)','fordham':'fordham rd (4)',
                   'franklin av':'franklin av-medgar evers clg (2/3/4/5)','grand army':'grand army plza (2/3)','hoyt st':'hoyt st (2/3)',
                   'hunts point':'hunts pt (6)','junius st':'junius st (3)', 'kingston av':'kingston av (3)','kingsbridge':'kingsbridge rd (4)',
                   'longwood av':'longwood av (6)','middletown':'middletown rd (6)','morris':'morris pk (5)','moshulu':'moshulu pkwy (4)',
                   'mt eden':'mt eden av (4)','nevins st':'nevins st (2/3/4/5)','new lots':'new lots av (3)','newkirk av':'newkirk av (2/5)',
                   'nostrand av':'nostrand av (3)','park pl':'park pl (2/3)','parkchester':'parkchester (6)','pelham bay':'pelham bay pk (6)',
                   'pennsylvania av':'pennsylvania av (3)','president st':'president st-medgar evers clg (2/5)','rockaway av':'rockaway av (3)',
                   'saratoga av':'saratoga av (3)','spring st':'spring st (6)','st lawrence':'st lawrence av (6)','sterling st':'sterling st (2/5)',
                   'sutter av':'sutter av (3)','van siclen':'van siclen av (3)','vernon b':'vernon blvd (7)','westchester':'westchester sq-e tremont av (6)',
                   'whitlock':'whitlock av (6)','winthrop st':'winthrop st (2/5)','woodlawn':'woodlawn (4)','zerega':'zerega av (6)',
                   '3 av-138 st':'3 av-138 st (6)','5 av':'5 av (7)','72 st':'72 st (1/2/3)'
                  },inplace=True)

In [32]:
df.drop(['23 st', '242 st','28 st','34 st','33 st','86 st','96 st','canal st',
        'fulton st','gun hill','wall st','103 st','rawson', 'van cortlandt'],axis=1,inplace=True)

In [33]:
cols = list(df.columns)

In [34]:
cols = sorted(cols)

In [35]:
df['wed']

Unnamed: 0,wed,wed.1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
67181,0,0
67182,0,0
67183,0,0
67184,0,0


In [36]:
thankful = pd.read_csv('modeling_data.csv')
thankful

Unnamed: 0.1,Unnamed: 0,date,subject,message,day_of_week,rush_hour,weekend,2,brooklyn,queens,bronx,manhattan
0,0,9/30/2021 21:34,Update : bx 2 Train Delays,2 trains are proceeding with delays after our ...,3,0,0,1,0,0,1,0
1,1,9/30/2021 21:28,bx 2 Train Delays,2 trains are delayed in both directions while ...,3,0,0,1,0,0,1,0
2,2,9/30/2021 19:14,Update : bx 2 Train Delays,Southbound 2 trains are proceeding after we mo...,3,0,0,1,0,0,1,0
3,3,9/30/2021 19:10,bx 2 Train Delays,Southbound 2 trains are delayed while we inves...,3,0,0,1,0,0,1,0
4,4,9/30/2021 18:48,Update : bx 2 Train Trains Rerouted,Wakefield-bound 2 trains have resumed stopping...,3,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
67181,67181,1/2/2010 8:28,qns 7 train Track Maintenance,Times Sq-bound 7 trains operating on normal ro...,5,0,1,0,0,1,0,0
67182,67182,1/1/2010 0:56,UPDATED : bx 4 Train Police Investigation,4 train service has resumed with residual delays.,4,0,0,0,0,0,1,0
67183,67183,1/1/2010 0:49,UPDATED : bx 4 Train Police Investigation,BK-bound. 4 train express Burnside Av to 125 S...,4,0,0,0,0,0,1,0
67184,67184,1/1/2010 0:35,UPDATED : bx 4 Train Police Investigation,Both directions. 4 train susp btwn Burnside Av...,4,0,0,0,0,0,1,0


In [37]:
dates = thankful['date']

In [38]:
dates = pd.DataFrame(dates)

In [39]:
dates

Unnamed: 0,date
0,9/30/2021 21:34
1,9/30/2021 21:28
2,9/30/2021 19:14
3,9/30/2021 19:10
4,9/30/2021 18:48
...,...
67181,1/2/2010 8:28
67182,1/1/2010 0:56
67183,1/1/2010 0:49
67184,1/1/2010 0:35


In [40]:
df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)
df

Unnamed: 0,34 st-hudson yds (7),34 st-penn st (1/2/3),delayed_2,gun hill rd (2/5),gun hill rd (5),103 st (7),canal st (6),96 st (6),28 st (6),23 st (6),110 st (6),103 st (6),canal st (1),23 st (1),28 st (1),86 st (1),96 st (1),110 st (1),103 st (1),subject,message,day_of_week,rush_hour,weekend,wed,brooklyn,queens,bronx,manhattan,stations,central park n-110 st (2/3),111 st (7),116 st-columbia (1),125 st (1),135 st (2/3),137 st (1),138 st-grand concourse (4/5),14 st-7 av (1/2/3),145 st (1),Harlem-148 st (3),149 st-grand concourse (2/4/5),157 st (1),161 st-yankee (4),167 st (4),168 st (1),170 st (4),174 st (2/5),176 st (4),18 st (1),181 st (1),183 st (4),207 st (1),215 st (1),219 st (2/5),225 st (2/5),233 st (2/5),238 st (1),wakefield-241 st (2/5),40 st (7),46 st (7),5 av (7),50 st (1),51 st (6),52 st (7),59 st (6),61 st (7),66 st-lincoln ctr (1),68 st-hunter clg (6),69 st (7),72 st (1/2/3),74 st (7),77 st (6),79 st (1),82 st (7),90 st (7),allerton av (2/5),astor pl (6),atlantic av-barclays ctr (2/3/4/5),baychester av (5),bedford pk (4),bergen st (2/3),beverly rd (2/5),bleeker st (6),borough hall (2/3/4/5),bowling green (4/5),brook av (6),brooklyn bridge-city hall (4/5/6),buhre av (6),burke av (2/5),burnside av (4),bronx park east (2/5),castle hill av (6),110 st-cathedral pkwy (1),chambers st (1/2/3),christopher st (1),church av (2/5),clark st (2/3),court sq (7),crown hts-utica av (3/4),cypress av (6),dyckman st (1),e 143 st (6),e 149 st (6),e 180 st (2/5),eastchester dyre av (5),eastern pkwy (2/3),elder av (6),flatbush av-bk clg (2/5),flushing main st (7),fordham rd (4),franklin av-medgar evers clg (2/3/4/5),franklin st (1),freeman st (2/5),grand army plza (2/3),42 st-grand central (4/5/6/7),houston st (1),hoyt st (2/3),hunters pt (7),hunts pt (6),interval av (2/5),jackson av (2/5),junction blvd (7),junius st (3),kingsbridge rd (4),kingston av (3),longwood av (6),mets willis pt (7),middletown rd (6),morris pk (5),moshulu pkwy (4),mt eden av (4),nereid av (2/5),nevins st (2/3/4/5),new lots av (3),newkirk av (2/5),nostrand av (3),park pl (2/3),parkchester (6),pelham bay pk (6),pelham pkwy (2/5),pennsylvania av (3),president st-medgar evers clg (2/5),prospect av (2/5),qnsboro plza (7),rector st (1),rockaway av (3),saratoga av (3),simpson st (2/5),spring st (6),st lawrence av (6),sterling st (2/5),sutter av (3),42 st-times sq (2/3/4/5/6/7),14 st-union sq (4/5/6),van siclen av (3),vernon blvd (7),west farm sq (2/5),westchester sq-e tremont av (6),whitlock av (6),winthrop st (2/5),woodlawn (4),wtc cortlandt (1),zerega av (6),3 av-138 st (6),avg_temp,min_temp,max_temp,snowfall,snowfall_depth,prcp,avg_wind_spd,fastest_wind_directions,fastest_wind_spd,major_conditions,minor_conditions,neutral_conditions,clear_conditions,125 st (4/5/6),wall st (4/5),fulton st (4/5),86 st (4/5/6),125 st (1/2/3),fulton st (2/3),wall st (2/3),mon,tue,wed.1,th,fri,sat,sun,causes_conditions,causes_passenger,causes_station,causes_train,direction_northbound,direction_southbound,direction_both,direction_manhattan,van cortlandt park (1),33 st-rawson st (7)
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,update : bx 2 train delays,2 trains are proceeding with delays after our ...,3,0,0,1,0,0,1,0,nereid av,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62.0,54.5,66.0,0.00,0.0,0.000,9.955,"nw, n",26.95,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,bx 2 train delays,2 trains are delayed in both directions while ...,3,0,0,1,0,0,1,0,nereid av,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62.0,54.5,66.0,0.00,0.0,0.000,9.955,"nw, n",26.95,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,update : bx 2 train delays,southbound 2 trains are proceeding after we mo...,3,0,0,1,0,0,1,0,allerton av,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62.0,54.5,66.0,0.00,0.0,0.000,9.955,"nw, n",26.95,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,bx 2 train delays,southbound 2 trains are delayed while we inves...,3,0,0,1,0,0,1,0,allerton av,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62.0,54.5,66.0,0.00,0.0,0.000,9.955,"nw, n",26.95,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,update : bx 2 train trains rerouted,wakefield-bound 2 trains have resumed stopping...,3,0,0,1,0,0,1,0,bx park east,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62.0,54.5,66.0,0.00,0.0,0.000,9.955,"nw, n",26.95,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67181,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,qns 7 train track maintenance,times sq-bound 7 trains operating on normal ro...,5,0,1,0,0,1,0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25.0,17.0,33.5,0.15,0.0,0.010,18.120,"nw, w",39.95,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
67182,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,updated : bx 4 train police investigation,4 train service has resumed with residual delays.,4,0,0,0,0,0,1,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36.0,32.5,40.5,0.00,0.0,0.035,4.365,"nw, w",22.45,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
67183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,updated : bx 4 train police investigation,bk-bound. 4 train express burnside av to 125 s...,4,0,0,0,0,0,1,0,161 st,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36.0,32.5,40.5,0.00,0.0,0.035,4.365,"nw, w",22.45,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
67184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,updated : bx 4 train police investigation,both directions. 4 train susp btwn burnside av...,4,0,0,0,0,0,1,0,161 st,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36.0,32.5,40.5,0.00,0.0,0.035,4.365,"nw, w",22.45,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0


In [41]:
df = df.merge(dates,left_index=True,right_index=True)

In [42]:
df.drop(['mon','tue','wed','th','fri','sat','sun'],axis=1,inplace=True)

In [43]:
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = df['date'].dt.dayofweek

In [44]:
days = pd.get_dummies(df['day_of_week'])

In [45]:
days.rename({0:'mon',1:'tue',2:'wed',3:'th',4:'fri',5:'sat',6:'sun'},axis=1,inplace=True)
days

Unnamed: 0,mon,tue,wed,th,fri,sat,sun
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
67181,0,0,0,0,0,1,0
67182,0,0,0,0,1,0,0
67183,0,0,0,0,1,0,0
67184,0,0,0,0,1,0,0


In [46]:
df = df.merge(days,left_index=True,right_index=True)

In [47]:
df.drop(['day_of_week','stations'],axis=1,inplace=True)

In [48]:
col_order = ['delayed_2','date', 'subject','message','rush_hour','weekend','major_conditions','minor_conditions',
             'neutral_conditions','clear_conditions','mon','tue','wed','th', 'fri','sat', 'sun','max_temp','avg_temp',
             'min_temp','avg_wind_spd', 'prcp','snowfall', 'snowfall_depth','fastest_wind_spd','fastest_wind_directions', 
             'bronx', 'brooklyn','manhattan','queens','causes_conditions', 'causes_passenger', 'causes_station',
             'causes_train', 'direction_both', 'direction_manhattan', 'direction_northbound', 'direction_southbound',
             '103 st (1)', '103 st (6)', '103 st (7)', '110 st (1)', '110 st (6)', '110 st-cathedral pkwy (1)',
             '111 st (7)', '116 st-columbia (1)', '125 st (1)', '125 st (1/2/3)', '125 st (4/5/6)', '135 st (2/3)',
             '137 st (1)', '138 st-grand concourse (4/5)', '14 st-7 av (1/2/3)', '14 st-union sq (4/5/6)', '145 st (1)',
             '149 st-grand concourse (2/4/5)', '157 st (1)', '161 st-yankee (4)', '167 st (4)', '168 st (1)', '170 st (4)',
             '174 st (2/5)', '176 st (4)', '18 st (1)', '181 st (1)', '183 st (4)', '207 st (1)', '215 st (1)', '219 st (2/5)',
             '225 st (2/5)', '23 st (1)', '23 st (6)', '233 st (2/5)', '238 st (1)', '28 st (1)', '28 st (6)',
             '3 av-138 st (6)', '33 st-rawson st (7)', '34 st-hudson yds (7)', '34 st-penn st (1/2/3)', '40 st (7)',
             '42 st-grand central (4/5/6/7)', '42 st-times sq (2/3/4/5/6/7)', '46 st (7)', '5 av (7)', '50 st (1)',
             '51 st (6)', '52 st (7)', '59 st (6)', '61 st (7)', '66 st-lincoln ctr (1)', '68 st-hunter clg (6)', '69 st (7)',
             '72 st (1/2/3)', '74 st (7)', '77 st (6)', '79 st (1)', '82 st (7)', '86 st (1)', '86 st (4/5/6)', '90 st (7)',
             '96 st (1)', '96 st (6)', 'Harlem-148 st (3)', 'allerton av (2/5)', 'astor pl (6)',
             'atlantic av-barclays ctr (2/3/4/5)',  'baychester av (5)', 'bedford pk (4)', 'bergen st (2/3)',
             'beverly rd (2/5)', 'bleeker st (6)', 'borough hall (2/3/4/5)', 'bowling green (4/5)', 'bronx park east (2/5)',
             'brook av (6)', 'brooklyn bridge-city hall (4/5/6)', 'buhre av (6)', 'burke av (2/5)', 'burnside av (4)',
             'canal st (1)', 'canal st (6)', 'castle hill av (6)', 'central park n-110 st (2/3)', 'chambers st (1/2/3)',
             'christopher st (1)', 'church av (2/5)', 'clark st (2/3)',  'court sq (7)',
             'crown hts-utica av (3/4)', 'cypress av (6)',  'dyckman st (1)', 'e 143 st (6)', 'e 149 st (6)', 'e 180 st (2/5)',
             'eastchester dyre av (5)', 'eastern pkwy (2/3)', 'elder av (6)',  'flatbush av-bk clg (2/5)', 
             'flushing main st (7)', 'fordham rd (4)', 'franklin av-medgar evers clg (2/3/4/5)',
             'franklin st (1)', 'freeman st (2/5)', 'fulton st (2/3)', 'fulton st (4/5)', 'grand army plza (2/3)',
             'gun hill rd (2/5)', 'gun hill rd (5)', 'houston st (1)', 'hoyt st (2/3)', 'hunters pt (7)', 'hunts pt (6)',
             'interval av (2/5)', 'jackson av (2/5)', 'junction blvd (7)', 'junius st (3)', 'kingsbridge rd (4)',
             'kingston av (3)', 'longwood av (6)', 'mets willis pt (7)', 'middletown rd (6)',  'morris pk (5)',
             'moshulu pkwy (4)', 'mt eden av (4)', 'nereid av (2/5)', 'nevins st (2/3/4/5)', 'new lots av (3)',
             'newkirk av (2/5)', 'nostrand av (3)', 'park pl (2/3)', 'parkchester (6)', 'pelham bay pk (6)',
             'pelham pkwy (2/5)', 'pennsylvania av (3)', 'president st-medgar evers clg (2/5)', 'prospect av (2/5)',
             'qnsboro plza (7)', 'rector st (1)', 'rockaway av (3)', 'saratoga av (3)', 'simpson st (2/5)',
             'spring st (6)', 'st lawrence av (6)', 'sterling st (2/5)', 'sutter av (3)', 'van cortlandt park (1)',
             'van siclen av (3)', 'vernon blvd (7)', 'wakefield-241 st (2/5)', 'wall st (2/3)', 'wall st (4/5)',
             'west farm sq (2/5)', 'westchester sq-e tremont av (6)', 'whitlock av (6)', 'winthrop st (2/5)', 'woodlawn (4)',
             'wtc cortlandt (1)', 'zerega av (6)']

In [49]:
df = df.reindex(columns=col_order)

In [51]:
with open('data_backup.pkl','wb') as f:
    pkl.dump(df, f)

In [52]:
df.drop(['subject','message'],axis=1,inplace=True)

In [53]:
with open('modeling_data.pkl','wb') as f:
    pkl.dump(df, f)