# Create Jan_trips_MERGED.csv
**Merge Leavetimes and trips data. Prior; Leavetiems data is filtered for STOPPOINTIDs which are presently served by dublin Bus**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dython import nominal

## Source of Truth
### Source of Truth gathered in prior EDA
* **TRIPID**
    * **TRIPID are shared across days for perticular LINEID on particular ROUTEID**
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPID for inbound and outbound directions are mutually exclusive
    * For certein TRIPIDs; PROGNUMBER do not start at 1; which should. It means, some entries are missing.
* **ROUTEID**
    * A LINEID has multiple ROUTEID 
    * A LINEID may / may not have trips with both INBOUND-going(1)/OUTBOUND-returning(2) direction 

### Source of Truth gathered in this notebook
* **LINEID**
    * **For a LINEID; ROUTEIDs and TRIPIDs used for DIFFERENT DIRECTION are MUTUALLY EXCLUSIVE**
* **ROUTEID**
    * **THOUGH 2 ROUTEIDs FOR A LINEID ARE HAVING SAME DIRECTION; STOPPOINTIDs VISITED ARE DIFFERENT**
    * **FOR A SINGLE ROUTEID; STOPS VISITED DURING DIFFERENT TRIPS ARE DIFEENRENT**
    * **AS ROUTEIDs ARE NOT KNOWN TO USER _AND_ ROUTEIDs ARE NOT TIMEBOUND; IT IS NOT POSSIBLE TO TRAIN ML MODEL USING ALL ROUTEIDs**

## read rt_trips_DB_2018.csv
rt_trips_DB_2018.txt consists data for ENTIRE 2018
* Based on priliminary EDA, only following features are preserved while converting trips file into csv<br>
 [DAYOFSERVICE, TRIPID, LINEID, ROUTEID, DIRECTION, PLANNEDTIME_ARR, PLANNEDTIME_DEP, ACTUALTIME_ARR, ACTUALTIME_DEP ]

In [2]:
df_trips = pd.read_csv("../DB/monthlyData/rt_trips_DB_2018.csv",skip_blank_lines=True,index_col=False)

In [3]:
df_trips['DAYOFSERVICE'] = pd.to_datetime(df_trips['DAYOFSERVICE'])
df_trips['TRIPID'] = pd.to_numeric(df_trips['TRIPID'], downcast='integer', errors='coerce')
df_trips['LINEID'] = df_trips['LINEID'].astype('str')
df_trips['ROUTEID'] = df_trips['ROUTEID'].astype('str')
df_trips['DIRECTION'] = pd.to_numeric(df_trips['DIRECTION'], downcast='integer', errors='coerce')
df_trips['PLANNEDTIME_ARR'] = pd.to_numeric(df_trips['PLANNEDTIME_ARR'], downcast='integer', errors='coerce')
df_trips['PLANNEDTIME_DEP'] = pd.to_numeric(df_trips['PLANNEDTIME_DEP'], downcast='integer', errors='coerce')
df_trips['ACTUALTIME_ARR'] = pd.to_numeric(df_trips['ACTUALTIME_ARR'], downcast='integer', errors='coerce')
df_trips['ACTUALTIME_DEP'] = pd.to_numeric(df_trips['ACTUALTIME_DEP'], downcast='integer', errors='coerce')

### Object types and valid entries

In [4]:
print(df_trips.info(null_counts = True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2182637 entries, 0 to 2182636
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   DAYOFSERVICE     2182637 non-null  datetime64[ns]
 1   TRIPID           2182637 non-null  int32         
 2   LINEID           2182637 non-null  object        
 3   ROUTEID          2182637 non-null  object        
 4   DIRECTION        2182637 non-null  int8          
 5   PLANNEDTIME_ARR  2182637 non-null  int32         
 6   PLANNEDTIME_DEP  2182637 non-null  int32         
 7   ACTUALTIME_ARR   2045430 non-null  float64       
 8   ACTUALTIME_DEP   2018086 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int32(3), int8(1), object(2)
memory usage: 110.3+ MB
None


### Nature of data

In [5]:
df_trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524.0,84600.0
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752.0,
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329.0,32082.0
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463.0,54443.0
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682.0,81608.0


### Unique entries

In [6]:
df_trips.nunique()

DAYOFSERVICE          360
TRIPID             658964
LINEID                130
ROUTEID               588
DIRECTION               2
PLANNEDTIME_ARR     64461
PLANNEDTIME_DEP       791
ACTUALTIME_ARR      68122
ACTUALTIME_DEP      66771
dtype: int64

## read Janury_2018.csv
* rt_leavetimes_DB_2018.txt is of size ~ 11 GB. It is memory expensive to read and anlyze
* Hence it is devided into 1 GB chunks -- resulting into 11 subfiles
* Further, these **1 GB subfiles files are succesively processed to generate seperate files named <month_year.csv> of data for each month**
* Based on priliminary EDA, only following features are preserved while converting trips file into csv<br>
 [DAYOFSERVICE, TRIPID, PROGRNUMBER, STOPPOINTID, PLANNEDTIME_ARR, PLANNEDTIME_DEP, ACTUALTIME_ARR, ACTUALTIME_DEP]

In [7]:
df_Jan = pd.read_csv("../DB/monthlyData/January_2018.csv",skip_blank_lines=True,index_col=False)
df_Jan.drop('VEHICLEID',axis=1,inplace=True)

### Filter DATA for presently served DUBLIN BUS STOPS
**Historic data consists many STOPPOINTIDs which no longer exist. Hence; such entries are ommitted**  

In [8]:
df_presently_served_stops = pd.read_csv('../DB/bus_stop.csv')
list_presently_served_stops = list(df_presently_served_stops["STOPPOINTID"])
df_Jan = df_Jan.loc[df_Jan['STOPPOINTID'].isin(list_presently_served_stops)]

### Data type casting

In [9]:
df_Jan['DAYOFSERVICE'] = pd.to_datetime(df_Jan['DAYOFSERVICE'])
df_Jan['TRIPID'] = pd.to_numeric(df_Jan['TRIPID'], downcast='integer', errors='coerce')
df_Jan['PROGRNUMBER'] = pd.to_numeric(df_Jan['PROGRNUMBER'], downcast='integer', errors='coerce')
df_Jan['STOPPOINTID'] = pd.to_numeric(df_Jan['STOPPOINTID'], downcast='integer', errors='coerce')
df_Jan['PLANNEDTIME_ARR'] = pd.to_numeric(df_Jan['PLANNEDTIME_ARR'], downcast='integer', errors='coerce')
df_Jan['PLANNEDTIME_DEP'] = pd.to_numeric(df_Jan['PLANNEDTIME_DEP'], downcast='integer', errors='coerce')
df_Jan['ACTUALTIME_ARR'] = pd.to_numeric(df_Jan['ACTUALTIME_ARR'], downcast='integer', errors='coerce')
df_Jan['ACTUALTIME_DEP'] = pd.to_numeric(df_Jan['ACTUALTIME_DEP'], downcast='integer', errors='coerce')

### Object types and valid entries

In [10]:
print(df_Jan.info(null_counts = True))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10034307 entries, 0 to 10525003
Data columns (total 8 columns):
 #   Column           Non-Null Count     Dtype         
---  ------           --------------     -----         
 0   DAYOFSERVICE     10034307 non-null  datetime64[ns]
 1   TRIPID           10034307 non-null  int32         
 2   PROGRNUMBER      10034307 non-null  int8          
 3   STOPPOINTID      10034307 non-null  int16         
 4   PLANNEDTIME_ARR  10034307 non-null  int32         
 5   PLANNEDTIME_DEP  10034307 non-null  int32         
 6   ACTUALTIME_ARR   10034307 non-null  int32         
 7   ACTUALTIME_DEP   10034307 non-null  int32         
dtypes: datetime64[ns](1), int16(1), int32(5), int8(1)
memory usage: 373.2 MB
None


### Nature of data

In [12]:
df_Jan.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192
4,2018-01-01,5958355,7,15,24130,24130,24227,24257


### Unique entries

In [13]:
df_Jan.nunique()

DAYOFSERVICE          31
TRIPID             76433
PROGRNUMBER          103
STOPPOINTID         4166
PLANNEDTIME_ARR    68842
PLANNEDTIME_DEP    68842
ACTUALTIME_ARR     71974
ACTUALTIME_DEP     71953
dtype: int64

## Merge df_Jan with df_trips on 'TRIPID'
* Arrival and departure time related features of df_trips give End to End Planned and Atcual timings for trip
* They can be calculated by grouping TRIPID - DAYOFSERVICE data available in leavetimes [i.e df_Jan] dataframe; hence are dropped while making merged datframe

In [14]:
df_Jan_trips = df_Jan.drop_duplicates().merge(df_trips[['DAYOFSERVICE','TRIPID', 'LINEID', 'DIRECTION','ROUTEID']],on=['TRIPID','DAYOFSERVICE']).drop_duplicates()

### Object types and valid entries

In [15]:
print(df_Jan_trips.info(null_counts = True))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10034307 entries, 0 to 10034306
Data columns (total 11 columns):
 #   Column           Non-Null Count     Dtype         
---  ------           --------------     -----         
 0   DAYOFSERVICE     10034307 non-null  datetime64[ns]
 1   TRIPID           10034307 non-null  int32         
 2   PROGRNUMBER      10034307 non-null  int8          
 3   STOPPOINTID      10034307 non-null  int16         
 4   PLANNEDTIME_ARR  10034307 non-null  int32         
 5   PLANNEDTIME_DEP  10034307 non-null  int32         
 6   ACTUALTIME_ARR   10034307 non-null  int32         
 7   ACTUALTIME_DEP   10034307 non-null  int32         
 8   LINEID           10034307 non-null  object        
 9   DIRECTION        10034307 non-null  int8          
 10  ROUTEID          10034307 non-null  object        
dtypes: datetime64[ns](1), int16(1), int32(5), int8(2), object(2)
memory usage: 535.9+ MB
None


### Nature of data

In [16]:
df_Jan_trips.head(100)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863,41,1,41_3
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072,41,1,41_3
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140,41,1,41_3
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192,41,1,41_3
4,2018-01-01,5958355,7,15,24130,24130,24227,24257,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
95,2018-01-01,5958088,47,3958,27943,27943,28473,28473,66,1,66_11
96,2018-01-01,5958088,48,3959,27990,27990,28508,28508,66,1,66_11
97,2018-01-01,5958088,49,3960,28024,28024,28534,28534,66,1,66_11
98,2018-01-01,5958088,50,3961,28058,28058,28551,28562,66,1,66_11


### Unique entries

In [17]:
df_Jan_trips.nunique()

DAYOFSERVICE          31
TRIPID             76433
PROGRNUMBER          103
STOPPOINTID         4166
PLANNEDTIME_ARR    68842
PLANNEDTIME_DEP    68842
ACTUALTIME_ARR     71974
ACTUALTIME_DEP     71953
LINEID               126
DIRECTION              2
ROUTEID              498
dtype: int64

### Save to CSV

In [18]:
df_Jan_trips.to_csv("../DB/ML/Jan_trips_MERGED.csv",index=False)

## Check nature of data for a random LINEID

In [19]:
import pandas as pd

In [20]:
df_Jan_trips = pd.read_csv("../DB/ML/Jan_trips_MERGED.csv")

In [21]:
df_Jan_trips['DAYOFSERVICE'] = pd.to_datetime(df_Jan_trips['DAYOFSERVICE'])
df_Jan_trips['TRIPID'] = pd.to_numeric(df_Jan_trips['TRIPID'], downcast='integer', errors='coerce')
df_Jan_trips['PROGRNUMBER'] = pd.to_numeric(df_Jan_trips['PROGRNUMBER'], downcast='integer', errors='coerce')
df_Jan_trips['STOPPOINTID'] = pd.to_numeric(df_Jan_trips['STOPPOINTID'], downcast='integer', errors='coerce')
df_Jan_trips['PLANNEDTIME_ARR'] = pd.to_numeric(df_Jan_trips['PLANNEDTIME_ARR'], downcast='integer', errors='coerce')
df_Jan_trips['PLANNEDTIME_DEP'] = pd.to_numeric(df_Jan_trips['PLANNEDTIME_DEP'], downcast='integer', errors='coerce')
df_Jan_trips['ACTUALTIME_ARR'] = pd.to_numeric(df_Jan_trips['ACTUALTIME_ARR'], downcast='integer', errors='coerce')
df_Jan_trips['ACTUALTIME_DEP'] = pd.to_numeric(df_Jan_trips['ACTUALTIME_DEP'], downcast='integer', errors='coerce')
df_Jan_trips['LINEID'] = df_Jan_trips['LINEID'].astype('str')
df_Jan_trips['ROUTEID'] = df_Jan_trips['ROUTEID'].astype('str')
df_Jan_trips['DIRECTION'] = pd.to_numeric(df_Jan_trips['DIRECTION'], downcast='integer', errors='coerce')

### Take samples for a random LINEID 41

In [22]:
df_Jan_trips_41 = df_Jan_trips.loc[(df_Jan_trips['LINEID'] == '41')]

In [23]:
df_Jan_trips_41

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863,41,1,41_3
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072,41,1,41_3
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140,41,1,41_3
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192,41,1,41_3
4,2018-01-01,5958355,7,15,24130,24130,24227,24257,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
10032506,2018-01-31,6230682,51,48,86831,86831,86772,86772,41,2,41_7
10032507,2018-01-31,6230682,52,49,86890,86890,86797,86811,41,2,41_7
10032508,2018-01-31,6230682,53,51,86947,86947,86853,86853,41,2,41_7
10032509,2018-01-31,6230682,54,52,86978,86978,86869,86869,41,2,41_7


### Object types and valid entries

In [24]:
df_Jan_trips_41.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154604 entries, 0 to 10032510
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   DAYOFSERVICE     154604 non-null  datetime64[ns]
 1   TRIPID           154604 non-null  int32         
 2   PROGRNUMBER      154604 non-null  int8          
 3   STOPPOINTID      154604 non-null  int16         
 4   PLANNEDTIME_ARR  154604 non-null  int32         
 5   PLANNEDTIME_DEP  154604 non-null  int32         
 6   ACTUALTIME_ARR   154604 non-null  int32         
 7   ACTUALTIME_DEP   154604 non-null  int32         
 8   LINEID           154604 non-null  object        
 9   DIRECTION        154604 non-null  int8          
 10  ROUTEID          154604 non-null  object        
dtypes: datetime64[ns](1), int16(1), int32(5), int8(2), object(2)
memory usage: 8.3+ MB


### Varify if ROUTEIDs and associated TRIPIDs for different DIRECTION for a LINEID are different

#### Direction 1

In [25]:
df_41_DIR1 = df_Jan_trips_41.loc[(df_Jan_trips_41['DIRECTION'] == 1)].sort_values(by = 'PROGRNUMBER')

In [26]:
df_41_DIR1.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
5978678,2018-01-19,6115935,1,288,43800,43800,43803,43803,41,1,41_3
4084469,2018-01-13,6093453,1,288,58800,58800,59684,59684,41,1,41_3
9211903,2018-01-29,6238592,1,288,63000,63000,63055,63055,41,1,41_3
2226510,2018-01-08,6093329,1,288,41100,41100,41114,41114,41,1,41_3
9531589,2018-01-30,6234994,1,288,57000,57000,57100,57100,41,1,41_3


#### Take unique values into dictionary

In [27]:
uniques = {}
for col in df_41_DIR1.columns:
    uniques[col] = df_41_DIR1[col].unique()

##### Direction 1 ROUTES

In [28]:
LINE_41_DIR1_Routes = uniques['ROUTEID']

In [29]:
LINE_41_DIR1_Routes

array(['41_3', '41_21', '41_6', '41_20', '41_4', '41_5'], dtype=object)

##### Direction 1 TRIPS

In [30]:
LINE_41_DIR1_Trips = uniques['TRIPID']

In [31]:
LINE_41_DIR1_Trips[:10]

array([6115935, 6093453, 6238592, 6093329, 6234994, 5956158, 6094587,
       6109496, 6220913, 6094016], dtype=int32)

<br>

#### Direction 2

In [32]:
df_41_DIR2 = df_Jan_trips_41.loc[(df_Jan_trips_41['DIRECTION'] == 2)].sort_values(by = 'PROGRNUMBER')

In [33]:
df_41_DIR2.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
181,2018-01-01,5965379,1,7347,26100,26100,26251,26251,41,2,41_10
299508,2018-01-02,5963831,1,4843,38700,38700,38624,38624,41,2,41_7
4277420,2018-01-14,6106116,1,4843,53400,53400,53423,53423,41,2,41_7
3161740,2018-01-10,6105495,1,4843,77400,77400,77533,77533,41,2,41_7
1739028,2018-01-06,5965700,1,4843,46200,46200,46223,46223,41,2,41_7


#### Take unique values into dictionary

In [34]:
uniques = {}
for col in df_41_DIR2.columns:
    uniques[col] = df_41_DIR2[col].unique()

##### Direction 2 ROUTES

In [35]:
LINE_41_DIR2_Routes = uniques['ROUTEID']

In [36]:
LINE_41_DIR2_Routes

array(['41_10', '41_7', '41_8', '41_9'], dtype=object)

##### Direction 2 TRIPS

In [37]:
LINE_41_DIR2_Trips = uniques['TRIPID']

In [38]:
LINE_41_DIR2_Trips[:10]

array([5965379, 5963831, 6106116, 6105495, 5965700, 6220909, 6107697,
       6109505, 6228135, 6236988], dtype=int32)

<br>

#### Confirm mutual exclusion for TRIPIDs and ROUTEIDs for different directions

In [39]:
print("Common ROUTEIDs between LINE 43 : Direction 1 & 2 ",set(LINE_41_DIR1_Routes).intersection(LINE_41_DIR2_Routes))

Common ROUTEIDs between LINE 43 : Direction 1 & 2  set()


In [40]:
print("Common TRIPIDs between LINE 43 : Direction 1 & 2 ",set(LINE_41_DIR1_Trips).intersection(LINE_41_DIR2_Trips))

Common TRIPIDs between LINE 43 : Direction 1 & 2  set()


**$HENCE,\ For\ a\ LINEID;\ ROUTEIDs\ and\ TRIPIDs\ used\ for\ DIFFERENT\ DIRECTION\ are\ mutually\ exclusive$**
<br>
<br>

### Check if STOPIDs visited by different ROUTEIDs for a LINEID are different

#### Take unique values into dictionary

In [41]:
uniques = {}
for col in df_Jan_trips_41.columns:
    uniques[col] = df_Jan_trips_41[col].unique()

In [42]:
uniques['ROUTEID']

array(['41_3', '41_10', '41_7', '41_8', '41_5', '41_4', '41_6', '41_9',
       '41_20', '41_21'], dtype=object)

In [43]:
uniques['DAYOFSERVICE']

array(['2018-01-01T00:00:00.000000000', '2018-01-02T00:00:00.000000000',
       '2018-01-03T00:00:00.000000000', '2018-01-04T00:00:00.000000000',
       '2018-01-05T00:00:00.000000000', '2018-01-06T00:00:00.000000000',
       '2018-01-07T00:00:00.000000000', '2018-01-08T00:00:00.000000000',
       '2018-01-09T00:00:00.000000000', '2018-01-10T00:00:00.000000000',
       '2018-01-11T00:00:00.000000000', '2018-01-12T00:00:00.000000000',
       '2018-01-13T00:00:00.000000000', '2018-01-14T00:00:00.000000000',
       '2018-01-15T00:00:00.000000000', '2018-01-16T00:00:00.000000000',
       '2018-01-17T00:00:00.000000000', '2018-01-18T00:00:00.000000000',
       '2018-01-19T00:00:00.000000000', '2018-01-20T00:00:00.000000000',
       '2018-01-21T00:00:00.000000000', '2018-01-22T00:00:00.000000000',
       '2018-01-23T00:00:00.000000000', '2018-01-24T00:00:00.000000000',
       '2018-01-25T00:00:00.000000000', '2018-01-26T00:00:00.000000000',
       '2018-01-27T00:00:00.000000000', '2018-01-28

#### Check random entries for ROUTEID

In [44]:
df_Jan_trips_41.loc[(df_Jan_trips_41['ROUTEID'] == '41_3')].sort_values(by = 'PROGRNUMBER').head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
7039666,2018-01-23,6216050,1,288,21600,21600,21581,21581,41,1,41_3
8396578,2018-01-26,6212621,1,288,66300,66300,66514,66514,41,1,41_3
4752227,2018-01-16,6115930,1,288,19800,19800,19783,19783,41,1,41_3
2794890,2018-01-09,6098110,1,288,76500,76500,76500,76500,41,1,41_3
1700955,2018-01-06,5972065,1,288,38400,38400,38357,38357,41,1,41_3


In [45]:
df_Jan_trips_41.loc[(df_Jan_trips_41['ROUTEID'] == '41_20')].sort_values(by = 'PROGRNUMBER').head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
1795666,2018-01-06,5955994,1,1174,57600,57600,57622,57622,41,1,41_20
8641782,2018-01-27,6225628,1,1174,57600,57600,57757,57757,41,1,41_20
6365935,2018-01-20,6121831,1,1174,57600,57600,57552,57552,41,1,41_20
4077964,2018-01-13,6097348,1,1174,57600,57600,57614,57614,41,1,41_20
1795667,2018-01-06,5955994,2,1175,57703,57703,57650,57650,41,1,41_20


**IT IS EVIDENT FROM ABOVE SAMPLES THAT:**<BR>
**THERE IS NO REALTION BETWEEN TIME OF OPERATION AND ROUTEID SELECTED**    
<BR>
<BR>
<BR>    

#### Choose a combination of keys for ROUTEID 41_3

In [46]:
df_41_3_sample = df_Jan_trips_41.loc[(df_Jan_trips_41['LINEID'] == '41') & 
                                       (df_Jan_trips_41['ROUTEID'] == '41_3') &
                                       (df_Jan_trips_41['DAYOFSERVICE'] == '2018-01-08 00:00:00') &
                                       (df_Jan_trips_41['TRIPID'] == 6099945) &                                       
                                       (df_Jan_trips_41['DIRECTION'] == 1)
                                      ].sort_values(by = 'PROGRNUMBER')
df_41_3_sample.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
2248532,2018-01-08,6099945,1,288,45000,45000,44996,44996,41,1,41_3
2248533,2018-01-08,6099945,2,1171,45067,45067,45039,45039,41,1,41_3
2248534,2018-01-08,6099945,3,1172,45118,45118,45055,45055,41,1,41_3
2248535,2018-01-08,6099945,4,1173,45191,45191,45134,45148,41,1,41_3
2248536,2018-01-08,6099945,5,1174,45285,45285,45182,45182,41,1,41_3


#### Choose a combination of keys for ROUTEID 41_20

In [47]:
df_41_20_sample = df_Jan_trips_41.loc[(df_Jan_trips_41['LINEID'] == '41') &
                                        (df_Jan_trips_41['ROUTEID'] == '41_20') &
                                        (df_Jan_trips_41['TRIPID'] == 5955994) &
                                        (df_Jan_trips_41['DAYOFSERVICE'] == '2018-01-06 00:00:00') &
                                        (df_Jan_trips_41['DIRECTION'] == 1)
                                       ].sort_values(by = 'PROGRNUMBER')
df_41_20_sample.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
1795666,2018-01-06,5955994,1,1174,57600,57600,57622,57622,41,1,41_20
1795667,2018-01-06,5955994,2,1175,57703,57703,57650,57650,41,1,41_20
1795668,2018-01-06,5955994,3,15,57780,57780,57716,57750,41,1,41_20
1795669,2018-01-06,5955994,4,17,57900,57900,57798,57798,41,1,41_20
1795670,2018-01-06,5955994,5,18,58001,58001,57865,57865,41,1,41_20


#### View sets of STOPPOINTIDs visited for particular  ROUTEIDs 

In [48]:
print("ROUTE 41_3",set(df_41_3_sample['STOPPOINTID']),"\n")
print("ROUTE 41_20",set(df_41_20_sample['STOPPOINTID']))

ROUTE 41_3 {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19, 21, 3864, 3865, 3669, 288, 6054, 4910, 4911, 4912, 4913, 7602, 4914, 7348, 4915, 5076, 203, 204, 205, 5073, 5074, 5075, 1620, 85, 1622, 1623, 1624, 1625, 1626, 1627, 220, 1628, 1629, 1630, 3671, 3672, 3674, 3675, 3676, 3679, 3682, 4330, 4957} 

ROUTE 41_20 {15, 17, 18, 19, 21, 1174, 1175, 3864, 3865, 3669, 4910, 4911, 4912, 4913, 7602, 4914, 7348, 4915, 5076, 203, 204, 205, 5075, 1620, 85, 1622, 1623, 1624, 1625, 1626, 1627, 220, 1628, 1629, 1630, 3679, 4957, 3682}


**OBSERVATIONS FROM ABOVE SETS**<BR>
* **EVEN FOR ROUTEIDs HAAVING SAME DIRECTION (DIRECTION = 1); STOPPOINTIDs VISITED ARE DIFFERENT**
* **AS ROUTEIDs ARE NOT KNOWN TO USER $\Large \&$ ROUTEIDs ARE NOT TIMEBOUND; IT IS NOT POSSIBLE TO TRAIN ML MODEL USING ALL ROUTEIDs**

### Varify if STOPIDs visited by same ROUTEIDs for a LINEID are constant thoughout database

In [49]:
df_41_3 =  df_Jan_trips_41.loc[(df_Jan_trips_41['ROUTEID'] == '41_3')].sort_values(by = 'PROGRNUMBER')

In [50]:
df_41_3

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
7039666,2018-01-23,6216050,1,288,21600,21600,21581,21581,41,1,41_3
8396578,2018-01-26,6212621,1,288,66300,66300,66514,66514,41,1,41_3
4752227,2018-01-16,6115930,1,288,19800,19800,19783,19783,41,1,41_3
2794890,2018-01-09,6098110,1,288,76500,76500,76500,76500,41,1,41_3
1700955,2018-01-06,5972065,1,288,38400,38400,38357,38357,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
4324376,2018-01-14,6112919,51,4957,68636,68636,68737,68737,41,1,41_3
9813327,2018-01-31,6231183,51,4957,47131,47131,47595,47595,41,1,41_3
8999146,2018-01-29,6245143,51,4957,31490,31490,31762,31762,41,1,41_3
4195780,2018-01-14,6109174,51,4957,26496,26496,26293,26293,41,1,41_3


#### Take unique values into dictionary

In [51]:
uniques = {}
for col in df_41_3.columns:
    uniques[col] = df_41_3[col].unique()

In [52]:
uniques['ROUTEID']

array(['41_3'], dtype=object)

####  View DAYOFSERVICE  when ROUTE 41_3 was served

In [53]:
DAYOFSERVICE_43_3 = list(uniques['DAYOFSERVICE'])

In [54]:
DAYOFSERVICE_43_3[:5]

[numpy.datetime64('2018-01-23T00:00:00.000000000'),
 numpy.datetime64('2018-01-26T00:00:00.000000000'),
 numpy.datetime64('2018-01-16T00:00:00.000000000'),
 numpy.datetime64('2018-01-09T00:00:00.000000000'),
 numpy.datetime64('2018-01-06T00:00:00.000000000')]

####  View TRIPIDs used to serve ROUTE 41_3

In [55]:
TRIPID_43_3 = list(uniques['TRIPID'])

In [56]:
TRIPID_43_3[:5]

[6216050, 6212621, 6115930, 6098110, 5972065]

#### Make dictionary of STOPPOINTIDs visited during each of the TRIP on Route 41_3

In [57]:
STOPS_trips_43_3 = {}
trip_dayOfService_41_3 = df_41_3.reset_index(drop=True).groupby(['DAYOFSERVICE','TRIPID'])['STOPPOINTID'].apply(set)
print("Examples of Visited stoppoints",trip_dayOfService_41_3[:100])

LIST_trip_dayOfService_41_3 =  list(trip_dayOfService_41_3 )

print("\nAll stops visited in a trip for a single ROUTE 41_3 are the same: ", (LIST_trip_dayOfService_41_3[1:] == LIST_trip_dayOfService_41_3[:-1]))

Examples of Visited stoppoints DAYOFSERVICE  TRIPID 
2018-01-01    5955626    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5955630    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5955634    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5955636    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5957237    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
                                               ...                        
2018-01-03    5964154    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5964190    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5964193    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5964207    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
              5964586    {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19,...
Name: STOPPOINTID, Length: 100, dtype: object

All stops visited in a trip for a single ROUTE 41_3 are the same:  False


**OBSERVATIONS FROM ABOVE SETS**<BR>
* **FOR A SINGLE ROUTE; STOPS VISITED DURING DIFFERENT TRIPS ARE DIFEENRENT**

## Check nature of Arrival and departure times
* [Documentation on Dublin Bus data](https://brightspace.ucd.ie/d2l/le/content/54595/viewContent/922112/View) defines 
    * PlannedTime_Arr : Planned arrival time at the stop point, in seconds 
    * PlannedTime_Dep: Planned departure time from the stop point, in seconds
    * ActualTime_Arr: Actual arrival time at the stop point, in seconds
    * ActualTime_Dep: Actual departure time from the stop point, in seconds
* **Value in seconds** is **offset in seconds from 00:00:00 for particular date**

**EXTENDED TIMES**<br>
* FOR LATE NIGHT TRIPS WHICH TECHNICALLY SPAN OVER 2 DAYS; **EXTENDED TIMES** ARE USED<br>
For example a trip starting at 23:30 of 1st ofApril and ending at 00:30 of the 2nd of April could have the last AVL data tracked at “24:30of 1st of April”. To express this “extended time” the interface requires the use of secondspast the beginning of the operation day. In our example the DayOfService is “01/04/2014”and the extended time is “88200” (= 24*60*60 + 30*6)

In [58]:
import pandas as pd

In [59]:
df_Jan_trips = pd.read_csv("../DB/ML/Jan_trips_MERGED.csv")

In [60]:
df_Jan_trips['DAYOFSERVICE'] = pd.to_datetime(df_Jan_trips['DAYOFSERVICE'])
df_Jan_trips['TRIPID'] = pd.to_numeric(df_Jan_trips['TRIPID'], downcast='integer', errors='coerce')
df_Jan_trips['PROGRNUMBER'] = pd.to_numeric(df_Jan_trips['PROGRNUMBER'], downcast='integer', errors='coerce')
df_Jan_trips['STOPPOINTID'] = pd.to_numeric(df_Jan_trips['STOPPOINTID'], downcast='integer', errors='coerce')
df_Jan_trips['PLANNEDTIME_ARR'] = pd.to_numeric(df_Jan_trips['PLANNEDTIME_ARR'], downcast='integer', errors='coerce')
df_Jan_trips['PLANNEDTIME_DEP'] = pd.to_numeric(df_Jan_trips['PLANNEDTIME_DEP'], downcast='integer', errors='coerce')
df_Jan_trips['ACTUALTIME_ARR'] = pd.to_numeric(df_Jan_trips['ACTUALTIME_ARR'], downcast='integer', errors='coerce')
df_Jan_trips['ACTUALTIME_DEP'] = pd.to_numeric(df_Jan_trips['ACTUALTIME_DEP'], downcast='integer', errors='coerce')
df_Jan_trips['LINEID'] = df_Jan_trips['LINEID'].astype('str')
df_Jan_trips['ROUTEID'] = df_Jan_trips['ROUTEID'].astype('str')
df_Jan_trips['DIRECTION'] = pd.to_numeric(df_Jan_trips['DIRECTION'], downcast='integer', errors='coerce')

In [61]:
df_Jan_trips.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863,41,1,41_3
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072,41,1,41_3
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140,41,1,41_3
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192,41,1,41_3
4,2018-01-01,5958355,7,15,24130,24130,24227,24257,41,1,41_3


In [62]:
df_Jan_trips.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10034307 entries, 0 to 10034306
Data columns (total 11 columns):
 #   Column           Non-Null Count     Dtype         
---  ------           --------------     -----         
 0   DAYOFSERVICE     10034307 non-null  datetime64[ns]
 1   TRIPID           10034307 non-null  int32         
 2   PROGRNUMBER      10034307 non-null  int8          
 3   STOPPOINTID      10034307 non-null  int16         
 4   PLANNEDTIME_ARR  10034307 non-null  int32         
 5   PLANNEDTIME_DEP  10034307 non-null  int32         
 6   ACTUALTIME_ARR   10034307 non-null  int32         
 7   ACTUALTIME_DEP   10034307 non-null  int32         
 8   LINEID           10034307 non-null  object        
 9   DIRECTION        10034307 non-null  int8          
 10  ROUTEID          10034307 non-null  object        
dtypes: datetime64[ns](1), int16(1), int32(5), int8(2), object(2)
memory usage: 459.3+ MB


### Check Entries containing Exteded Time
**All TIME feature entries greater than “86400” (=24*60*60) are extended time entries**

In [63]:
df_Jan_trips_extendedTime = df_Jan_trips.loc[(df_Jan_trips['PLANNEDTIME_ARR'] > 86400) |
                                             (df_Jan_trips['PLANNEDTIME_DEP'] > 86400) |
                                             (df_Jan_trips['ACTUALTIME_ARR'] > 86400) |
                                             (df_Jan_trips['ACTUALTIME_DEP'] > 86400)
                                            ]
df_Jan_trips_extendedTime.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
183170,2018-01-01,5960398,83,1280,86202,86202,86443,86443,40,2,40_31
183171,2018-01-01,5960398,84,6348,86257,86257,86565,86565,40,2,40_31
183410,2018-01-01,5961860,71,2601,85882,85882,86439,86439,27,1,27_19
183411,2018-01-01,5961860,72,4446,85931,85931,86471,86471,27,1,27_19
183412,2018-01-01,5961860,73,2603,86002,86002,86513,86522,27,1,27_19


In [64]:
df_Jan_trips_extendedTime.shape

(40491, 11)

In [65]:
df_Jan_trips_extendedTime.nunique()

DAYOFSERVICE         31
TRIPID             1300
PROGRNUMBER         102
STOPPOINTID        1764
PLANNEDTIME_ARR    3192
PLANNEDTIME_DEP    3192
ACTUALTIME_ARR     4543
ACTUALTIME_DEP     4513
LINEID               70
DIRECTION             2
ROUTEID             148
dtype: int64

### Check Entries where all TIME features are in extended format 

In [66]:
df_Jan_trips_extendedTime_all = df_Jan_trips.loc[(df_Jan_trips['PLANNEDTIME_ARR'] > 86400) &
                                             (df_Jan_trips['PLANNEDTIME_DEP'] > 86400) &
                                             (df_Jan_trips['ACTUALTIME_ARR'] > 86400) &
                                             (df_Jan_trips['ACTUALTIME_DEP'] > 86400)
                                            ]
df_Jan_trips_extendedTime_all.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
183420,2018-01-01,5961860,81,2621,86450,86450,87023,87036,27,1,27_19
183421,2018-01-01,5961860,82,2622,86481,86481,87081,87081,27,1,27_19
183422,2018-01-01,5961860,83,2623,86509,86509,87106,87121,27,1,27_19
183423,2018-01-01,5961860,84,2624,86567,86567,87163,87163,27,1,27_19
183424,2018-01-01,5961860,85,4441,86610,86610,87199,87207,27,1,27_19


In [67]:
df_Jan_trips_extendedTime_all.shape

(26966, 11)

In [68]:
df_Jan_trips_extendedTime_all.nunique()

DAYOFSERVICE         31
TRIPID              923
PROGRNUMBER         102
STOPPOINTID        1253
PLANNEDTIME_ARR    2264
PLANNEDTIME_DEP    2264
ACTUALTIME_ARR     3569
ACTUALTIME_DEP     3550
LINEID               61
DIRECTION             2
ROUTEID             110
dtype: int64

### Entries where few of the TIME features is in Extended Time but not all

In [69]:
df_Jan_trips_extendedTime_few = df_Jan_trips_extendedTime[~ df_Jan_trips_extendedTime.isin(df_Jan_trips_extendedTime_all)].dropna()

In [70]:
df_Jan_trips_extendedTime_few.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
183170,2018-01-01,5960398.0,83.0,1280.0,86202.0,86202.0,86443.0,86443.0,40,2.0,40_31
183171,2018-01-01,5960398.0,84.0,6348.0,86257.0,86257.0,86565.0,86565.0,40,2.0,40_31
183410,2018-01-01,5961860.0,71.0,2601.0,85882.0,85882.0,86439.0,86439.0,27,1.0,27_19
183411,2018-01-01,5961860.0,72.0,4446.0,85931.0,85931.0,86471.0,86471.0,27,1.0,27_19
183412,2018-01-01,5961860.0,73.0,2603.0,86002.0,86002.0,86513.0,86522.0,27,1.0,27_19


In [71]:
df_Jan_trips_extendedTime_few.shape

(13525, 11)

In [72]:
df_Jan_trips_extendedTime_few.nunique()

DAYOFSERVICE         31
TRIPID             1214
PROGRNUMBER          90
STOPPOINTID        1472
PLANNEDTIME_ARR    1559
PLANNEDTIME_DEP    1559
ACTUALTIME_ARR     1907
ACTUALTIME_DEP     1914
LINEID               70
DIRECTION             2
ROUTEID             148
dtype: int64