In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dython import nominal

## Source of Truth gathered in earlier notebooks
* **TRIPID**
    * TRIPIDs are shared across days for a particular LINEID on a particular ROUTEID
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPIDs for inbound and outbound directions are mutually exclusive
    * For certain TRIPIDs, PROGRNUMBER does not start at 1, although it should. This means some entries are missing.
* **ROUTEID**
    * A LINEID can have multiple ROUTEIDs
    * A LINEID may or may not have trips in both the INBOUND-going (1) and OUTBOUND-returning (2) directions


## rt_trips_DB_2018.csv

In [88]:
# Load the 2018 trip-level table from a path relative to the working directory.
df_trips = pd.read_csv(
    "../DB/monthlyData/rt_trips_DB_2018.csv",
    skip_blank_lines=True,
    index_col=False,
)

In [89]:
# Normalise column dtypes right after loading.
df_trips['DAYOFSERVICE'] = pd.to_datetime(df_trips['DAYOFSERVICE'])

# Line / route identifiers are kept as strings.
# NOTE(review): astype('str') turns missing values into the literal 'nan' - confirm none exist upstream.
for id_col in ('LINEID', 'ROUTEID'):
    df_trips[id_col] = df_trips[id_col].astype('str')

# Numeric columns: unparseable entries become NaN (errors='coerce') and the result
# is downcast to the smallest integer dtype that can hold the values, where possible.
for num_col in ('TRIPID', 'DIRECTION',
                'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP',
                'ACTUALTIME_ARR', 'ACTUALTIME_DEP'):
    df_trips[num_col] = pd.to_numeric(df_trips[num_col], downcast='integer', errors='coerce')

### Object types and valid entries

In [90]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly (wrapping it in print() only appends a stray "None").
# `show_counts` replaces the deprecated `null_counts` keyword.
df_trips.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2182637 entries, 0 to 2182636
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   DAYOFSERVICE     2182637 non-null  datetime64[ns]
 1   TRIPID           2182637 non-null  int32         
 2   LINEID           2182637 non-null  object        
 3   ROUTEID          2182637 non-null  object        
 4   DIRECTION        2182637 non-null  int8          
 5   PLANNEDTIME_ARR  2182637 non-null  int32         
 6   PLANNEDTIME_DEP  2182637 non-null  int32         
 7   ACTUALTIME_ARR   2045430 non-null  float64       
 8   ACTUALTIME_DEP   2018086 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int32(3), int8(1), object(2)
memory usage: 110.3+ MB
None


### Nature of data

In [91]:
# Preview the first five trip records.
df_trips.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524.0,84600.0
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752.0,
2,2018-02-07,6254942,45A,45A_70,2,35512,32100,36329.0,32082.0
3,2018-02-07,6259460,25A,25A_273,1,57261,54420,58463.0,54443.0
4,2018-02-07,6253175,14,14_15,1,85383,81600,84682.0,81608.0


### Unique entries

In [92]:
# Number of distinct non-null values in each column.
df_trips.nunique(axis=0, dropna=True)

DAYOFSERVICE          360
TRIPID             658964
LINEID                130
ROUTEID               588
DIRECTION               2
PLANNEDTIME_ARR     64461
PLANNEDTIME_DEP       791
ACTUALTIME_ARR      68122
ACTUALTIME_DEP      66771
dtype: int64

## January_2018.csv

In [93]:
# Read the January 2018 stop-level records and drop the VEHICLEID column straight away.
df_Jan = pd.read_csv(
    "../DB/monthlyData/January_2018.csv",
    skip_blank_lines=True,
    index_col=False,
).drop(columns='VEHICLEID')

In [94]:
# Normalise column dtypes right after loading.
df_Jan['DAYOFSERVICE'] = pd.to_datetime(df_Jan['DAYOFSERVICE'])

# Every remaining column is numeric: unparseable entries become NaN (errors='coerce')
# and the result is downcast to the smallest integer dtype that fits, where possible.
for num_col in ('TRIPID', 'PROGRNUMBER', 'STOPPOINTID',
                'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP',
                'ACTUALTIME_ARR', 'ACTUALTIME_DEP'):
    df_Jan[num_col] = pd.to_numeric(df_Jan[num_col], downcast='integer', errors='coerce')

### Object types and valid entries

In [95]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly (wrapping it in print() only appends a stray "None").
# `show_counts` replaces the deprecated `null_counts` keyword.
df_Jan.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10525004 entries, 0 to 10525003
Data columns (total 8 columns):
 #   Column           Non-Null Count     Dtype         
---  ------           --------------     -----         
 0   DAYOFSERVICE     10525004 non-null  datetime64[ns]
 1   TRIPID           10525004 non-null  int32         
 2   PROGRNUMBER      10525004 non-null  int8          
 3   STOPPOINTID      10525004 non-null  int16         
 4   PLANNEDTIME_ARR  10525004 non-null  int32         
 5   PLANNEDTIME_DEP  10525004 non-null  int32         
 6   ACTUALTIME_ARR   10525004 non-null  int32         
 7   ACTUALTIME_DEP   10525004 non-null  int32         
dtypes: datetime64[ns](1), int16(1), int32(5), int8(1)
memory usage: 311.2 MB
None


### Nature of data

In [96]:
# Preview the first five stop-level records.
df_Jan.head(n=5)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192
4,2018-01-01,5958355,7,15,24130,24130,24227,24257


### Unique entries

In [97]:
# Number of distinct non-null values in each column.
df_Jan.nunique(axis=0, dropna=True)

DAYOFSERVICE          31
TRIPID             76453
PROGRNUMBER          103
STOPPOINTID         4714
PLANNEDTIME_ARR    68859
PLANNEDTIME_DEP    68859
ACTUALTIME_ARR     71982
ACTUALTIME_DEP     71965
dtype: int64

## Merge df_Jan with df_trips on 'TRIPID' and 'DAYOFSERVICE'
* **OBJECTIVE 1: Obtain a unique mapping for LINEID - STOPPOINTID - PROGRNUMBER - DIRECTION**
    * To obtain a detailed route for bus, we need to know successive stoppoints taken by a bus line
    * Above mapping gives us exact route followed by bus
    * A single LINEID may have different internal ROUTEIDs. But USERS ARE UNAWARE OF THEM; SO ARE THE DESIGNERS OF THE WEBAPP.
    * This happens because there is no definite timetable/ relation between LINEID and ROUTEID
    * Hence, we filter on LINEID --- choose random Date for a LINEID --- We choose a random TRIPID on that DATE --- Fetch LINE-STOPS-PROGRNUMBER-DIRECTION relation for them
* **OBJECTIVE 2: Obtain metadata about LINEID - ROUTEID**
    * The above step selects data for a particular TRIPID -- having a definite sequence of STOPPOINTIDs covered under the ROUTEID.
    * It practically loses data from the original source data
    * Hence, we need to consider data only for the corresponding ROUTEID entries for ML training --- else errors would occur
    * So, metadata about the ROUTEID chosen for the corresponding LINEID is stored
* **OBJECTIVE 3: Obtain a mapping for LINEID - Start PROGRNUMBER - END PROGRNUMBER - DIRECTION**
    * This helps in displaying the routes consisting of STOPPOINTIDs available at a particular STOPPOINTID

In [150]:
# Attach LINEID / DIRECTION / ROUTEID from the trip table to every stop-level record.
# The join key is the pair (TRIPID, DAYOFSERVICE); exact duplicate rows are removed
# both before and after the inner join.
df_Jan_trips = (
    df_Jan
    .drop_duplicates()
    .merge(
        df_trips[['DAYOFSERVICE', 'TRIPID', 'LINEID', 'DIRECTION', 'ROUTEID']],
        how='inner',
        on=['TRIPID', 'DAYOFSERVICE'],
    )
    .drop_duplicates()
)

### Object types and valid entries

In [151]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly (wrapping it in print() only appends a stray "None").
# `show_counts` replaces the deprecated `null_counts` keyword.
df_Jan_trips.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10525004 entries, 0 to 10525003
Data columns (total 11 columns):
 #   Column           Non-Null Count     Dtype         
---  ------           --------------     -----         
 0   DAYOFSERVICE     10525004 non-null  datetime64[ns]
 1   TRIPID           10525004 non-null  int32         
 2   PROGRNUMBER      10525004 non-null  int8          
 3   STOPPOINTID      10525004 non-null  int16         
 4   PLANNEDTIME_ARR  10525004 non-null  int32         
 5   PLANNEDTIME_DEP  10525004 non-null  int32         
 6   ACTUALTIME_ARR   10525004 non-null  int32         
 7   ACTUALTIME_DEP   10525004 non-null  int32         
 8   LINEID           10525004 non-null  object        
 9   DIRECTION        10525004 non-null  int8          
 10  ROUTEID          10525004 non-null  object        
dtypes: datetime64[ns](1), int16(1), int32(5), int8(2), object(2)
memory usage: 562.1+ MB
None


### Nature of data

In [105]:
# Preview the first 100 merged records.
df_Jan_trips.head(n=100)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863,41,1,41_3
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072,41,1,41_3
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140,41,1,41_3
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192,41,1,41_3
4,2018-01-01,5958355,7,15,24130,24130,24227,24257,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
95,2018-01-01,5958088,47,3958,27943,27943,28473,28473,66,1,66_11
96,2018-01-01,5958088,48,3959,27990,27990,28508,28508,66,1,66_11
97,2018-01-01,5958088,49,3960,28024,28024,28534,28534,66,1,66_11
98,2018-01-01,5958088,50,3961,28058,28058,28551,28562,66,1,66_11


### Unique entries

In [106]:
# Number of distinct non-null values in each column of the merged frame.
df_Jan_trips.nunique(axis=0, dropna=True)

DAYOFSERVICE          31
TRIPID             76453
PROGRNUMBER          103
STOPPOINTID         4714
PLANNEDTIME_ARR    68859
PLANNEDTIME_DEP    68859
ACTUALTIME_ARR     71982
ACTUALTIME_DEP     71965
LINEID               126
DIRECTION              2
ROUTEID              498
dtype: int64

### Save to CSV

In [107]:
# Persist the merged January frame; the row index is not written to the file.
df_Jan_trips.to_csv(
    "../DB/ML/Jan_trips_MERGED.csv",
    index=False,
)

## Check nature of data for a random LINEID

In [108]:
# Keep only the rows that belong to LINEID '41' (LINEID is stored as a string).
df_Jan_trips_test = df_Jan_trips.loc[df_Jan_trips['LINEID'].eq('41')]

### Object types and valid entries

In [155]:
# `show_counts` replaces the deprecated `null_counts` keyword of DataFrame.info().
df_Jan_trips_test.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156082 entries, 0 to 10523252
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   DAYOFSERVICE     156082 non-null  datetime64[ns]
 1   TRIPID           156082 non-null  int32         
 2   PROGRNUMBER      156082 non-null  int8          
 3   STOPPOINTID      156082 non-null  int16         
 4   PLANNEDTIME_ARR  156082 non-null  int32         
 5   PLANNEDTIME_DEP  156082 non-null  int32         
 6   ACTUALTIME_ARR   156082 non-null  int32         
 7   ACTUALTIME_DEP   156082 non-null  int32         
 8   LINEID           156082 non-null  object        
 9   DIRECTION        156082 non-null  int8          
 10  ROUTEID          156082 non-null  object        
dtypes: datetime64[ns](1), int16(1), int32(5), int8(2), object(2)
memory usage: 8.3+ MB


### Check if STOPPOINTIDs visited by different ROUTEIDs are different

#### Take unique values into dictionary

In [156]:
# Map every column name of the LINEID-41 subset to the array of its distinct values.
uniques = {
    col: df_Jan_trips_test[col].unique()
    for col in df_Jan_trips_test.columns
}

In [157]:
# Distinct ROUTEIDs found in the LINEID-41 subset.
uniques['ROUTEID']

array(['41_3', '41_10', '41_7', '41_8', '41_5', '41_4', '41_6', '41_9',
       '41_20', '41_21'], dtype=object)

#### Check random entries for ROUTEID

In [158]:
# All rows of ROUTEID '41_3' within the LINEID-41 subset, ordered by stop sequence.
# The mask is built from the frame being indexed: a mask taken from the full
# df_Jan_trips frame only works through index-label alignment and scans every
# merged row instead of just this subset.
df_Jan_trips_test.loc[df_Jan_trips_test['ROUTEID'] == '41_3'].sort_values(by='PROGRNUMBER')

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
7383709,2018-01-23,6216050,1,288,21600,21600,21581,21581,41,1,41_3
8807333,2018-01-26,6212621,1,288,66300,66300,66514,66514,41,1,41_3
4983921,2018-01-16,6115930,1,288,19800,19800,19783,19783,41,1,41_3
2931133,2018-01-09,6098110,1,288,76500,76500,76500,76500,41,1,41_3
1783885,2018-01-06,5972065,1,288,38400,38400,38357,38357,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
4535257,2018-01-14,6112919,51,4957,68636,68636,68737,68737,41,1,41_3
10293070,2018-01-31,6231183,51,4957,47131,47131,47595,47595,41,1,41_3
9438985,2018-01-29,6245143,51,4957,31490,31490,31762,31762,41,1,41_3
4400804,2018-01-14,6109174,51,4957,26496,26496,26293,26293,41,1,41_3


In [159]:
# All rows of ROUTEID '41_20' within the LINEID-41 subset, ordered by stop sequence.
# The mask is built from the frame being indexed: a mask taken from the full
# df_Jan_trips frame only works through index-label alignment and scans every
# merged row instead of just this subset.
df_Jan_trips_test.loc[df_Jan_trips_test['ROUTEID'] == '41_20'].sort_values(by='PROGRNUMBER')

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
1883383,2018-01-06,5955994,1,1174,57600,57600,57622,57622,41,1,41_20
9064766,2018-01-27,6225628,1,1174,57600,57600,57757,57757,41,1,41_20
6677252,2018-01-20,6121831,1,1174,57600,57600,57552,57552,41,1,41_20
4277167,2018-01-13,6097348,1,1174,57600,57600,57614,57614,41,1,41_20
1883384,2018-01-06,5955994,2,1175,57703,57703,57650,57650,41,1,41_20
...,...,...,...,...,...,...,...,...,...,...,...
4277210,2018-01-13,6097348,47,4915,60917,60917,60640,60654,41,1,41_20
1883420,2018-01-06,5955994,48,4957,60960,60960,60461,60461,41,1,41_20
4277211,2018-01-13,6097348,48,4957,60960,60960,60692,60692,41,1,41_20
6677298,2018-01-20,6121831,48,4957,60960,60960,60569,60569,41,1,41_20


**IT IS EVIDENT FROM ABOVE SAMPLES THAT:**<BR>
**THERE IS NO RELATION BETWEEN TIME OF OPERATION AND ROUTEID SELECTED**    
<BR>
<BR>
<BR>    

#### Choose a combination of keys for ROUTEID 41_3

In [160]:
# Pick one concrete trip of ROUTEID '41_3' (single TRIPID on a single day) and
# list its stops in visiting order.
# Every condition is evaluated on df_Jan_trips_test itself, so the boolean mask
# shares the index of the frame it filters instead of relying on label alignment
# with the much larger df_Jan_trips frame.
df_41_3_sample = df_Jan_trips_test.loc[
    (df_Jan_trips_test['LINEID'] == '41')
    & (df_Jan_trips_test['ROUTEID'] == '41_3')
    & (df_Jan_trips_test['TRIPID'] == 6216050)
    & (df_Jan_trips_test['DAYOFSERVICE'] == '2018-01-23 00:00:00')
    & (df_Jan_trips_test['DIRECTION'] == 1)
].sort_values(by='PROGRNUMBER')
df_41_3_sample

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
7383709,2018-01-23,6216050,1,288,21600,21600,21581,21581,41,1,41_3
7383710,2018-01-23,6216050,2,1171,21643,21643,21629,21629,41,1,41_3
7383711,2018-01-23,6216050,3,1172,21676,21676,21645,21645,41,1,41_3
7383712,2018-01-23,6216050,4,1173,21723,21723,21674,21696,41,1,41_3
7383713,2018-01-23,6216050,5,1174,21783,21783,21758,21758,41,1,41_3
7383714,2018-01-23,6216050,6,1175,21859,21859,21782,21808,41,1,41_3
7383715,2018-01-23,6216050,7,15,21916,21916,21900,21927,41,1,41_3
7383716,2018-01-23,6216050,8,17,21997,21997,21992,22018,41,1,41_3
7383717,2018-01-23,6216050,9,18,22046,22046,22065,22065,41,1,41_3
7383718,2018-01-23,6216050,10,19,22067,22067,22081,22094,41,1,41_3


#### Choose a combination of keys for ROUTEID 41_20

In [164]:
# Pick one concrete trip of ROUTEID '41_20' (single TRIPID on a single day) and
# list its stops in visiting order.
# Every condition is evaluated on df_Jan_trips_test itself, so the boolean mask
# shares the index of the frame it filters instead of relying on label alignment
# with the much larger df_Jan_trips frame.
df_41_20_sample = df_Jan_trips_test.loc[
    (df_Jan_trips_test['LINEID'] == '41')
    & (df_Jan_trips_test['ROUTEID'] == '41_20')
    & (df_Jan_trips_test['TRIPID'] == 5955994)
    & (df_Jan_trips_test['DAYOFSERVICE'] == '2018-01-06 00:00:00')
    & (df_Jan_trips_test['DIRECTION'] == 1)
].sort_values(by='PROGRNUMBER')
df_41_20_sample

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
1883383,2018-01-06,5955994,1,1174,57600,57600,57622,57622,41,1,41_20
1883384,2018-01-06,5955994,2,1175,57703,57703,57650,57650,41,1,41_20
1883385,2018-01-06,5955994,3,15,57780,57780,57716,57750,41,1,41_20
1883386,2018-01-06,5955994,4,17,57900,57900,57798,57798,41,1,41_20
1883387,2018-01-06,5955994,5,18,58001,58001,57865,57865,41,1,41_20
1883388,2018-01-06,5955994,6,19,58043,58043,57876,57876,41,1,41_20
1883389,2018-01-06,5955994,7,21,58140,58140,57909,57920,41,1,41_20
1883390,2018-01-06,5955994,8,7602,58186,58186,57949,57949,41,1,41_20
1883391,2018-01-06,5955994,9,85,58247,58247,57997,57997,41,1,41_20
1883392,2018-01-06,5955994,10,203,58320,58320,58109,58119,41,1,41_20


#### View sets of STOPPOINTIDs visited for particular  ROUTEIDs 

In [165]:
# Collapse each sample trip to the set of stops it visits and print both sets
# so they can be compared by eye.
stops_41_3 = set(df_41_3_sample['STOPPOINTID'])
stops_41_20 = set(df_41_20_sample['STOPPOINTID'])
print("ROUTE 41_3", stops_41_3, "\n")
print("ROUTE 41_20", stops_41_20)

ROUTE 41_3 {15, 17, 18, 1171, 1172, 1173, 1174, 1175, 19, 21, 3864, 3865, 3669, 288, 6054, 4910, 4911, 4912, 4913, 7602, 4914, 7348, 4915, 5076, 203, 204, 205, 5073, 5074, 5075, 1620, 85, 1622, 1623, 1624, 1625, 1626, 1627, 220, 1628, 1629, 1630, 3671, 3672, 3674, 3675, 3676, 3679, 3682, 4330, 4957} 

ROUTE 41_20 {15, 17, 18, 19, 21, 1174, 1175, 3864, 3865, 3669, 4910, 4911, 4912, 4913, 7602, 4914, 7348, 4915, 5076, 203, 204, 205, 5075, 1620, 85, 1622, 1623, 1624, 1625, 1626, 1627, 220, 1628, 1629, 1630, 3679, 4957, 3682}


**OBSERVATIONS FROM ABOVE SETS**<BR>
* **THOUGH BOTH ROUTES ARE INBOUND (DIRECTION = 1); STOPPOINTIDs VISITED ARE DIFFERENT**
* **AS ROUTEIDs ARE NOT KNOWN TO USER _AND_ ROUTEIDs ARE NOT TIMEBOUND; IT IS NOT POSSIBLE TO TRAIN ML MODEL USING ALL ROUTEIDs**