In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dython import nominal

## Source of Truth gathered in earlier notebooks
* **TRIPID**
    * TRIPID are shared across days for perticular LINEID on particular ROUTEID
    * For a single TRIPID, only one corresponding LINEID and ROUTEID are present
    * TRIPID for inbound and outbound directions are mutually exclusive
    * For certein TRIPIDs; PROGNUMBER do not start at 1; which should. It means, some entries are missing.
* **ROUTEID**
    * A LINEID has multiple ROUTEID 
    * A LINEID may / may not have trips with both INBOUND-going(1)/OUTBOUND-returning(2) direction 
    * **THOUGH 2 ROUTES FOR A LINEID ARE HAVING SAME DIRECTION; STOPPOINTIDs VISITED ARE DIFFERENT**
    * **AS ROUTEIDs ARE NOT KNOWN TO USER _AND_ ROUTEIDs ARE NOT TIMEBOUND; IT IS NOT POSSIBLE TO TRAIN ML MODEL USING ALL ROUTEIDs**


## Merge df_Jan with df_trips on 'TRIPID'

In [2]:
df_Jan_trips = pd.read_csv("../DB/ML/Jan_trips_MERGED.csv")

### Object types and valid entries

In [3]:
print(df_Jan_trips.info(null_counts = True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10525004 entries, 0 to 10525003
Data columns (total 11 columns):
 #   Column           Non-Null Count     Dtype 
---  ------           --------------     ----- 
 0   DAYOFSERVICE     10525004 non-null  object
 1   TRIPID           10525004 non-null  int64 
 2   PROGRNUMBER      10525004 non-null  int64 
 3   STOPPOINTID      10525004 non-null  int64 
 4   PLANNEDTIME_ARR  10525004 non-null  int64 
 5   PLANNEDTIME_DEP  10525004 non-null  int64 
 6   ACTUALTIME_ARR   10525004 non-null  int64 
 7   ACTUALTIME_DEP   10525004 non-null  int64 
 8   LINEID           10525004 non-null  object
 9   DIRECTION        10525004 non-null  int64 
 10  ROUTEID          10525004 non-null  object
dtypes: int64(8), object(3)
memory usage: 883.3+ MB
None


### Nature of data

In [4]:
df_Jan_trips.head(100)

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
0,2018-01-01,5958355,3,1172,23799,23799,23863,23863,41,1,41_3
1,2018-01-01,5958355,4,1173,23860,23860,23934,24072,41,1,41_3
2,2018-01-01,5958355,5,1174,23937,23937,24114,24140,41,1,41_3
3,2018-01-01,5958355,6,1175,24048,24048,24180,24192,41,1,41_3
4,2018-01-01,5958355,7,15,24130,24130,24227,24257,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
95,2018-01-01,5958088,47,3958,27943,27943,28473,28473,66,1,66_11
96,2018-01-01,5958088,48,3959,27990,27990,28508,28508,66,1,66_11
97,2018-01-01,5958088,49,3960,28024,28024,28534,28534,66,1,66_11
98,2018-01-01,5958088,50,3961,28058,28058,28551,28562,66,1,66_11


### Unique entries

In [5]:
df_Jan_trips.nunique()

DAYOFSERVICE          31
TRIPID             76453
PROGRNUMBER          103
STOPPOINTID         4714
PLANNEDTIME_ARR    68859
PLANNEDTIME_DEP    68859
ACTUALTIME_ARR     71982
ACTUALTIME_DEP     71965
LINEID               126
DIRECTION              2
ROUTEID              498
dtype: int64

## Check nature of data for a random LINEID

In [None]:
import pandas as pd

In [6]:
df_Jan_trips_test = df_Jan_trips.loc[(df_Jan_trips['LINEID'] == '41')]

### Object types and valid entries

In [7]:
df_Jan_trips_test.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156082 entries, 0 to 10523252
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   DAYOFSERVICE     156082 non-null  object
 1   TRIPID           156082 non-null  int64 
 2   PROGRNUMBER      156082 non-null  int64 
 3   STOPPOINTID      156082 non-null  int64 
 4   PLANNEDTIME_ARR  156082 non-null  int64 
 5   PLANNEDTIME_DEP  156082 non-null  int64 
 6   ACTUALTIME_ARR   156082 non-null  int64 
 7   ACTUALTIME_DEP   156082 non-null  int64 
 8   LINEID           156082 non-null  object
 9   DIRECTION        156082 non-null  int64 
 10  ROUTEID          156082 non-null  object
dtypes: int64(8), object(3)
memory usage: 14.3+ MB


### Check if STOPIDs visited by different ROUTEIDs are different

#### Take unique values into dictionary

In [8]:
uniques = {}
for col in df_Jan_trips_test.columns:
    uniques[col] = df_Jan_trips_test[col].unique()

In [9]:
uniques['ROUTEID']

array(['41_3', '41_10', '41_7', '41_8', '41_5', '41_4', '41_6', '41_9',
       '41_20', '41_21'], dtype=object)

#### Check random entries for ROUTEID

In [10]:
df_Jan_trips_test.loc[(df_Jan_trips['ROUTEID'] == '41_3')].sort_values(by = 'PROGRNUMBER')

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
7383709,2018-01-23,6216050,1,288,21600,21600,21581,21581,41,1,41_3
8807333,2018-01-26,6212621,1,288,66300,66300,66514,66514,41,1,41_3
4983921,2018-01-16,6115930,1,288,19800,19800,19783,19783,41,1,41_3
2931133,2018-01-09,6098110,1,288,76500,76500,76500,76500,41,1,41_3
1783885,2018-01-06,5972065,1,288,38400,38400,38357,38357,41,1,41_3
...,...,...,...,...,...,...,...,...,...,...,...
4535257,2018-01-14,6112919,51,4957,68636,68636,68737,68737,41,1,41_3
10293070,2018-01-31,6231183,51,4957,47131,47131,47595,47595,41,1,41_3
9438985,2018-01-29,6245143,51,4957,31490,31490,31762,31762,41,1,41_3
4400804,2018-01-14,6109174,51,4957,26496,26496,26293,26293,41,1,41_3


In [11]:
df_Jan_trips_test.loc[(df_Jan_trips['ROUTEID'] == '41_20')].sort_values(by = 'PROGRNUMBER')

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID
1883383,2018-01-06,5955994,1,1174,57600,57600,57622,57622,41,1,41_20
9064766,2018-01-27,6225628,1,1174,57600,57600,57757,57757,41,1,41_20
6677252,2018-01-20,6121831,1,1174,57600,57600,57552,57552,41,1,41_20
4277167,2018-01-13,6097348,1,1174,57600,57600,57614,57614,41,1,41_20
1883384,2018-01-06,5955994,2,1175,57703,57703,57650,57650,41,1,41_20
...,...,...,...,...,...,...,...,...,...,...,...
4277210,2018-01-13,6097348,47,4915,60917,60917,60640,60654,41,1,41_20
1883420,2018-01-06,5955994,48,4957,60960,60960,60461,60461,41,1,41_20
4277211,2018-01-13,6097348,48,4957,60960,60960,60692,60692,41,1,41_20
6677298,2018-01-20,6121831,48,4957,60960,60960,60569,60569,41,1,41_20


**IT IS EVIDENT FROM ABOVE SAMPLES THAT:**<BR>
**THERE IS NO REALTION BETWEEN TIME OF OPERATION AND ROUTEID SELECTED**    
<BR>
<BR>
<BR>    

#### Choose a combination of keys for ROUTEID 41_3

In [12]:
df_41_3_sample = df_Jan_trips_test.loc[(df_Jan_trips['LINEID'] == '41') & 
                (df_Jan_trips['ROUTEID'] == '41_3') & 
                (df_Jan_trips['TRIPID'] == 6216050) &
                (df_Jan_trips['DAYOFSERVICE'] == '2018-01-23 00:00:00') &
                (df_Jan_trips['DIRECTION'] == 1)
                ].sort_values(by = 'PROGRNUMBER')
df_41_3_sample

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID


#### Choose a combination of keys for ROUTEID 41_20

In [13]:
df_41_20_sample = df_Jan_trips_test.loc[(df_Jan_trips['LINEID'] == '41') & 
                (df_Jan_trips['ROUTEID'] == '41_20') & 
                (df_Jan_trips['TRIPID'] == 5955994) &
                (df_Jan_trips['DAYOFSERVICE'] == '2018-01-06 00:00:00') &
                (df_Jan_trips['DIRECTION'] == 1)
                ].sort_values(by = 'PROGRNUMBER')
df_41_20_sample

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LINEID,DIRECTION,ROUTEID


#### View sets of STOPPOINTIDs visited for particular  ROUTEIDs 

In [14]:
print("ROUTE 41_3",set(df_41_3_sample['STOPPOINTID']),"\n")
print("ROUTE 41_20",set(df_41_20_sample['STOPPOINTID']))

ROUTE 41_3 set() 

ROUTE 41_20 set()
