In [20]:
# library
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
import re

# # ML
# from sklearn.model_selection import train_test_split
# # Classification
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix
# # Regression
# from statsmodels.tools.eval_measures import rmse
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# # KNN & Decision tree
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# # MinMax Scaler (Normalisation)
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import StandardScaler
# Warnings
import warnings
warnings.filterwarnings('ignore')

# **DTI-DS Capstone 2 (Exploratory Data Analysis)**

## General Overview

1. <mark>**Background:**</mark>
    
Transjakarta is the first Bus Rapid Transit (BRT) transportation system in Southeast Asia operating since 2004 in Jakarta, Indonesia. TransJakarta was designed as a mass transportation mode to support Indonesia’s Capital city’s around the clock activities. 

With the longest track in the world (251.2 km), as well as having 260 bus stops spread across 13 corridors. Transjakarta initially operates from 05.00 - 22.00 WIB, now it operates 24 hours available on certain corridors only. With its extensive network of routes and ease of use, Transjakarta has become the leading and favorite transportation for so called “Jakartans”. 

However, there are several problems that still lingers around and thus must be handled. Problems such as sexual harassment that still often occurs to women, pickpocketing, bus crashes, schedule punctuality, accumulation of passengers at bus stops, overcrowding inside the bus itself. All stemming from overcrowding and less than optimal fleet distribution and fleet schedule.

2. <mark>**Problem Statement:**</mark>

Overcrowding has led to several of Transjakarta’s pre-existing problems aforementioned in the Background. Thus, Transjakarta wants to research on its “overcrowding” problem associated with fleet distribution and schedule to help evaluate and improve its services to passengers (“Jakartans”) 

3. <mark>**Data:**</mark>

This data is the passenger data for the month of April 2023. It initially consists of 37,900 rows (reduced to 35,476 post-preprocessing) and 22 columns. The Data Can be seen as follows:

4. <mark>**Data Analysis:**</mark>

Overcrowding can be identified through several variables that helps us measure overall quantity of passengers. These variables can also help describe the demography of our passengers in the form of customer segmentation. For this research, we are focusing on the variables that can be associated with “overcrowding”. These variables will be <mark>**highlighted**</mark> with the <mark>**arrow (->)**</mark> notation below:

#### -> Biodata:
1.	<mark>**transID:**</mark> <u>Unique transaction id for every transaction</u>
2.	<mark>**payCardID:**</mark> <u>Customers main identifier. The card customers use as a ticket for entrance and exit.</u>
3.	<mark>**payCardBank:**</mark> <u>Customers card bank issuer name</u> <mark>**-> Payment Gateway Analysis**</mark>
4.	<mark>**payCardName:**</mark> <u>Customers name that is embedded in the card.</u>
5.	<mark>**payCardSex**</mark> <u>Customers sex that is embedded in the card</u> <mark>**-> Gender Analysis**</mark>
6.	<mark>**payCardBirthDate:**</mark> <u>Customers birth year</u> <mark>**-> Customer Segmentation by Age**</mark>
#### -> Journey (Trip Details):
7.	<mark>**corridorID:**</mark> <u>Corridor ID / Route ID as key for route grouping.</u> <mark>**-> Corridor Analysis**</mark>
8.	<mark>**corridorName:**</mark> <u>Corridor Name / Route Name contains Start and Finish for each route.</u> <mark>**-> Corridor Analysis**</mark>
9.	<mark>**direction:**</mark> <u>0 for Go, 1 for Back. Direction of the route. (0: Right_address -> Left_address & 1: Left_address -> Right_address)</u> <mark>**-> In/Out Analysis**</mark>
#### -> Journey (Tap-In details):
10.	<mark>**tapInStops:**</mark> <u>Tap In (entrance) Stops ID for identifying stops name</u>
11.	<mark>**tapInStopsName:**</mark> <u>Tap In (entrance) Stops Name where customers tap in.</u> <mark>**-> Bus Stop Analysis**</mark>
12.	<mark>**tapInStopsLat:**</mark> <u>Latitude of Tap In Stops</u> <mark>**-> Geo Analysis**</mark>
13.	<mark>**tapInStopsLon:**</mark> <u>Longitude of Tap In Stops</u>
14.	<mark>**stopStartSeq:**</mark> <u>Sequence of the stops, 1st stop, 2nd stops etc. Related to direction. (the N-th startingStop to the endingStop from Right_address (direc: 0) OR Left_address (direc: 1))</u> <mark>**-> stopCount Analysis**</mark>
15.	<mark>**tapInTime:**</mark> <u>Time of tap in. Date and time</u> <mark>**-> Time-Based Analysis**</mark>
#### -> Journey (Tap-Out details):
16.	<mark>**tapOutStops:**</mark> <u>Tap Out (Exit) Stops ID for identifying stops name</u>
17.	<mark>**tapOutStopsName:**</mark> <u>Tap out (exit) Stops Name where customers tap out.</u> <mark>**-> Bus Stop Analysis**</mark>
18.	<mark>**tapOutStopsLat:**</mark> <u>Latitude of Tap Out Stops</u> <mark>**-> Geo Analysis**</mark>
19.	<mark>**tapOutStopsLon:**</mark> <u>Longitude of Tap Out Stops</u>
20.	<mark>**stopEndSeq:**</mark> <u>Sequence of the stops, 1st stop, 2nd stops etc. Related to direction.(the N-th startingStop to the endingStop from Right_address (direc: 0) OR Left_address (direc: 1))</u> <mark>**-> stopCount Analysis**</mark>
21.	<mark>**tapOutTime:**</mark> <u>Time of tap out. Date and time</u> <mark>**-> Time-Based Analysis**</mark>
#### -> Journey (Trip Details):
22.	<mark>**payAmount:**</mark> <u>The number of what customers pay. Some are free. Some not.</u> <mark>**-> Revenue Analysis**</mark>

<br>
<br>
<br>
5. <mark>**Final Initial Hypothetical Thoughts (guiding concerns): **</mark>

Overcrowding can be best described as a phenomenon where the quantity of people exceed the threshold of collective and overall comfort of a cohort of people. To mitigate such problems. We can list the overall data analysis research findings, along with recommendations such as addressing customer segments that majorly contributes to “overcrowding”, fleet distribution and schedule that accommodate peak demand hours based on some of its busiest corridors (and its stops) whilst accommodating concerns such as female passenger safety by way of dedicated female rows on buses and female-only buses. Such implementation requires a certain degree of supervision, hence the optimal number of staff and their respective distribution along with comprehensive CCTV coverage. 

“Overcrowding” stems from congestion, and congestion does not always happen at the bus but rather the bus stops itself. This might cause certain problems such as pickpocketing and uncomfortable waiting conditions. To reduce such congestion, Transjakarta can increase its number of fleets, along with the aforementioned fleet distribution and schedule. This will also lessen the already high operating hours that might result in unexpected vehicle breakdowns, which might further worsen the problem as a delay in supply might trickle down to the whole system’s operational efficiency

In [21]:
# Import Data
df_tj = pd.read_csv('Transjakarta_cleaned.csv', sep= ',')

In [22]:
# ===== 'Transjakarta.csv'' =====
pd.set_option("display.max_columns", None)
df_tj

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,LDEA875J4U32YZ,213107623841273,emoney,Natalia Pratama,M,1964,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05823P,United Tractors 1,-6.183260,106.93243,28,2023-04-01 06:22:27,B03090P,Raya Penggilingan,-6.183068,106.93194,29.0,2023-04-01 08:15:45,3500.0
1,MSNJ848P3Q44GI,4139531858845,online,Tgk. Kacung Nashiruddin,F,1997,7E,Kampung Rambutan - Ragunan,1.0,B01062P,Jln. Gabus Raya,-6.301222,106.83623,7,2023-04-01 06:54:35,B03148P,RS Jantung Binawaluya,-6.308410,106.87071,15.0,2023-04-01 07:27:31,3500.0
2,JZZV014Y9O33OV,60459139923,flazz,Zelda Thamrin,M,1972,JIS3,Harmoni - Jakarta International Stadium,0.0,P00046,Danau Agung,-6.146869,106.85805,8,2023-04-01 07:32:50,P00161,Pecenongan,-6.167710,106.82819,12.0,2023-04-01 08:16:02,3500.0
3,EVRV919Q6A86EC,4301046448643115806,brizzi,Rama Firmansyah,F,1992,5C,PGC - Juanda,0.0,P00016,BKN,-6.257751,106.87000,1,2023-04-01 09:41:03,P00033,Cawang UKI,-6.250309,106.87360,2.0,2023-04-01 10:27:36,3500.0
4,MLDJ961C1Q41OU,3567863915368369,dki,"Ciaobella Prasetyo, S.Gz",F,1988,11D,Pulo Gebang - Pulo Gadung 2 via PIK,1.0,B00396P,Gg. Aim 2,-6.200395,106.93515,13,2023-04-01 10:33:57,P00270,Walikota Jakarta Timur,-6.212540,106.94537,30.0,2023-04-01 12:12:59,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35471,RQCC152K8Z33OB,370582193494134,emoney,Tomi Winarsih,F,1997,JAK.08,Roxy - Benhil,0.0,B01364P,Jln. Marabahan Cideng,-6.172066,106.80538,2,2023-04-30 21:43:18,B02995P,PU Irigasi,-6.201479,106.81079,26.0,2023-04-30 22:02:27,0.0
35472,UDKH290G9T77LN,348952647256687,emoney,Estiawan Januar,M,1992,6M,Stasiun Manggarai - Blok M,1.0,P00068,Gatot Subroto LIPI Arah Timur,-6.226549,106.81740,6,2023-04-30 21:46:00,P00066,Gatot Subroto Jamsostek Arah Timur,-6.232602,106.82162,7.0,2023-04-30 22:09:47,3500.0
35473,LUUF688T8X16WC,30441106310287,bni,"Hasan Nugroho, M.Kom.",M,2004,D11,Depok - BKN,1.0,B02822P,Pesona Khayangan,-6.382532,106.83001,7,2023-04-30 21:48:15,B00106P,Balaikota Depok 2,-6.394973,106.82277,11.0,2023-04-30 22:57:47,3500.0
35474,WBEY632U5D57DZ,180096606431591,emoney,KH. Laksana Kuswandari,F,1993,M7,Kampung Rambutan - Monas,1.0,P00137,Monas,-6.176248,106.82286,0,2023-04-30 21:51:08,P00204,RS Harapan Bunda,-6.301956,106.86803,23.0,2023-04-30 23:23:18,3500.0


## **Data Exploration**

In [23]:
df_tj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35476 entries, 0 to 35475
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transID           35476 non-null  object 
 1   payCardID         35476 non-null  int64  
 2   payCardBank       35476 non-null  object 
 3   payCardName       35476 non-null  object 
 4   payCardSex        35476 non-null  object 
 5   payCardBirthDate  35476 non-null  int64  
 6   corridorID        35476 non-null  object 
 7   corridorName      35476 non-null  object 
 8   direction         35476 non-null  float64
 9   tapInStops        35446 non-null  object 
 10  tapInStopsName    35476 non-null  object 
 11  tapInStopsLat     35476 non-null  float64
 12  tapInStopsLon     35476 non-null  float64
 13  stopStartSeq      35476 non-null  int64  
 14  tapInTime         35476 non-null  object 
 15  tapOutStops       35458 non-null  object 
 16  tapOutStopsName   35476 non-null  object

In [24]:
df_tj.describe()

Unnamed: 0,payCardID,payCardBirthDate,direction,tapInStopsLat,tapInStopsLon,stopStartSeq,tapOutStopsLat,tapOutStopsLon,stopEndSeq,payAmount
count,35476.0,35476.0,35476.0,35476.0,35476.0,35476.0,35476.0,35476.0,35476.0,34536.0
mean,4.232545e+17,1990.139305,0.500592,-6.214867,106.841542,13.543692,-6.214662,106.841228,21.197232,2707.768705
std,1.319346e+18,13.034856,0.500007,0.057869,0.060397,12.200414,0.059024,0.061045,13.786082,4227.721929
min,60403680000.0,1946.0,0.0,-6.394973,106.61473,0.0,-6.394973,106.61473,1.0,0.0
25%,180040700000000.0,1982.0,0.0,-6.245863,106.80347,4.0,-6.247225,106.80164,11.0,0.0
50%,3507503000000000.0,1990.0,1.0,-6.214587,106.83483,10.0,-6.214787,106.83458,18.0,3500.0
75%,4693230000000000.0,2001.0,1.0,-6.175528,106.88262,19.0,-6.174736,106.8834,29.0,3500.0
max,4.997694e+18,2012.0,1.0,-6.089429,107.02395,68.0,-6.091746,107.02366,77.0,20000.0


## **Statistical Analysis**

## **Data Analysis**

Use statistika deskriptif, inferensial, atau keduanya sesuai keperluan analisis. -> and able to explain the WHYs

In [25]:
# ===== stopStartSequence & stopEndSequence =====

# df_tj[['direction', 'stopStartSeq', 'stopEndSeq']]
df_tj[['direction', 'stopStartSeq', 'stopEndSeq']][df_tj['corridorID'] == '5'].reset_index()

# Alternate Syntax
# df_tj[df_tj['corridorID'] == '5'][['direction', 'stopStartSeq', 'stopEndSeq']]

Unnamed: 0,index,direction,stopStartSeq,stopEndSeq
0,21,0.0,1,9.0
1,70,1.0,7,12.0
2,287,0.0,11,12.0
3,347,0.0,13,14.0
4,662,0.0,13,14.0
...,...,...,...,...
238,34675,1.0,6,7.0
239,34823,0.0,10,13.0
240,35056,1.0,14,15.0
241,35182,1.0,1,8.0


In [26]:
# ===== stopStartSequence & stopEndSequence =====

# ===== DISCREPANCY DETECTED =====
# Stop Start > StopEnd (will ALWAYS be bigger)
# Validated: 37900 (totalRow) - 1344 (nullRows) = 36556 (curr rowCount)
df_tj[(df_tj['stopStartSeq'] < df_tj['stopEndSeq'])]

Unnamed: 0,transID,payCardID,payCardBank,payCardName,payCardSex,payCardBirthDate,corridorID,corridorName,direction,tapInStops,tapInStopsName,tapInStopsLat,tapInStopsLon,stopStartSeq,tapInTime,tapOutStops,tapOutStopsName,tapOutStopsLat,tapOutStopsLon,stopEndSeq,tapOutTime,payAmount
0,LDEA875J4U32YZ,213107623841273,emoney,Natalia Pratama,M,1964,11D,Pulo Gebang - Pulo Gadung 2 via PIK,0.0,B05823P,United Tractors 1,-6.183260,106.93243,28,2023-04-01 06:22:27,B03090P,Raya Penggilingan,-6.183068,106.93194,29.0,2023-04-01 08:15:45,3500.0
1,MSNJ848P3Q44GI,4139531858845,online,Tgk. Kacung Nashiruddin,F,1997,7E,Kampung Rambutan - Ragunan,1.0,B01062P,Jln. Gabus Raya,-6.301222,106.83623,7,2023-04-01 06:54:35,B03148P,RS Jantung Binawaluya,-6.308410,106.87071,15.0,2023-04-01 07:27:31,3500.0
2,JZZV014Y9O33OV,60459139923,flazz,Zelda Thamrin,M,1972,JIS3,Harmoni - Jakarta International Stadium,0.0,P00046,Danau Agung,-6.146869,106.85805,8,2023-04-01 07:32:50,P00161,Pecenongan,-6.167710,106.82819,12.0,2023-04-01 08:16:02,3500.0
3,EVRV919Q6A86EC,4301046448643115806,brizzi,Rama Firmansyah,F,1992,5C,PGC - Juanda,0.0,P00016,BKN,-6.257751,106.87000,1,2023-04-01 09:41:03,P00033,Cawang UKI,-6.250309,106.87360,2.0,2023-04-01 10:27:36,3500.0
4,MLDJ961C1Q41OU,3567863915368369,dki,"Ciaobella Prasetyo, S.Gz",F,1988,11D,Pulo Gebang - Pulo Gadung 2 via PIK,1.0,B00396P,Gg. Aim 2,-6.200395,106.93515,13,2023-04-01 10:33:57,P00270,Walikota Jakarta Timur,-6.212540,106.94537,30.0,2023-04-01 12:12:59,3500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35471,RQCC152K8Z33OB,370582193494134,emoney,Tomi Winarsih,F,1997,JAK.08,Roxy - Benhil,0.0,B01364P,Jln. Marabahan Cideng,-6.172066,106.80538,2,2023-04-30 21:43:18,B02995P,PU Irigasi,-6.201479,106.81079,26.0,2023-04-30 22:02:27,0.0
35472,UDKH290G9T77LN,348952647256687,emoney,Estiawan Januar,M,1992,6M,Stasiun Manggarai - Blok M,1.0,P00068,Gatot Subroto LIPI Arah Timur,-6.226549,106.81740,6,2023-04-30 21:46:00,P00066,Gatot Subroto Jamsostek Arah Timur,-6.232602,106.82162,7.0,2023-04-30 22:09:47,3500.0
35473,LUUF688T8X16WC,30441106310287,bni,"Hasan Nugroho, M.Kom.",M,2004,D11,Depok - BKN,1.0,B02822P,Pesona Khayangan,-6.382532,106.83001,7,2023-04-30 21:48:15,B00106P,Balaikota Depok 2,-6.394973,106.82277,11.0,2023-04-30 22:57:47,3500.0
35474,WBEY632U5D57DZ,180096606431591,emoney,KH. Laksana Kuswandari,F,1993,M7,Kampung Rambutan - Monas,1.0,P00137,Monas,-6.176248,106.82286,0,2023-04-30 21:51:08,P00204,RS Harapan Bunda,-6.301956,106.86803,23.0,2023-04-30 23:23:18,3500.0


In [27]:
# ===== corridorID =====

# available corridors
# 221 Unique Corridors

df_tj['corridorID'].value_counts().reset_index()
# df_tj['corridorID'].isna().sum()

Unnamed: 0,corridorID,count
0,1T,384
1,S21,376
2,JIS3,333
3,JAK.06,327
4,11P,323
...,...,...
216,1R,38
217,JAK.99,31
218,JAK.12,22
219,7B,17


In [28]:
# ===== corridorName =====

# available corridors
# 221 Unique Corridors

df_tj['corridorName'].value_counts().reset_index()

Unnamed: 0,corridorName,count
0,Cibubur - Balai Kota,384
1,Ciputat - CSW,376
2,Harmoni - Jakarta International Stadium,333
3,Pulo Gadung - Monas,333
4,Kampung Rambutan - Pondok Gede,327
...,...,...
211,Gondangdia - Balai Kota,39
212,Senen - Tanah Abang,38
213,Term. Pulo Gadung - Lampiri,31
214,Tanah Abang - Kebayoran Lama via Pos Pengumben,22


In [29]:
# Displaying the least occuring CorridorID
df_tj[['corridorID', 'corridorName']][df_tj['corridorID'] == 'M5'].reset_index()

Unnamed: 0,index,corridorID,corridorName
0,7758,M5,Matraman Baru - Ancol
1,8112,M5,Matraman Baru - Ancol
2,16673,M5,Matraman Baru - Ancol
3,16798,M5,Matraman Baru - Ancol
4,17209,M5,Matraman Baru - Ancol
5,17600,M5,Matraman Baru - Ancol
6,17878,M5,Matraman Baru - Ancol
7,22642,M5,Matraman Baru - Ancol
8,26745,M5,Matraman Baru - Ancol
9,26845,M5,Matraman Baru - Ancol


In [30]:
# ===== DISCREPANCY DETECTED =====
# 5 & M5 has the same CorridorName
# 1 & 1T has different CorridorName

df_tj[['corridorID', 'corridorName']][df_tj['corridorName'] == 'Matraman Baru - Ancol'].reset_index()

Unnamed: 0,index,corridorID,corridorName
0,21,5,Matraman Baru - Ancol
1,70,5,Matraman Baru - Ancol
2,287,5,Matraman Baru - Ancol
3,347,5,Matraman Baru - Ancol
4,662,5,Matraman Baru - Ancol
...,...,...,...
255,35056,5,Matraman Baru - Ancol
256,35182,5,Matraman Baru - Ancol
257,35278,M5,Matraman Baru - Ancol
258,35330,M5,Matraman Baru - Ancol


In [31]:
# ===== Locating (matching CorridorID with CorridorName) =====
# pd.set_option("display.max_rows", None)
# pd.reset_option()
df = df_tj.groupby(['corridorName', 'corridorID']).agg({'direction':'sum'})
df.head(30)

# Contains >1 CorridorID for a SINGLE route
# 1. Blok M - Kota
# 2. Kalideres - Bundaran HI via Veteran
# 3. 	Matraman Baru - Ancol	
# 4. Pinang Ranti - Pluit
# 5. Pulo Gadung - Monas

# Validation: CorridorID count (221) - CorridorName count (216)
# hence the other 5 DISCREPANCIES is found in the 5 CorridorName listed above

Unnamed: 0_level_0,Unnamed: 1_level_0,direction
corridorName,corridorID,Unnamed: 2_level_1
Andara - Stasiun Universitas Pancasila,JAK.44,118.0
BKN - Blok M,M7B,153.0
BSD - Jelambar,S11,59.0
BSD Serpong - Fatmawati,S12,50.0
Batusari - Grogol,8K,128.0
Bekasi Barat - Blok M,B13,62.0
Bekasi Barat - Kuningan,B14,109.0
Bekasi Timur - Cawang,B21,108.0
Bintara - Cipinang Indah,JAK.85,68.0
Bintaro - Blok M,8E,85.0


In [32]:
# ===== Locating (matching tapInStops with tapInStopsName) =====

df = df_tj.groupby(['tapInStops', 'tapInStopsName']).agg({'direction':'sum'})
df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,direction
tapInStops,tapInStopsName,Unnamed: 2_level_1
B00001P,18 Office Park,19.0
B00004P,ACC Simatupang,1.0
B00005P,ACE Hardware,7.0
B00008P,Adam Malik 1,0.0
B00017P,Akper Fatmawati Pondok Labu,3.0
B00018P,AKR Tower,0.0
B00027P,Al Izhar Pondok Labu 2,0.0
B00028P,Al Khairiyah School,0.0
B00030P,Al Mukhlisin,19.0
B00031P,Al Wathoniyah 1,0.0


In [33]:
# ===== Locating (matching tapOutStops with tapOutStopsName) =====

df = df_tj.groupby(['tapOutStops', 'tapOutStopsName']).agg({'direction':'sum'})
df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,direction
tapOutStops,tapOutStopsName,Unnamed: 2_level_1
B00002P,ABA,0.0
B00003P,Acacia Residence,1.0
B00004P,ACC Simatupang,1.0
B00005P,ACE Hardware,1.0
B00013P,Ahmad Yani Pisangan Baru,0.0
B00015P,Akademi Farmasi Mahadhika,0.0
B00022P,Akses Jembatan Ciliwung Balekambang,0.0
B00028P,Al Khairiyah School,0.0
B00029P,Al Mahbubiyah,1.0
B00030P,Al Mukhlisin,1.0


In [34]:
df_tj['tapInStops'].count()
df_tj['tapOutStops'].count()
df_tj['tapInStops'].value_counts()
df_tj['tapOutStops'].value_counts()

tapOutStops
P00016     302
P00170     264
B05725P    188
B05708P    158
P00137     158
          ... 
B05380P      1
B00652P      1
B00390P      1
B00851P      1
B00106P      1
Name: count, Length: 2202, dtype: int64

In [35]:
# Outer JOIN
# pd.merge(df_tj['tapInStops'], df_tj['tapOutStops'], how='inner', on='key')
print("H")

H


In [36]:
# ===== payAmount =====
# 3 price options:
# 1. free 
# 2. 3.5K
# 3. 20K

df_tj['payAmount'].value_counts()

payAmount
3500.0     17313
0.0        15577
20000.0     1646
Name: count, dtype: int64

In [37]:
# ===== Corridor Prices =====

df = df_tj.groupby(['payAmount', 'corridorID']).agg({'direction':'sum'})
df.head(30)
# df = df_tj[df_tj['payAmount'] == 20000]
# df.head(50)

# Insights: All CorridorIDs with Substring("JAK") -> are FREE to ride (just like the fact in the real world)

# Most Expensive Corridors (20K)
# 1K	81.0
# 1T	202.0
# 6P	48.0
# B13	64.0
# B14	112.0
# D31	43.0
# D32	88.0
# S12	51.0
# S31	79.0
# T21	91.0

Unnamed: 0_level_0,Unnamed: 1_level_0,direction
payAmount,corridorID,Unnamed: 2_level_1
0.0,10A,65.0
0.0,10B,66.0
0.0,11B,43.0
0.0,11C,30.0
0.0,11K,66.0
0.0,11M,81.0
0.0,11N,67.0
0.0,11P,161.0
0.0,12C,65.0
0.0,12F,25.0
