# Preprocessing of Train Station Datasets

In [1]:
import pandas as pd
import geopandas
import json, math, os, re, requests, time
import datetime
# Raise the notebook's display limits so wide / long frames are not truncated
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

In [2]:
# Navigate to the directory containing final_dataset.csv and the primary datasets
# that the cells below read and write using relative file names.
# The location can be overridden through the TRAIN_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the original path remains the default.
DATA_DIR = os.environ.get('TRAIN_DATA_DIR', 'C:/Users/Mindy/Documents/BT4222/Group Project/Primary Datasets/')
os.chdir(DATA_DIR)

# Section 1: MRT Opening Dates

In [3]:
# Dataset of MRT opening dates: the sheet has no header row, so columns are named here
MRT_OPENING_COLUMNS = ['STN_NO', 'STN_PREFIX', 'OPEN_DATE']
mrt_opening = pd.read_excel('mrt_openingDates.xlsx', header=None)
mrt_opening.columns = MRT_OPENING_COLUMNS

# Shift the default 0-based row labels so that they start at 1
mrt_opening.index += 1
mrt_opening.head()

Unnamed: 0,STN_NO,STN_PREFIX,OPEN_DATE
1,NS1 EW24,Jurong East,1990-03-10
2,NS2,Bukit Batok,1990-03-10
3,NS3,Bukit Gombak,1990-03-10
4,NS4,Choa Chu Kang,1990-03-10
5,NS5,Yew Tee,1996-02-10


In [4]:
# (rows, columns) of the MRT opening-dates table loaded above
mrt_opening.shape

(152, 3)

In [5]:
# Inspect the dtype of each column, in particular whether OPEN_DATE was parsed as a datetime
mrt_opening.dtypes

STN_NO                object
STN_PREFIX            object
OPEN_DATE     datetime64[ns]
dtype: object

In [6]:
# Cleaning STN_NO so that only the first value is taken.
# Multiple codes in one cell are separated by a non-breaking space ('\xa0'),
# so everything after the first '\xa0' is discarded.
mrt_opening['cleaned_STN_NO'] = mrt_opening['STN_NO'].apply(lambda x: str(x).strip().split('\xa0')[0])

# Dropping duplicated instances (ie. interchanges), keeping the last row for each code.
# .copy() makes the filtered result an independent frame, so the .loc assignment
# below writes to it directly instead of raising SettingWithCopyWarning.
mrt_opening = mrt_opening[~mrt_opening['cleaned_STN_NO'].duplicated(keep='last')].copy()

# Editing instances where STN_PREFIX = 'Botanic Gardens • Kebun Bunga'
# na=False keeps the mask strictly boolean even if a station name is missing
is_botanic_gardens = mrt_opening['STN_PREFIX'].str.contains('Botanic Gardens', na=False)
mrt_opening.loc[is_botanic_gardens, 'STN_PREFIX'] = 'Botanic Gardens'

In [7]:
# Number of distinct cleaned station codes (missing values, if any, count as one value)
mrt_opening['cleaned_STN_NO'].nunique(dropna=False)

145

In [8]:
# Preview the first rows, now including the derived cleaned_STN_NO column
mrt_opening.head()

Unnamed: 0,STN_NO,STN_PREFIX,OPEN_DATE,cleaned_STN_NO
1,NS1 EW24,Jurong East,1990-03-10,NS1
2,NS2,Bukit Batok,1990-03-10,NS2
3,NS3,Bukit Gombak,1990-03-10,NS3
4,NS4,Choa Chu Kang,1990-03-10,NS4
5,NS5,Yew Tee,1996-02-10,NS5


# Section 2: LRT Opening Dates

In [9]:
# Dataset of LRT opening / closing dates: the sheet has no header row, so columns are named here
LRT_OPENING_COLUMNS = ['STN_NO', 'STN_PREFIX', 'OPEN_DATE', 'CLOSE_DATE']
lrt_opening = pd.read_excel('lrt_openingDates.xlsx', header=None)
lrt_opening.columns = LRT_OPENING_COLUMNS

# Shift the default 0-based row labels so that they start at 1
lrt_opening.index += 1
lrt_opening.head()

Unnamed: 0,STN_NO,STN_PREFIX,OPEN_DATE,CLOSE_DATE
1,BP1 NS4,Choa Chu Kang,1999-11-06,NaT
2,BP2,South View,1999-11-06,NaT
3,BP3,Keat Hong,1999-11-06,NaT
4,BP4,Teck Whye,1999-11-06,NaT
5,BP5,Phoenix,1999-11-06,NaT


In [10]:
# Inspect the dtype of each column, in particular whether OPEN_DATE / CLOSE_DATE were parsed as datetimes
lrt_opening.dtypes

STN_NO                object
STN_PREFIX            object
OPEN_DATE     datetime64[ns]
CLOSE_DATE    datetime64[ns]
dtype: object

In [11]:
# Cleaning STN_NO so that only the first value is taken
# (everything after the first non-breaking space '\xa0' is discarded)
lrt_opening['cleaned_STN_NO'] = [
    str(raw_code).strip().split('\xa0')[0] for raw_code in lrt_opening['STN_NO']
]

# Correcting cleaned_STN_NO for Sengkang station
is_sengkang = lrt_opening['cleaned_STN_NO'] == 'NE16'
lrt_opening['cleaned_STN_NO'] = lrt_opening['cleaned_STN_NO'].mask(is_sengkang, 'STC')

# Section 3: Train Stations

In [12]:
def prep_train_station_df(train_stations):
    """Drop the unopened stations and normalise the COLOR column.

    Parameters
    ----------
    train_stations : pandas.DataFrame
        Station table with at least the columns STN_NO, STN_NAME and COLOR.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the rows whose STN_NO
        is 'CC18' or 'PW2', and with COLOR overwritten for LRT stations and
        for stations whose STN_NO contains 'TE', 'CE' or 'CG'.
    """
    # Leaving out Bukit Brown Station CC18 and Teck Lee Station PW2
    # since they are not yet open (ie. opening dates are TBA).
    # .copy() makes the filtered result an independent frame, so the .loc
    # assignments below do not raise SettingWithCopyWarning.
    not_yet_open = train_stations['STN_NO'].isin(['CC18', 'PW2'])
    train_stations = train_stations[~not_yet_open].copy()

    # (column searched, substring looked for, COLOR assigned on a match).
    # Rules are applied in this order, so a later rule overrides an earlier one:
    #   - all LRT stations:                         'OTHERS' -> 'LRT'
    #   - Thomson-East Coast Line (TE) stations:    'OTHERS' -> 'BROWN'
    #   - CE stations:                              'OTHERS' -> 'YELLOW'
    #   - CG stations:                              'OTHERS' -> 'GREEN'
    colour_rules = [
        ('STN_NAME', 'LRT', 'LRT'),
        ('STN_NO', 'TE', 'BROWN'),
        ('STN_NO', 'CE', 'YELLOW'),
        ('STN_NO', 'CG', 'GREEN'),
    ]
    for column, pattern, colour in colour_rules:
        # na=False: a missing name/code counts as "no match" instead of putting
        # NaN into the mask, which .loc would reject
        matches = train_stations[column].str.contains(pattern, na=False)
        train_stations.loc[matches, 'COLOR'] = colour

    # Disabled step kept for reference: converting the result to a GeoDataFrame
    # so that distances can be calculated in meters instead of degrees.
    # train_station_gdf = geopandas.GeoDataFrame(train_stations,
    #                                            geometry=geopandas.points_from_xy(train_stations['Longitude'], train_stations['Latitude']),
    #                                            crs={"init":"EPSG:4326"})
    # train_station_gdf = train_station_gdf.to_crs({"init": "EPSG:3857"})
    return train_stations

In [13]:
# Load the raw station table and run it through the clean-up helper
train_stations = prep_train_station_df(pd.read_csv('mrtsg.csv'))

# Separating MRT stations from LRT stations
is_lrt = train_stations['COLOR'] == 'LRT'
lrt = train_stations[is_lrt]
mrt = train_stations[~is_lrt]

In [14]:
# Preview the first rows of the MRT subset (rows whose COLOR is not 'LRT')
mrt.head()

Unnamed: 0,OBJECTID,STN_NAME,STN_NO,X,Y,Latitude,Longitude,COLOR
0,12,ADMIRALTY MRT STATION,NS10,24402.1063,46918.1131,1.440585,103.800998,RED
1,16,ALJUNIED MRT STATION,EW9,33518.6049,33190.002,1.316433,103.882893,GREEN
2,33,ANG MO KIO MRT STATION,NS16,29807.2655,39105.772,1.369933,103.849553,RED
5,153,BARTLEY MRT STATION,CC12,33168.3039,36108.7003,1.342828,103.879746,YELLOW
6,115,BAYFRONT MRT STATION,CE1,30867.0093,29368.625,1.281874,103.859073,YELLOW


# Section 4: Combining MRT Datasets

In [15]:
# Checking that the MRT station table and the MRT opening-dates table have the same number of rows
len(mrt) == len(mrt_opening)

True

In [16]:
# Check that the stations within these 2 dataframes are a 1-to-1 match.
# Every code printed exists in one frame but not in the other, so a perfect
# match prints nothing apart from the separator line.
station_codes = mrt['STN_NO'].values
opening_codes = mrt_opening['cleaned_STN_NO'].values

# Codes in the station table without an opening-date record
for code in (c for c in station_codes if c not in opening_codes):
    print(code)

print('---------------')

# Codes in the opening-date table without a station record
for code in (c for c in opening_codes if c not in station_codes):
    print(code)

---------------


In [17]:
# Attach the opening-date columns to each MRT station by matching STN_NO against
# cleaned_STN_NO. Both frames carry a STN_NO column, so the join suffixes them;
# the opening-dates copy is discarded and the station copy gets its name back.
mrt = (
    mrt
    .join(
        mrt_opening.set_index('cleaned_STN_NO'),
        on='STN_NO',
        lsuffix='_station',
        rsuffix='_opening',
    )
    .drop('STN_NO_opening', axis=1)
    .rename(columns={'STN_NO_station': 'STN_NO'})
)

In [18]:
# Double verifying correctness of join: the upper-cased name from the opening-dates
# table must appear inside the station table's STN_NAME on every row
mrt['NAME_VERIFIED'] = [
    prefix.upper() in full_name
    for prefix, full_name in zip(mrt['STN_PREFIX'], mrt['STN_NAME'])
]
mrt['NAME_VERIFIED'].value_counts()

True    145
Name: NAME_VERIFIED, dtype: int64

# Section 5: Combining LRT Datasets

In [19]:
# Checking that the LRT station table and the LRT opening-dates table have the same number of rows
len(lrt) == len(lrt_opening)

True

In [20]:
# Check that the stations within these 2 dataframes are a 1-to-1 match.
# Every code printed exists in one frame but not in the other, so a perfect
# match prints nothing apart from the separator line.
station_codes = lrt['STN_NO'].values
opening_codes = lrt_opening['cleaned_STN_NO'].values

# Codes in the station table without an opening-date record
for code in (c for c in station_codes if c not in opening_codes):
    print(code)

print('---------------')

# Codes in the opening-date table without a station record
for code in (c for c in opening_codes if c not in station_codes):
    print(code)

---------------


In [21]:
# Attach the opening / closing date columns to each LRT station by matching STN_NO
# against cleaned_STN_NO. Both frames carry a STN_NO column, so the join suffixes them;
# the opening-dates copy is discarded and the station copy gets its name back.
lrt = (
    lrt
    .join(
        lrt_opening.set_index('cleaned_STN_NO'),
        on='STN_NO',
        lsuffix='_station',
        rsuffix='_opening',
    )
    .drop('STN_NO_opening', axis=1)
    .rename(columns={'STN_NO_station': 'STN_NO'})
)

In [22]:
# Double verifying correctness of join: the upper-cased name from the opening-dates
# table must appear inside the station table's STN_NAME on every row
lrt['NAME_VERIFIED'] = [
    prefix.upper() in full_name
    for prefix, full_name in zip(lrt['STN_PREFIX'], lrt['STN_NAME'])
]
lrt['NAME_VERIFIED'].value_counts()

True    42
Name: NAME_VERIFIED, dtype: int64

# Section 6: Final Train Stations Dataset

In [23]:
# Every MRT row gets a far-future placeholder close date
mrt['CLOSE_DATE'] = datetime.datetime(2100, 12, 31)

# LRT rows keep a recorded close date; only the missing ones get a placeholder.
# NOTE(review): this placeholder (2100-03-31) differs from the MRT one (2100-12-31) —
# confirm whether the difference is intentional.
lrt.loc[lrt['CLOSE_DATE'].isnull(), 'CLOSE_DATE'] = datetime.datetime(2100, 3, 31)

# Stack the LRT rows under the MRT rows with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
mrt = pd.concat([mrt, lrt], ignore_index=True)

In [24]:
# Checking for duplicates: show every row that is fully identical to another row
# (keep=False flags all members of a duplicate group, not just the repeats)
is_duplicate_row = mrt.duplicated(keep=False)
mrt.loc[is_duplicate_row]

Unnamed: 0,OBJECTID,STN_NAME,STN_NO,X,Y,Latitude,Longitude,COLOR,STN_PREFIX,OPEN_DATE,NAME_VERIFIED,CLOSE_DATE


In [25]:
# Dropping unnecessary columns: they were only needed to verify the joins
mrt = mrt.drop(columns=['STN_PREFIX', 'NAME_VERIFIED'])
mrt

Unnamed: 0,OBJECTID,STN_NAME,STN_NO,X,Y,Latitude,Longitude,COLOR,OPEN_DATE,CLOSE_DATE
0,12,ADMIRALTY MRT STATION,NS10,24402.1063,46918.1131,1.440585,103.800998,RED,1996-02-10,2100-12-31
1,16,ALJUNIED MRT STATION,EW9,33518.6049,33190.002,1.316433,103.882893,GREEN,1989-11-04,2100-12-31
2,33,ANG MO KIO MRT STATION,NS16,29807.2655,39105.772,1.369933,103.849553,RED,1987-11-07,2100-12-31
3,153,BARTLEY MRT STATION,CC12,33168.3039,36108.7003,1.342828,103.879746,YELLOW,2009-05-28,2100-12-31
4,115,BAYFRONT MRT STATION,CE1,30867.0093,29368.625,1.281874,103.859073,YELLOW,2012-01-14,2100-12-31
5,115,BAYFRONT MRT STATION,DT16,30867.0093,29368.625,1.281874,103.859073,BLUE,2013-12-22,2100-12-31
6,140,BEAUTY WORLD MRT STATION,DT5,21598.1665,35931.2359,1.341223,103.77581,BLUE,2015-12-27,2100-12-31
7,37,BEDOK MRT STATION,EW5,38757.952,34024.7048,1.32398,103.929959,GREEN,1989-11-04,2100-12-31
8,143,BEDOK NORTH MRT STATION,DT29,37421.7438,35214.6804,1.334742,103.917955,BLUE,2017-10-21,2100-12-31
9,149,BEDOK RESERVOIR MRT STATION,DT30,39008.3132,35421.0409,1.336608,103.932208,BLUE,2017-10-21,2100-12-31


In [26]:
# Checking for null values: True for any column that contains at least one missing value
mrt.isna().any()

OBJECTID      False
STN_NAME      False
STN_NO        False
X             False
Y             False
Latitude      False
Longitude     False
COLOR         False
OPEN_DATE     False
CLOSE_DATE    False
dtype: bool

In [28]:
# Number of rows per COLOR value in the combined table, used as a sanity check against known station counts
mrt['COLOR'].value_counts()

# ___ LRT Stations ___
# Historically, there has been a total of 41 LRT stations opened.
# Ten Mile Junction is the only LRT station that has ceased operations.
# NOTE(review): the count shown for 'LRT' by this cell is 42, not 41 — confirm which figure is correct.

# ___ BROWN Line ___
# The Thomson-East Coast Line currently has only 3 stations in operation, as verified against the published MRT map

# ___ Other Lines ___
# The number of train stations for the rest of the lines was verified through a quick Google search

LRT       42
GREEN     35
BLUE      34
YELLOW    30
RED       27
PURPLE    16
BROWN      3
Name: COLOR, dtype: int64

# Saving Dataset

In [27]:
# Write the combined MRT + LRT station table to the current working directory.
# The DataFrame index is written as well (to_csv default), so the file has a leading unnamed column.
mrt.to_csv('train_stations_data.csv')