In [2]:
import geopandas as gpd
import pandas as pd
import os

In [3]:
stops = pd.read_csv('transit_stop_gtfs/stops.txt')
routes = pd.read_csv('transit_stop_gtfs/routes.txt')
trips = pd.read_csv('transit_stop_gtfs/trips.txt')
stop_times = pd.read_csv('transit_stop_gtfs/stop_times.txt')

### Data Index of GTFS

#### agency.txt
- agency_id
	- The agency_id field is an ID that uniquely identifies a transit agency. A transit feed may represent data from more than one agency. The agency_id is dataset unique. This field is optional for transit feeds that only contain data for a single agency
- agency_name
	- The agency_name field contains the full name of the transit agency.
- agency_url
	- The agency_url field contains the URL of the transit agency.
- agency_timezone
	- The agency_timezone field contains the timezone where the transit agency is located.
- agency_lang
	- The agency_lang field contains a two-letter ISO 639-1 code for the primary language used by this transit agency.
- agency_phone
	- The agency_phone field contains a single voice telephone number for the specified agency.
- agency_fare_url
	- The agency_fare_url specifies the URL of a web page that allows a rider to purchase tickets or other fare instruments for that agency online.

#### calendar.txt
- service_id
	- The service_id contains an ID that uniquely identifies a set of dates when service is available for one or more routes. Each service_id value can appear at most once in a calendar.txt file. This value is dataset unique. <span style="color:rgb(255, 192, 0)">It is referenced by the trips.txt file. </span>
- monday ~ sunday
	- The days field contain a binary value that indicates whether the service is valid for each days.
- start_date
	- The start_date field contains the start date for the service should be in YYYYMMDD format.
- end_date
	- The end_date field contains the end date for the service. This date is included in teh service interval adn should be in YYYMMDD format.

#### calendar_date.txt
- service_id
	-  The service_id contains an ID that uniquely identifies a set of dates when service is available for one or more routes. 
- date
	- The date field specifies a particular date when service availability is different than norm. You can use the exception_type field to indicate whether service is available on the specified date.
- exception_type
	- The exception_type indicates whether service is available on the date specified in the date field.


#### routes.txt
- route_id
	- The route_id field contains as ID that uniquely identifies a route. The route_id is dataset unique.
- agency_id
	- The agency_id field defines an agency for the specified route. This value is refereced from the agency.txt file. Use this field when you are providing data for routes from more than one agency.
- route_short_name
	- The route_short_name contains the short name of a route. This will often be a short, abstract identifier like "32", "100X", or "Green" that riders use to identify a route, but which doesn't give any indication of what places the route serves. At least one of route_short_name or route_long_name must be specified, or potentially both if appropriate. If the route does not have a short name, please specify a route_long_name and use an empty string as the value for this field.
- route_long_name
	- The route_long_name contains the full name of a route. This name is generally more descriptive than the route_short_name and will often include the route's destination or stop. At least one of route_short_name or route_long_name must be specified, or potentially both if appropriate. If the route does not have a long name, please specify a route_short_name and use an empty string as the value for this field.
- route_desc
	- The route_desc field contains a description of a route. Please provide useful, quality information. Do not simply duplicate the name of the route. For example, "A trains operate between Inwood-207, Manhattan and Far Rockaway-Mott Avenue, Queens at all times. Also from about midnight, additional A trains operate between Inwood-207 St and Lefferts Boulevard (trains typically alternate between Lefferts Blvd and Far Rockaway"
- route_type
	- The route_type field describes the type of transportation used on a route.
- route_url
	- The route_url field contains the URL of a web page about that particular route. This should be different from the agency_url
- route_color
	- In systems that have colors assigned to routes, the route_color field defines a color that corresponds to a route. The color must be provided as a six-character hexadecimal number, for example, 00FFFF. If no color is specified, the default route color is white(FFFFFF)
- route_text_color
	- The route_text_color field can be used to specify a legible color to use for text drawn against a background of route_color. The color must be provided as a six-character hexadecimal number. If no color is specified, the default text color is black (000000)

#### shape.txt
- shape_id
	- The shape_id field contains an ID that uniquely identifies a shape.
- shape_pt_lat
	- The shape_pt_lat field associates a shape point's latitude with a shape ID. The field value must be a valid WGS 84 latitude. Each row in shapes.txt represents a shape point in our shape definition.
- shape_pt_long
	- The shape_pt_lon field associates a shape point's longitude with a shape ID. The field value must be a valid WGS 84 longitude value from -180 to 180. Each row in shapes.txt represents a shape point in your shape definition.
- shape_pt_sequence
	- The shape_pt_sequence field associates the latitude and longitude of a shape point with its sequence order along the shape. The values for shape_pt_sequence must be non-negative integers, and they must increase along the trip.
- shape_dist_traveled
	- When used in the shapes.txt file, the shape_dist_traveled field positions a shape point as a distance traveled along a shape from the first shape point. The shape_dist_traveled field represents a real distance traveled along the route in units such as feet or kilometers. This information allows the trip planner to determine how much of the shape to draw when showing part of a trip on the map. The values used for shape_dist_traveled must increase along with shape_pt_sequence: they cannot be used to show reverse travel along a route.


#### stop_times.txt
- trip_id
	- The trip_id field contains an ID that identifies a trip. This value is referenced from teh trips.txt file.
- arrival_time
	- The arrival_time specifies the arrival time at a specific stop for a specific trip on a route. Times must be eight characters in HH:MM:SS format.
- departure_time
	- The departure_time specifies the departure time from a specific stop for a specific trip on a route. You must specify arrival and departure times for the first and last stops in a trip. Times must be eight digits in HH:MM:SS format.
- stop_id
	- The stop_id field contains an ID that uniquely identifies a stop. Multiple routes may use the same stop. The stop_id is referenced from the stops.txt file. If location_type is used in stops.txt, all stops referenced in stop_times.txt must have lovation_type of 0.
- stop_sequence
	- The stop_sequence field identifies the order of the stops for a particular trip. The values for stop_sequence must be non-negative integers, and they must increase along the trip.
- stop_headsign
	- The stop_headsign field contains the text that appears on a sign that identifies the trip's destination to passengers. Use this field to override the default trip_headsign when the headsign changes between stops. If this headsign is associated with an entire trip, use trip_headsign instead.
- pickup_type
	- The pickup_type field indicates whether passengers are picked up at a stop as part of the normal schedule or whether a pickup at the stop is not available. This field also allows the transit agency to indicate that passengers must call the agency or notify the driver to arrange a pickup at a particular stop.
- drop_off_type
	- The drop_off_type field indicates whether passengers are dropped off at a stop as part of the normal schedule or whether a drop off at the stop is not available. This field also allows the transit agency to indicate that passengers must call the agency or notify the driver to arrange a drop off at a particular stop.
- shape_dist_traveled
	- When used in the stop_times.txt file, the shape_dist_traveled field positions a stop as a distance from the first shape point. The shape_dist_traveled field represents a real distance traveled along the route in units such as feet or kilometers. The units used for shape_dist_traveled in the stop_times.txt file must match the units that are used for this field in the shapes.txt.file.


#### stops.txt
- stop_id
	- The stop_id field contains an ID that uniquely identifies a stop or station. Multiple routes may use the same stop. THe stop_id is dataset unique.
- stop_code
	- The stop_code field contains short text or a number that uniquely identifies the stop for passengers. Stop_codes are often used in phone-based transit information systems or printed on step signage to make it easier for riders to get a stop schedule or real-time arrival information for a particular stop. THe stop_code field should only be used for stop codes that are displayed to passengers. For internal codes, use stop_id. This field should be left blank for stops without a code.
- stop_name
	- The stop_name field contains the name of a stop or station. Please use a name that people will understand in the local and tourist vernacular.
- stop_desc
	- The stop_desc field contains a description of a stop. Please provide useful, quality information. Do not simply duplicate the name of the stop.
- stop_lat
- stop_lon
- zone_id
	- The zone_id field defines the fare zone for a stop ID. Zone IDs are required if you want to provide fare information using fare_rules.txt. IF this stop ID represents a station. the zone ID is ignored.
- stop_url
	- The stop_url field contains the URL of a web page about a particular stop. This should be different from the agency_url and the route_url fields. The value must be a fully qualified URL that includes http:// or https://, and any special characters in the URL must be correctly escaped. See http://www.w3.org/Addressing/URL for a description of how to create fully qualified URL values.
- location_type
	- The location_type field identifies whether this stop ID represents a stop or station. If no location type is specified, or the location_type is blank, stop IDs are treated as stops. Stations may have different properties from stops when they are represented on a map or used in trip planning. 0 or blank represents a stop, while 1 represents a station that contains one or more stop.
- parent_station
	- For stops that are physically located inside stations, the parent_station field identifies the station associated with the stop. To use this field, stops.txt must also contains a row where this stop ID is assigned location type=1
- stop_timezone
	- The stop_timezone field contains the timezone in which this stop or station is located. Please refer to Wikipedia List of Timezones for a list of valid values. If omitted, the stop should be assumed to be located in the timezone specified by agency_timezone in agency.txt
- wheelchair_boarding
	- The wheelchair_boarding field identifies whether wheelcahir boardings are possible from the specified stop or station. 0 or blank indicates there is no accessibility information for the stop. 1 indicates that at least some vehicles at this stop can be boarded by a rider in a wheelchair. 2 indicates wheelchair boarding is not possible at this stop.


#### trips.txt
- route_id
	- The route_id field contains an ID that uniquely identifies a route. THis value is referenced from the routes.txt file.
- service_id
	- The service_id contains an ID that uniquely identifies a set of dates when service is available for one or more routes. This value is referenced from the calendar.txt or calendar_dates.txt file.
- trip_id
	- The trip_id field contains an ID that identifies a trip. The trip_id is dataset unique.
- trip_headsign
	- The trip_headsign field contains the text that appears on a sign that identifies the trip's destination to passengers. Use this field to distinguish between different patterns of service in the same route. If the headsign changes during a trip, you can override the trip_headsign by specifying values for the the stop_headsign field in stop_times.txt.
- trip_short_name
	- The trip_short_name field contains the text that appears in schedules and sign boards to identify the trip to passengers, for example, to idntify train numbers for commuter rail trips. If riders do not commonly rely on trip names, please leave this field blank. A trip_short_name value, if provided, should uniquely identify a trip within a service day; it should not be used for destination names or limited/express designations.
- direction_id
	- The direction_id field contains a binary value that indicates the direction of travel for a trip. Use this field to distinguish between bi-directional trips with the same route_id. This field is not used in routing; it provides a way to separate trips by direction when publishing time tables. You can specify names for each direction with the trip_headsign_field.

### Data Exploration

In [4]:
stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,27,907933,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754553,-84.469302,,,,,,1.0
1,28,908023,WEST LAKE STATION,80 ANDERSON AVE NW & CSX TRANSPORTATION,33.753328,-84.445329,,,,,,1.0
2,39,907906,WEST LAKE STATION,80 ANDERSON AVE NW & CSX TRANSPORTATION,33.753247,-84.445568,,,,,,1.0
3,40,907907,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754517,-84.469824,,,,,,1.0
4,53,908051,DUNWOODY STATION,1200 HAMMOND DR NE & UNNAMED ST,33.920862,-84.344213,,,,,,1.0


In [5]:
routes.head()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,21618,MARTA,1,Marietta Blvd/Joseph E Lowery Blvd,,3,https://itsmarta.com/1.aspx,FF00FF,0
1,21619,MARTA,2,Ponce de Leon Avenue / Druid Hills,,3,https://itsmarta.com/2.aspx,008000,0
2,21620,MARTA,3,Martin Luther King Jr Dr/Auburn Ave,,3,https://itsmarta.com/3.aspx,FF8000,0
3,21621,MARTA,4,Moreland Avenue,,3,https://itsmarta.com/4.aspx,FF00FF,0
4,21622,MARTA,5,Piedmont Road / Sandy Springs,,3,https://itsmarta.com/5.aspx,00FFFF,0


In [6]:
trips.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed
0,21743,2,9141382,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155676,117635,1,0
1,21743,2,9141381,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155675,117635,1,0
2,21743,2,9141380,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155673,117635,1,0
3,21743,2,9141415,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155672,117635,1,0
4,21743,2,9141414,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155670,117635,1,0


In [7]:
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
0,9141380,6:43:00,6:43:00,27,1,,0,0,,1
1,9141380,6:46:00,6:46:00,28,2,,0,0,2.377,1
2,9141380,6:49:00,6:49:00,485,3,,0,0,5.1292,1
3,9141380,6:50:00,6:50:00,470,4,,0,0,6.3709,1
4,9141380,6:51:00,6:51:00,796,5,,0,0,7.0017,1


### **Merge Data (to find stops' type (bus, subway, streetcars))**

#### 1.trips <- route_type
route_type contains transportation types (Bus, Subway, and Streecar)

In [8]:
trips_with_routes = trips.merge(routes[['route_id', 'route_type']], on='route_id', how='left')
trips_with_routes.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,route_type
0,21743,2,9141382,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155676,117635,1,0,1
1,21743,2,9141381,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155675,117635,1,0,1
2,21743,2,9141380,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155673,117635,1,0,1
3,21743,2,9141415,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155672,117635,1,0,1
4,21743,2,9141414,BLUE EASTBOUND TO INDIAN CREEK STATION,,0,1155670,117635,1,0,1


#### 2.stop_times <- trips_with_routes
now we can merge these by common column (trip_id) so that each stops get route_type data

In [9]:
stop_times_with_route_type = stop_times.merge(trips_with_routes[['trip_id', 'route_type']], on='trip_id', how='left')
stop_times_with_route_type.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,route_type
0,9141380,6:43:00,6:43:00,27,1,,0,0,,1,1
1,9141380,6:46:00,6:46:00,28,2,,0,0,2.377,1,1
2,9141380,6:49:00,6:49:00,485,3,,0,0,5.1292,1,1
3,9141380,6:50:00,6:50:00,470,4,,0,0,6.3709,1,1
4,9141380,6:51:00,6:51:00,796,5,,0,0,7.0017,1,1


#### 3.stops <- stop_times_with_route_type
now we can make new stops dataframe which each stops have transportation type data

In [10]:
stops_with_type = stops.merge(stop_times_with_route_type[['stop_id', 'route_type']], on='stop_id', how='left')
stops_with_type.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,route_type
0,27,907933,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754553,-84.469302,,,,,,1.0,1
1,27,907933,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754553,-84.469302,,,,,,1.0,1
2,27,907933,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754553,-84.469302,,,,,,1.0,1
3,27,907933,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754553,-84.469302,,,,,,1.0,1
4,27,907933,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754553,-84.469302,,,,,,1.0,1


#### 4.Drop duplicated rows
There are several trips for each stops! We need to drop.

In [11]:
stops_with_type = stops_with_type.drop_duplicates(subset=['stop_id'])

Lets change type name to more legible ones

In [12]:
stops_with_type['station_type'] = stops_with_type['route_type'].apply(lambda x: 'Streetcar' if x == 0 else ('Subway' if x == 1 else ('Bus' if x == 3 else 'Others')))

In [13]:
stops_with_type2 = stops_with_type.reset_index()
stops_with_type2

Unnamed: 0,index,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,route_type,station_type
0,0,27,907933,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754553,-84.469302,,,,,,1.0,1,Subway
1,622,28,908023,WEST LAKE STATION,80 ANDERSON AVE NW & CSX TRANSPORTATION,33.753328,-84.445329,,,,,,1.0,1,Subway
2,1244,39,907906,WEST LAKE STATION,80 ANDERSON AVE NW & CSX TRANSPORTATION,33.753247,-84.445568,,,,,,1.0,1,Subway
3,1858,40,907907,HAMILTON E HOLMES STATION,70 HAMILTON E HOLMES DR NW & CSX TRANSPORTATION,33.754517,-84.469824,,,,,,1.0,1,Subway
4,2472,53,908051,DUNWOODY STATION,1200 HAMMOND DR NE & UNNAMED ST,33.920862,-84.344213,,,,,,1.0,1,Subway
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8903,1964669,99974332,213843,ANVIL BLOCK RD @ S 1ST ST,,33.625161,-84.317307,0.0,,,,,1.0,3,Bus
8904,1964784,99974333,213844,ANVIL BLOCK RD @ MORELAND AVE,,33.628359,-84.313178,0.0,,,,,0.0,3,Bus
8905,1964899,99974334,213839,OLD DIXIE RD @ SOUTHPOINT DR,,33.630066,-84.386122,0.0,,,,,1.0,3,Bus
8906,1965015,99974335,213840,OLD DIXIE RD @ SOUTHPOINT DR,,33.630325,-84.385943,0.0,,,,,0.0,3,Bus


### Save Stop Data Files by transportation type

In [14]:
streetcar_stops = stops_with_type[stops_with_type['station_type'] == 'Streetcar']
streetcar_stops.to_csv('streetcar_stops.csv', index=False)

In [15]:
folder_path = 'Stops_Cleaned'
file_path = os.path.join(folder_path, 'streetcar_stops.csv')
streetcar_stops.to_csv(file_path, index=False)

In [16]:
subway_stops = stops_with_type[stops_with_type['station_type'] == 'Subway']
folder_path = 'Stops_Cleaned'
file_path = os.path.join(folder_path, 'subway_stops.csv')
subway_stops.to_csv(file_path, index=False)

In [17]:
bus_stops = stops_with_type[stops_with_type['station_type'] == 'Bus']
folder_path = 'Stops_Cleaned'
file_path = os.path.join(folder_path, 'bus_stops.csv')
bus_stops.to_csv(file_path, index=False)

In [18]:
undefined_stops = stops_with_type[stops_with_type['station_type'] == 'Others']
undefined_stops

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,route_type,station_type
