# Building the Database

Each citibike file records information about every single trip that was taken during a single month of the year. There are files for each month starting from June 2013. Each citibike file has the same format. The order and the description of the colomns are as follows:
- Trip Duration (seconds): The length of the trip in seconds
- Start Date & Time: The start time of the trip MM-DD-YYYY HH:MM:SS
- End Date & Time: The end time of the trip MM-DD-YYYY HH:MM:SS
- Start Station ID: The ID for the station where the trip started
- Start Station Name: The name of the station where the trip started
- Start Station Latitude: The latitude of the station where the trip started
- Start Station Longitude: The longitude of the station where the trip started
- End Station ID: The ID for the station where the trip ended
- End Station Name: The name of the station where the trip ended
- End Station Latitude: The latitude of the station where the trip ended
- End Station Longitude: The longitude of the station where the trip ended
- Bike ID: The ID for the bike that was used in the trip
- User Type: What type of user took the trip (Subscriber or Customer)
- Gender: The gender of the user (Male - 1, Female - 2, None - 0)
- Year of Birth: The year that the user was born

<img src="./Data/Images/DatabaseDiagramW.png" width="600" height="800" align="center"/>

*Note: If you cannot see the label names try editing the markdown code (double click diagram) and change the src from DatabaseDiagramW.png to DatabaseDiagramB.png

### Connecting to the Database

In [1]:
pip install psycopg2-binary;

Collecting psycopg2-binary
  Using cached psycopg2_binary-2.8.6-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.8.6
Note: you may need to restart the kernel to use updated packages.


In [2]:
import psycopg2

In [3]:
# Put the password in 
PGHOST = 'tripdatabase2.cmaaautpgbsf.us-east-2.rds.amazonaws.com'
PGDATABASE = ''
PGUSER = 'postgres'
PGPASSWORD = 'Josh1234'

In [4]:
# Database Context Manager
try:   
    # Set up a connection to the postgres server.    
    conn = psycopg2.connect(user = PGUSER,
                            port = "5432",
                            password = PGPASSWORD,
                            host = PGHOST,
                            database = PGDATABASE)
    # Create a cursor object
    cursor = conn.cursor()   
    cursor.execute("SELECT version();")
    record = cursor.fetchone()
    print("Connection Success:", record,"\n")

except (Exception, psycopg2.Error) as error:
    print("Error while connecting to PostgreSQL", error)

Connection Success: ('PostgreSQL 12.5 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11), 64-bit',) 



## Database Construction I - Creating the Staging Tables

#### A - Installs, Imports, Functions, etc.

In [5]:
pip install s3fs;

Collecting botocore<1.19.53,>=1.19.52
  Using cached botocore-1.19.52-py2.py3-none-any.whl (7.2 MB)
[31mERROR: boto3 1.17.12 has requirement botocore<1.21.0,>=1.20.12, but you'll have botocore 1.19.52 which is incompatible.[0m
[31mERROR: awscli 1.19.12 has requirement botocore==1.20.12, but you'll have botocore 1.19.52 which is incompatible.[0m
Installing collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.20.12
    Uninstalling botocore-1.20.12:
      Successfully uninstalled botocore-1.20.12
Successfully installed botocore-1.19.52
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
import s3fs
import os
from io import StringIO
import Queries
from urllib.parse import urlencode
import requests

In [7]:
# The S3 Bucket that will be used to store the data should be created beforehand
ACCESS_KEY_ID = 'AKIARJEUISD2VILSZ6HM'
ACCESS_SECRET_KEY = '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'

fs = s3fs.S3FileSystem(anon=False, key = ACCESS_KEY_ID, secret= ACCESS_SECRET_KEY)

In [8]:
api_key = 'AIzaSyCrG_VK47xMKjER4zpHyd3FJNLFn2weNFY'

In [10]:
def upload_data(conn, data: pd.DataFrame(), table: str):
    datastream = StringIO()
    cursor = conn.cursor()
    
    data.to_csv(datastream, index=False, header=False)
    datastream.seek(0)
    
    cursor.execute('rollback;')
    cursor.copy_from(datastream,table,sep=',')
    conn.commit()
    
    return None    

#### Geocoding Functions


In [82]:
def get_address_components(address, data_type='json'):
    endpoint = f'https://maps.googleapis.com/maps/api/geocode/{data_type}'
    params = {'address': address, 'key': api_key}
    url_params = urlencode(params)
    url = f"{endpoint}?{url_params}"
    
    r = requests.get(url)
    if r.status_code not in range(200,299):
        return {}

    try:
        return r.json()['results'][0]['address_components']
    except IndexError:
        return -1


def get_latlong_components(lat, long, data_type = 'json'):
    url = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat}, {long}&key={api_key}"
    
    r = requests.get(url)
    if r.status_code not in range(200,299):
        return {}
   
    try:
        return r.json()['results'][0]['address_components']
    except IndexError:
        return -1


def extract_zipcode(components):
    for comp in components:
        if comp.get('types')[0] == 'postal_code':
            return comp.get('long_name')
        

In [83]:
def input_zipcode(df):
    if df.end_lat < 10:
        return extract_zipcode(get_address_components(df.endname))
    else:
        return extract_zipcode(get_latlong_components(df.end_lat, df.end_long))

In [76]:
staging_schema_query = """CREATE SCHEMA IF NOT EXISTS staging;"""
cursor.execute("rollback;")
cursor.execute(staging_schema_query)

#### Creating the BayWheels Staging Table

In [17]:
bay_filenames = fs.ls("s3://williams-citibike/TripData/BayWheels")

In [18]:
# TAbles module. One function for all the tables. 
bay_staging_query = """
               CREATE TABLE IF NOT EXISTS staging.bay_trip (
                   starttime TIMESTAMP,
                   endtime TIMESTAMP,
                   startID VARCHAR,
                   startname VARCHAR(128),
                   start_lat REAL,
                   start_long REAL,
                   endID VARCHAR,
                   endname VARCHAR(128),
                   end_lat REAL,
                   end_long REAL             
              );
              """
cursor.execute("rollback;")
cursor.execute(bay_staging_query)
conn.commit()

In [19]:
def populate_bay_staging(datafile: str) -> None:
    """Grabs the data from the s3 bucket and edits it so that it can be uploaded to the staging table
    
    Parameters
    ----------
    datafile : str
        The name of a file in the s3 bucket without the s3:// prefix

    Returns
    -------
    None:
        If executed properly the database should now have rows corresponding to the rows in the data
    """
    columns = ['start_time','end_time',
               'start_station_id', 'start_station_name', 
               'start_station_latitude', 'start_station_longitude', 
               'end_station_id', 'end_station_name',
               'end_station_latitude', 'end_station_longitude']


    altcols = ['started_at','ended_at',
               'start_station_id', 'start_station_name',
               'start_lat', 'start_lng',
               'end_station_id', 'end_station_name',
               'end_lat', 'end_lng']
        
    na_fills = {'start_lat': -1,'start_lng': -1,
               'end_lat': -1, 'end_lng': -1}
    
    with fs.open("s3://"+datafile, 'r') as file:
        try:
            data = pd.read_csv(file, usecols = columns, na_values="")[columns]
        except:    
            file.seek(0)
            data = pd.read_csv(file, usecols = altcols, na_values="")[altcols]
            data.fillna(value=na_fills, inplace=True)
        
        #Some stations have commas in their name causing the copy_from to register extra data fields
        data.iloc[:, 3] = data.iloc[:, 3].str.replace(',','_')
        data.iloc[:, 7] = data.iloc[:, 7].str.replace(',','_')
        
        upload_data(conn, data, 'staging.bay_trip')

    print(f"Finished Uploading to Bay Staging Table: {datafile}")
    return None

In [20]:
for file in bay_filenames:
    populate_bay_staging(file)

Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/2017-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201801-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201802-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201803-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201804-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201805-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201806-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201807-fordgobike-tripdata.csv
Finished Uploading to Bay Staging Table: williams-citibike/TripData/BayWheels/201808-fordgobike-tripdata.csv
Finished Uploading to

#### Creating the BlueBike Staging Table

In [21]:
blue_filenames = fs.ls("s3://williams-citibike/TripData/BlueBike")

In [22]:
# TAbles module. One function for all the tables. 
blue_staging_query = """
               CREATE TABLE IF NOT EXISTS staging.blue_trip (
                   starttime TIMESTAMP,
                   endtime TIMESTAMP,
                   startID NUMERIC,
                   startname VARCHAR(128),
                   start_lat REAL,
                   start_long REAL,
                   endID NUMERIC,
                   endname VARCHAR(128),
                   end_lat REAL,
                   end_long REAL              
              );
              """
cursor.execute("rollback;")
cursor.execute(blue_staging_query)
conn.commit()

In [23]:
def populate_blue_staging(datafile: str) -> None:
    """Grabs the data from the s3 bucket and edits it so that it can be uploaded to the staging table
    
    Parameters
    ----------
    datafile : str
        The name of a file in the s3 bucket without the s3:// prefix

    Returns
    -------
    None:
        If executed properly the database should now have rows corresponding to the rows in the data
    """
      
    columns = ['starttime','stoptime',
               'start station id', 'start station name',
               'start station latitude', 'start station longitude',
               'end station id', 'end station name',
               'end station latitude', 'end station longitude']
    
    with fs.open("s3://"+datafile, 'r') as file:
        data = pd.read_csv(file, usecols=columns, na_values = "")[columns]
        
        data.iloc[:, 3] = data.iloc[:, 3].str.replace(',','_')
        data.iloc[:, 7] = data.iloc[:, 7].str.replace(',','_')
        
        upload_data(conn,data,'staging.blue_trip')
    
    print(f"Finished Uploading to Blue Staging Table: {datafile}")
    return None

In [24]:
# Data starts from 2015, any data before data doesn't have location data
for file in blue_filenames[5:]:
    populate_blue_staging(file)

Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201501-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201502-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201503-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201504-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201505-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201506-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201507-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201508-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/BlueBike/201509-hubway-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citi

#### Creating the Capital Staging Table

In [11]:
capital_filenames = fs.ls("s3://williams-citibike/TripData/CapitalBike")

In [26]:
# TAbles module. One function for all the tables. 
capital_staging_query = """
               CREATE TABLE IF NOT EXISTS staging.capital_trip (
                   starttime TIMESTAMP,
                   endtime TIMESTAMP,
                   startID NUMERIC,
                   startname VARCHAR(128),
                   start_lat REAL,
                   start_long REAL,
                   endID NUMERIC,
                   endname VARCHAR(128),
                   end_lat REAL,
                   end_long REAL              
              );
              """
cursor.execute("rollback;")
cursor.execute(capital_staging_query)
conn.commit()

In [27]:
def populate_capital_staging(datafile: str) -> None:
    """Grabs the data from the s3 bucket and edits it so that it can be uploaded to the staging table
    
    Parameters
    ----------
    datafile : str
        The name of a file in the s3 bucket without the s3:// prefix

    Returns
    -------
    None:
        If executed properly the database should now have rows corresponding to the rows in the data
    """
    
    columns = ['Start date', 'End date',
               'Start station number', 'Start station',
               'End station number', 'End station']
    
    altcolumns = ['started_at','ended_at',
                  'start_station_id', 'start_station_name',
                  'start_lat', 'start_lng',
                  'end_station_id', 'end_station_name',
                  'end_lat', 'end_lng']
    
    with fs.open("s3://"+datafile, 'r') as file:
        try:   
            data = pd.read_csv(file, usecols=columns, na_values = "")[columns]
            data.insert(4,'start_lat', -1)
            data.insert(5,'start_lng',-1)

            data.insert(8,'end_lat', -1)
            data.insert(9,'end_lng',-1)
        except:
            file.seek(0)
            data = pd.read_csv(file, usecols=altcolumns, na_values = "")[altcolumns]
            data.fillna({'start_station_id': -1, 'end_station_id':-1, 
                         'start_lat': -1, 'start_lng': -1,
                         'end_lat': -1, 'end_lng': -1}, inplace=True)
        
        data.iloc[:, 3] = data.iloc[:, 3].str.replace(',','_')
        data.iloc[:, 7] = data.iloc[:, 7].str.replace(',','_')

        upload_data(conn,data,'staging.capital_trip')
    
    print(f"Finished Uploading to Blue Staging Table: {datafile}")
    return None

In [28]:
for file in capital_filenames:
    populate_capital_staging(file)


Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2010-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2011-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2012Q1-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2012Q2-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2012Q3-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2012Q4-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2013Q1-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/TripData/CapitalBike/2013Q2-capitalbikeshare-tripdata.csv
Finished Uploading to Blue Staging Table: williams-citibike/

#### Creating the CitiBike Staging Table

In [1]:
citi_filenames = fs.ls("s3://williams-citibike/TripData/CitiBike")

NameError: name 'fs' is not defined

Get rid of bikeID:gender

In [30]:
# TAbles module. One function for all the tables. 
citi_staging_query = """
               CREATE TABLE IF NOT EXISTS staging.citi_trip (
                   tripduration NUMERIC, 
                   starttime TIMESTAMP,
                   endtime TIMESTAMP,
                   startID NUMERIC,
                   startname VARCHAR(128),
                   start_lat REAL,
                   start_long REAL,
                   endID NUMERIC,
                   endname VARCHAR(128),
                   end_lat REAL,
                   end_long REAL              
              );
              """
cursor.execute("rollback;")
cursor.execute(citi_staging_query)
conn.commit()

In [31]:
def populate_citi_staging(datafile: str) -> None:
    """Grabs the data from the s3 bucket and edits it so that it can be uploaded to the staging table
    
    Parameters
    ----------
    datafile : str
        The name of a file in the s3 bucket without the s3:// prefix

    Returns
    -------
    None:
        If executed properly the database should now have rows corresponding to the rows in the data
    """
       
    with fs.open("s3://"+datafile, 'r') as file:
        data = pd.read_csv(file, na_values ="", usecols=list(range(0,11)))   # Can't use the C engine to speed this up
        data.fillna(-1, inplace=True)   # Empty spaces need to be integers for birthyear REAL type in database
        
        #Some stations have commas in their name causing the copy_from to register extra data fields
        data.iloc[:, 4] = data.iloc[:, 4].str.replace(',','_')
        data.iloc[:, 8] = data.iloc[:, 8].str.replace(',','_')
        
        data.iloc[:, 3] = data.iloc[:, 3].astype('int32')
        data.iloc[:, 7] = data.iloc[:, 7].astype('int32')
        
        upload_data(conn,data,'staging.citi_trip')
        
    print(f"Finished Uploading to Citi Staging Table: {datafile}")
    return None

In [None]:
for file in citi_filenames:
    populate_citi_staging(file)

Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2013-07 - Citi Bike trip data.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2013-08 - Citi Bike trip data.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2013-09 - Citi Bike trip data.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2013-10 - Citi Bike trip data.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2013-11 - Citi Bike trip data.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2013-12 - Citi Bike trip data.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/201306-citibike-tripdata.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2014-01 - Citi Bike trip data.csv
Finished Uploading to Citi Staging Table: williams-citibike/TripData/CitiBike/2014-02 - Citi Bike trip data.c

#### Creating the Divvy Staging Table

In [14]:
divvy_filenames = fs.ls("s3://williams-citibike/TripData/DivvyBike")

In [15]:
# TAbles module. One function for all the tables. 
divvy_staging_query = """
               CREATE TABLE IF NOT EXISTS staging.divvy_trip (
                   starttime TIMESTAMP,
                   endtime TIMESTAMP,
                   startID NUMERIC,
                   startname VARCHAR(128),
                   start_lat REAL,
                   start_long REAL,
                   endID NUMERIC,
                   endname VARCHAR(128),
                   end_lat REAL,
                   end_long REAL             
              );
              """
cursor.execute("rollback;")
cursor.execute(divvy_staging_query)
conn.commit()

In [16]:
def populate_divvy_staging(datafile: str) -> None:
    """Grabs the data from the s3 bucket and edits it so that it can be uploaded to the staging table
    
    Parameters
    ----------
    datafile : str
        The name of a file in the s3 bucket without the s3:// prefix

    Returns
    -------
    None:
        If executed properly the database should now have rows corresponding to the rows in the data
    """
    
    columns = ['started_at', 'ended_at',
               'start_station_id', 'start_station_name',
               'start_lat', 'start_lng',
               'end_station_id', 'end_station_name',
               'end_lat', 'end_lng']
    
    altcolumns = ['starttime', 'stoptime',
                  'from_station_id', 'from_station_name',
                  'to_station_id','to_station_name']
    
    alt3 = ['start_time', 'end_time',
            'from_station_id', 'from_station_name',
            'to_station_id','to_station_name']
    
    names = ['starttime', 'endtime','startid','startname','endid','endname']
    
    with fs.open("s3://"+datafile, 'r') as file:
        try:
            data = pd.read_csv(file, usecols=columns, na_values="", parse_dates=[0,1])[columns]
            data.fillna({'start_station_id': -1, 'end_station_id':-1, 
                         'start_lat': -1, 'start_lng': -1,
                         'end_lat': -1, 'end_lng': -1}, inplace=True)            
        except ValueError:
            file.seek(0)
            try:
                data = pd.read_csv(file, usecols=altcolumns, na_values = "", parse_dates=[0,1])[altcolumns]
                data.columns = names
            except ValueError:
                file.seek(0)
                try:
                    data = pd.read_csv(file, usecols=alt3, na_values = "", parse_dates=[0,1])[alt3]
                    data.columns = names
                except:
                    file.seek(0)
                    data = pd.read_csv(file, usecols=[1,2,5,6,7,8], na_values="", parse_dates=[0,1])
                    data.columns = names
        
            data.insert(4,'start_lat', -1)
            data.insert(5,'start_lng',-1)

            data.insert(8,'end_lat', -1)
            data.insert(9,'end_lng',-1)
            
            data.fillna({'startid': -1, 'endidd':-1}, inplace=True)

        data.iloc[:, 3] = data.iloc[:, 3].str.replace(',','_')
        data.iloc[:, 7] = data.iloc[:, 7].str.replace(',','_')
        
        
        upload_data(conn,data,'staging.divvy_trip')
        
        
    print(f"Finished Uploading to Divvy Staging Table: {datafile}")
    return None

In [17]:
for file in divvy_filenames:
    populate_divvy_staging(file)

Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202004-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202005-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202006-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202007-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202008-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202009-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202010-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/202011-divvy-tripdata.csv
Finished Uploading to Divvy Staging Table: williams-citibike/TripData/DivvyBike/Divvy_Trips_2013.csv
Finished Uploading to Divvy Staging Table: williams

## Database Construction II - Creating the Station Tables

In [13]:
pip install geopandas

Collecting geopandas
  Using cached geopandas-0.8.2-py2.py3-none-any.whl (962 kB)
Collecting pyproj>=2.2.0
  Using cached pyproj-3.0.0.post1-cp37-cp37m-manylinux2010_x86_64.whl (6.4 MB)
Collecting fiona
  Using cached Fiona-1.8.18-cp37-cp37m-manylinux1_x86_64.whl (14.8 MB)
Collecting shapely
  Using cached Shapely-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (1.0 MB)
Collecting cligj>=0.5
  Using cached cligj-0.7.1-py3-none-any.whl (7.1 kB)
Collecting munch
  Using cached munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting click-plugins>=1.0
  Using cached click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: pyproj, cligj, munch, click-plugins, fiona, shapely, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.1 fiona-1.8.18 geopandas-0.8.2 munch-2.5.0 pyproj-3.0.0.post1 shapely-1.7.1
Note: you may need to restart the kernel to use updated packages.


In [14]:
import geopandas as gpd
import shapely

In [15]:
def get_stations(conn, service: str, drop_indices: list=[]):
    station_query = f"""
            SELECT DISTINCT ON(endid) endid, endname, end_lat, end_long 
              FROM staging.{service}_trip
             ORDER BY endid;
            """
    station = pd.read_sql(station_query, conn)
    station.dropna(inplace=True)
    
    if len(drop_indices) > 0:
        station = station.set_index('endid').drop(drop_indices).reset_index()
    
    return station

In [16]:
def database_upload(conn, geodf, table):
    cursor = conn.cursor()
    
    stream = StringIO()
    geodf.to_csv(stream, sep='\t', index=False, header=False)
    stream.seek(0)
    
    cursor.copy_from(stream, table, sep='\t')
    conn.commit()
    
    return None

In [22]:
stations_schema_query = """CREATE SCHEMA IF NOT EXISTS stations;"""
cursor.execute("rollback;")
cursor.execute(stations_schema_query)

#### Geocoding Functions


In [82]:
def get_address_components(address, data_type='json'):
    endpoint = f'https://maps.googleapis.com/maps/api/geocode/{data_type}'
    params = {'address': address, 'key': api_key}
    url_params = urlencode(params)
    url = f"{endpoint}?{url_params}"
    
    r = requests.get(url)
    if r.status_code not in range(200,299):
        return {}

    try:
        return r.json()['results'][0]['address_components']
    except IndexError:
        return -1


def get_latlong_components(lat, long, data_type = 'json'):
    url = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat}, {long}&key={api_key}"
    
    r = requests.get(url)
    if r.status_code not in range(200,299):
        return {}
   
    try:
        return r.json()['results'][0]['address_components']
    except IndexError:
        return -1


def extract_zipcode(components):
    for comp in components:
        if comp.get('types')[0] == 'postal_code':
            return comp.get('long_name')
        

In [83]:
def input_zipcode(df):
    if df.end_lat < 10:
        return extract_zipcode(get_address_components(df.endname))
    else:
        return extract_zipcode(get_latlong_components(df.end_lat, df.end_long))

#### Creating the BayWheels Station Table

In [23]:
# Endid has more distinct values than startid
# Tables module
bay_station_query = """
        SELECT DISTINCT ON(endid) endid, endname, end_lat, end_long 
          FROM staging.bay_trip 
         ORDER BY endid;
        """

In [24]:
bay_station = pd.read_sql(bay_station_query, conn) # Expect long execution times

In [25]:
def drop_decimal(x):
    """Drops the .0 from a string, if it has it"""
    if x.endswith('.0'):
        return(x[:-2])
    else: return x

In [26]:
bay_station['endid'] = bay_station.endid.apply(drop_decimal)

In [27]:
bay_station.drop_duplicates(inplace=True)
bay_station = bay_station.set_index('endid').drop(['449','420', '408','484']).reset_index()

In [28]:
bay_spatial = gpd.GeoDataFrame(bay_station, geometry=gpd.points_from_xy(bay_station.end_long, bay_station.end_lat), crs="EPSG:4326")

In [29]:
bay_spatial['zipcode'] = bay_spatial.apply(input_zipcode, axis=1)

In [35]:
bay_spatial.zipcode = bay_spatial.zipcode.fillna(-1)

In [36]:
# Tables module
bay_station_query = """
               CREATE TABLE IF NOT EXISTS stations.bay_station (
                   stationID VARCHAR,
                   name VARCHAR(64) NOT NULL,
                   latitude REAL,
                   longitude REAL,
                   geometry GEOGRAPHY(POINT,4326) NOT NULL,
                   zipcode INTEGER
                );
                
                """
cursor.execute("rollback;")
cursor.execute(bay_station_query)
conn.commit()

In [37]:
database_upload(conn, bay_spatial, 'stations.bay_station')

#### Creating the BlueWheels Station Table

In [38]:
blue_station = get_stations(conn, 'blue', [153, 158, 223, 229, 230, 308, 382])

In [39]:
blue_spatial = gpd.GeoDataFrame(blue_station, geometry=gpd.points_from_xy(blue_station.end_long, blue_station.end_lat), crs="EPSG:4326")

In [40]:
blue_spatial['zipcode'] = blue_spatial.apply(input_zipcode, axis=1)

In [43]:
blue_spatial.head()

Unnamed: 0,endid,endname,end_lat,end_long,geometry,zipcode
0,1.0,18 Dorrance Warehouse,42.38715,-71.07598,POINT (-71.07598 42.38715),2129
1,3.0,Colleges of the Fenway - Fenway at Avenue Loui...,42.340115,-71.10062,POINT (-71.10062 42.34011),2115
2,4.0,Tremont St at E Berkeley St,42.34539,-71.06962,POINT (-71.06962 42.34539),2116
3,5.0,Northeastern University - North Parking Lot,42.341812,-71.09018,POINT (-71.09018 42.34181),2115
4,6.0,Cambridge St at Joy St,42.361256,-71.065285,POINT (-71.06529 42.36126),2114


In [44]:
# Tables module
blue_station_query = """
               CREATE TABLE IF NOT EXISTS stations.blue_station (
                   stationID VARCHAR,
                   name VARCHAR(128) NOT NULL,
                   latitude REAL,
                   longitude REAL,
                   geometry GEOGRAPHY(POINT,4326) NOT NULL,
                   zipcode INTEGER
                );
                
                """
cursor.execute("rollback;")
cursor.execute(blue_station_query)
conn.commit()

In [45]:
database_upload(conn, blue_spatial, 'stations.blue_station')

#### Creating the Capital Station Table

In [77]:
capital_station = get_stations(conn, 'capital', [-1,0,32902])

In [78]:
capital_spatial = gpd.GeoDataFrame(capital_station, geometry=gpd.points_from_xy(capital_station.end_long, capital_station.end_lat), crs="EPSG:4326")

In [79]:
# Tables module
capital_station_query = """
               CREATE TABLE IF NOT EXISTS stations.capital_station (
                   stationID VARCHAR,
                   name VARCHAR(128) NOT NULL,
                   latitude REAL,
                   longitude REAL,
                   geometry GEOGRAPHY(POINT,4326) NOT NULL,
                   zipcode INTEGER
                );
                
                """
cursor.execute("rollback;")
cursor.execute(capital_station_query)
conn.commit()

In [80]:
capital_spatial['zipcode'] = capital_spatial.apply(input_zipcode, axis=1)

IndexError: list index out of range

In [31]:
database_upload(conn, capital_spatial, 'stations.capital_station')

#### Creating the CitiBike Station Table

In [51]:
citi_station = get_stations(conn, 'citi', [-1,3036,3650,3247,3248,3446,3480,3488,3633])

In [52]:
citi_spatial = gpd.GeoDataFrame(citi_station, geometry=gpd.points_from_xy(citi_station.end_long, citi_station.end_lat), crs="EPSG:4326")

In [60]:
# Tables module
citi_station_query = """
               CREATE TABLE IF NOT EXISTS stations.citi_station (
                   stationID VARCHAR,
                   name VARCHAR(128) NOT NULL,
                   latitude REAL,
                   longitude REAL,
                   geometry GEOGRAPHY(POINT,4326) NOT NULL,
                   zipcode INTEGER
                );
                
                """
cursor.execute("rollback;")
cursor.execute(citi_station_query)
conn.commit()

In [54]:
citi_spatial['zipcode'] = citi_spatial.apply(input_zipcode, axis=1)

In [57]:
citi_spatial.zipcode = citi_spatial.zipcode.fillna(-1)

In [61]:
citi_spatial.head()

Unnamed: 0,endid,endname,end_lat,end_long,geometry,zipcode
0,72.0,W 52 St & 11 Ave,40.767273,-73.99393,POINT (-73.99393 40.76727),10019
1,79.0,Franklin St & W Broadway,40.719116,-74.00667,POINT (-74.00667 40.71912),10013
2,82.0,St James Pl & Pearl St,40.711174,-74.00017,POINT (-74.00017 40.71117),10038
3,83.0,Atlantic Ave & Fort Greene Pl,40.683826,-73.976326,POINT (-73.97633 40.68383),11217
4,116.0,W 17 St & 8 Ave,40.741776,-74.001495,POINT (-74.00150 40.74178),10011


In [62]:
database_upload(conn, citi_spatial, 'stations.citi_station')

#### Creating the Divvy Station Table

In [63]:
divvy_station = get_stations(conn, 'divvy', [-1])

In [64]:
divvy_spatial = gpd.GeoDataFrame(divvy_station, geometry=gpd.points_from_xy(divvy_station.end_long, divvy_station.end_lat), crs="EPSG:4326")

In [65]:
# Tables module
divvy_station_query = """
               CREATE TABLE IF NOT EXISTS stations.divvy_station (
                   stationID VARCHAR,
                   name VARCHAR(128) NOT NULL,
                   latitude REAL,
                   longitude REAL,
                   geometry GEOGRAPHY(POINT,4326) NOT NULL,
                   zipcode INTEGER
                );
                
                """
cursor.execute("rollback;")
cursor.execute(divvy_station_query)
conn.commit()

In [66]:
divvy_spatial['zipcode'] = divvy_spatial.apply(input_zipcode, axis=1)

IndexError: list index out of range

In [None]:
divvy_spatial.head()

In [52]:
database_upload(conn, divvy_spatial, 'stations.divvy_station')

## Testing Geocoding


In [13]:
from urllib.parse import urlencode
import requests

In [16]:
api_key = 'AIzaSyCrG_VK47xMKjER4zpHyd3FJNLFn2weNFY'

In [17]:
def get_address_components(address, data_type='json'):
    endpoint = f'https://maps.googleapis.com/maps/api/geocode/{data_type}'
    params = {'address': address, 'key': api_key}
    url_params = urlencode(params)
    url = f"{endpoint}?{url_params}"
    
    r = requests.get(url)
    if r.status_code not in range(200,299):
        return {}
    return r.json()['results'][0]['address_components']


def get_latlong_components(lat, long, data_type = 'json'):
    url = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat}, {long}&key={api_key}"
    
    r = requests.get(url)
    if r.status_code not in range(200,299):
        return {}
    return r.json()['results'][0]['address_components']


def extract_zipcode(components):
    for comp in components:
        if comp.get('types')[0] == 'postal_code':
            return comp.get('long_name')
        

In [152]:
def input_zipcode(row):
    if row['end_lat'] < 10:
        return 'address'
        #row['zipcode'] = extract_zipcode(get_address_components(row.endname))
    else:
        return 'latlong'
        #row['zipcode'] = extract_zipcode(get_latlong_components(row.end_lat, row.end_long))
    '''

In [179]:
bay_spatial['zipcode'] = np.where(
    bay_spatial.end_lat < 10,
    extract_zipcode(get_address_components(bay_spatial.endname)),
    extract_zipcode(get_latlong_components(bay_spatial.end_lat, bay_spatial.end_long))
)

In [192]:
for index, row in bay_spatial.iloc[:2,].iterrows():
    print(extract_zipcode(get_latlong_components(row.end_lat, row.end_long)))

94109
94108


In [26]:
def input_zipcode(df):
    if df.end_lat < 10:
        return extract_zipcode(get_address_components(df.endname))
    else:
        return extract_zipcode(get_latlong_components(df.end_lat, df.end_long))

In [32]:
bay_spatial['zipcode'] = bay_spatial.apply(input_zipcode, axis=1)

In [33]:
bay_spatial

Unnamed: 0,endid,endname,end_lat,end_long,geometry,zipcode
0,,,37.800000,-122.420000,POINT (-122.42000 37.80000),94109
1,10,Washington St at Kearny St,37.795390,-122.404770,POINT (-122.40477 37.79539),94108
2,100,Bryant St at 15th St,37.767100,-122.410660,POINT (-122.41066 37.76710),94103
3,101,15th St at Potrero Ave,37.767080,-122.407360,POINT (-122.40736 37.76708),94103
4,102,Irwin St at 8th St,37.766884,-122.399580,POINT (-122.39958 37.76688),94107
...,...,...,...,...,...,...
946,SJ-Q4,Willow St at Blewett Ave,37.309013,-121.900010,POINT (-121.90001 37.30901),95125
947,SJ-Q5,Bird Ave at Willow St,37.311234,-121.896300,POINT (-121.89630 37.31123),95125
948,SJ-Q8,Palm St at Willow St,37.317300,-121.884995,POINT (-121.88500 37.31730),95110
949,SJ-Q9,Willow St at Vine St,37.318450,-121.883170,POINT (-121.88317 37.31845),95110
