## Designing the Database

Each citibike file has the same format, the colomns in the files are:
- Trip Duration (seconds)
- Start Date & Time
- End Date & Time
- Start Station ID
- Start Station Name
- Start Station Latitude
- Start Station Longitude
- End Date & Time
- End Station ID
- End Station Name
- End Station Latitude
- End Station Longitude
- Bike ID
- User Type
- Gender
- Year of Birth

<img src="DatabaseDiagram.png" width="600" height="800" align="center"/>

## Connecting to the Database

In [1]:
pip install psycopg2-binary;

Note: you may need to restart the kernel to use updated packages.


In [2]:
import psycopg2

In [43]:
# Put the password in 
PGHOST = 'tripdatabase.cmaaautpgbsf.us-east-2.rds.amazonaws.com'
PGDATABASE = ''
PGUSER = 'postgres'
PGPASSWORD = ''

In [44]:
try:   
    # Set up a connection to the postgres server.    
    conn = psycopg2.connect(user = PGUSER,
                            port = "5432",
                            password = PGPASSWORD,
                            host = PGHOST,
                            database = PGDATABASE)
    # Create a cursor object
    cursor = conn.cursor()   
    cursor.execute("SELECT version();")
    record = cursor.fetchone()
    print("Connection Success:", record,"\n")

except (Exception, psycopg2.Error) as error:
    print("Error while connecting to PostgreSQL", error)

Connection Success: ('PostgreSQL 12.4 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11), 64-bit',) 



## Populating the Raw Database

In [5]:
pip install s3fs;

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import s3fs
import os
from io import StringIO

In [15]:
ACCESS_KEY_ID = ''
ACCESS_SECRET_KEY = ''
bucket = "s3://williams-citibike/TripData/"
fs = s3fs.S3FileSystem(anon=False, key = ACCESS_KEY_ID, secret= ACCESS_SECRET_KEY)
filenames = fs.ls("s3://williams-citibike/TripData/")[1:]

OSError: [Errno 22] The authorization header is malformed; a non-empty Access Key (AKID) must be provided in the credential.

In [8]:
rawtable = """
           CREATE TABLE IF NOT EXISTS raw (
               tripduration INTEGER, 
               starttime TIMESTAMP,
               endtime TIMESTAMP,
               startID NUMERIC,
               startname VARCHAR(64),
               start_lat REAL,
               start_long REAL,
               endID NUMERIC,
               endname VARCHAR(64),
               end_lat REAL,
               end_long REAL,
               bikeID INTEGER,
               usertype VARCHAR(16),
               birthyear REAL,
               gender SMALLINT                
          );
          """
cursor.execute("rollback;")
cursor.execute(rawtable)
conn.commit()

In [8]:
def populate_raw(datafile: str) -> None:
    datastream = StringIO()
    
    with fs.open("s3://"+datafile, 'r') as file:
        data = pd.read_csv(file, na_values ="") 
        data.fillna(-1, inplace=True) # Empty spaces need to be integers for birthyear
        
        #Some stations have commas in their name cause the copy_from to register extra data fields
        data.iloc[:, 4] = data.iloc[:, 4].str.replace(',','_')
        data.iloc[:, 8] = data.iloc[:, 8].str.replace(',','_')
        
        data.iloc[:, 3] = data.iloc[:, 3].astype('int32')
        data.iloc[:, 7] = data.iloc[:, 7].astype('int32')
        
        data.to_csv(datastream, index=False, header = False)
        datastream.seek(0)

        cursor.copy_from(datastream,'raw',sep=',')
        conn.commit()
    
    datastream.close()
    print(f"Finished Uploading to Raw: {datafile}")
    return None

In [11]:
filenames[0:7]

['williams-citibike/TripData/2013-07 - Citi Bike trip data.csv',
 'williams-citibike/TripData/2013-08 - Citi Bike trip data.csv',
 'williams-citibike/TripData/2013-09 - Citi Bike trip data.csv',
 'williams-citibike/TripData/2013-10 - Citi Bike trip data.csv',
 'williams-citibike/TripData/2013-11 - Citi Bike trip data.csv',
 'williams-citibike/TripData/2013-12 - Citi Bike trip data.csv',
 'williams-citibike/TripData/201306-citibike-tripdata.csv']

In [11]:
#finished 201904

In [12]:
cursor.execute("rollback;")
for file in filenames[0:7]:
    populate_raw(file)

Finished Uploading to Raw: williams-citibike/TripData/2013-07 - Citi Bike trip data.csv
Finished Uploading to Raw: williams-citibike/TripData/2013-08 - Citi Bike trip data.csv
Finished Uploading to Raw: williams-citibike/TripData/2013-09 - Citi Bike trip data.csv
Finished Uploading to Raw: williams-citibike/TripData/2013-10 - Citi Bike trip data.csv
Finished Uploading to Raw: williams-citibike/TripData/2013-11 - Citi Bike trip data.csv
Finished Uploading to Raw: williams-citibike/TripData/2013-12 - Citi Bike trip data.csv
Finished Uploading to Raw: williams-citibike/TripData/201306-citibike-tripdata.csv


In [None]:
qtable = """
        CREATE TABLE IF NOT EXISTS trip(
                starttime TIMESTAMP,
                endtime TIMESTAMP,
                tripduration INTEGER, 
                startID SMALLINT,
                endID SMALLINT
        );
        """
cursor.execute("rollback;")
cursor.execute(qtable)
conn.commit()

In [None]:
def populate_database(datafile: str) -> None:
    cols = ["starttime","stoptime","tripduration","start station id", "end station id"]
    datastream = StringIO()
    
    with fs.open("s3://"+datafile, 'r') as f:
        data = pd.read_csv(f, usecols=cols, parse_dates = ['starttime','stoptime'])
        data = data[cols]
        data.rename(columns={"stoptime":"endtime", "start station id":"startID", "end station id":"endID"}, inplace=True)
        
        data.to_csv(datastream, index=False, header = False)
        datastream.seek(0)

        cursor.copy_from(datastream,'trip',sep=',')
        conn.commit()
    
    datastream.close()
    print(f"F {datafile}")
    return None

In [None]:
for file in filenames:
    populate_database(file)

In [None]:
if(conn):
    cursor.close()
    conn.close()
    print("PostgreSQL connection is closed")

## Populating the Neighborhood Database

In [20]:
from bs4 import BeautifulSoup
import requests

In [21]:
# Attempt connection to the URL
HoodURL = "https://furmancenter.org/neighborhoods"
try:
    r2 = requests.get(HoodURL)
    r2.raise_for_status()
except requests.exceptions.HTTPError as errh:
    print(errh)

In [31]:
soup = BeautifulSoup(r2.content, "html.parser")

# The website has a dropdown with all the neighborhood codes and names
hood_code_names = []

#Instead of creating a dictionary like before, we create a list of tuples
for code in soup.find_all('option')[1:]:
    hood_code_names.append((code.text[:4], code.text[6:].replace("/","-").replace(" ","_")))

In [34]:
hood_df = pd.DataFrame(hood_code_names, columns=["Code", "Name"])

In [40]:
borough = {
        "BK": "Brooklyn", 
        "BX": "Bronx",
        "MN": "Manhattan",
        "QN": "Queens",
        "SI": "Staten"
        }

hood_df["Borough"] = hood_df["Code"].str[0:2].map(borough)

In [48]:
hoodtable = """
            CREATE TABLE IF NOT EXISTS neighborhood (
                code CHAR(4) PRIMARY KEY,
                hood VARCHAR NOT NULL,
                borough VARCHAR(16) NOT NULL
            );
            """
cursor.execute("rollback;")
cursor.execute(hoodtable)
conn.commit()

In [50]:
hoodstream = StringIO()

hood_df.to_csv(hoodstream, index=False, header = False)
hoodstream.seek(0)

cursor.copy_from(hoodstream,'neighborhood',sep=',')
conn.commit()
    
hoodstream.close()