## Scraping the Trip Data From the CitiBike S3 Bucket
*The purpose of this section is to connect to the CitiBike S3 bucket and to download, extract, and store all of its trip-data files*

In [1]:
import requests, zipfile, io   # Needed to pull data from CitiBike S3 bucket
import os

In [2]:
# Base URL of the CitiBike trip-data S3 bucket; file names are appended to it directly
CITIBIKE_DATA_FOLDER = "https://s3.amazonaws.com/tripdata/"    
# Local staging directory for the extracted trip-data files.
# The trailing slash matters: paths are built from it by string concatenation.
TEMP_BIKE_FOLDER = "/root/Citi-Bike-Expansion/TempTripData/"

In [3]:
# Create the staging directory (and any missing parents). exist_ok=True avoids
# the check-then-create race and keeps the cell safe to re-run.
os.makedirs(TEMP_BIKE_FOLDER, exist_ok=True)

In [75]:
def pull_citi_data(filename: str) -> None:
    """Download one trip-data archive from the Citibike S3 bucket and extract it.

    The archive is requested under ``filename`` first and, if that request
    fails with an HTTP error, under the ``.csv.zip`` spelling of the same name.
    The first member of the archive is then extracted into TEMP_BIKE_FOLDER
    unless a file with that name is already there. Every failure is printed
    and swallowed so a caller looping over many months keeps going.

    Parameters
    ----------
    filename : str
        The name of a file in the Citibike S3 bucket, including its extension
        (e.g. ``201306-citibike-tripdata.zip``).

    Returns
    -------
    None
        If executed properly there should be a CSV file in the TEMP_BIKE_FOLDER.
    """

    # Starting in 2017 bucket endings changed from .zip -> .csv.zip, so both
    # spellings are tried in order.
    stem, extension = os.path.splitext(filename)
    candidates = (filename, f"{stem}.csv{extension}")

    response = None
    http_error = None
    for candidate in candidates:
        try:
            # The body is downloaded inside the try (no stream=True) so read
            # errors are caught here too; the (connect, read) timeout keeps a
            # stalled connection from hanging the whole run.
            response = requests.get(CITIBIKE_DATA_FOLDER + candidate, timeout=(10, 120))
            response.raise_for_status()
        except requests.exceptions.HTTPError as errh:
            # Remember the failure and fall through to the next spelling
            response, http_error = None, errh
        except requests.exceptions.RequestException as err:
            # ConnectionError and Timeout are RequestException subclasses;
            # retrying under another file name would not help, so give up.
            print(err)
            return None
        else:
            print(f"Request Success: {candidate} requested from Citibike S3 Bucket")
            break

    # Neither spelling exists in the bucket: report the last HTTP error
    if response is None:
        print(http_error)
        return None

    # ==============================================================================================================

    # Unzips the file and extracts it to the Temporary Data Folder
    try:
        archive = zipfile.ZipFile(io.BytesIO(response.content), 'r')
    except zipfile.BadZipFile as errz:
        # The bucket answered with something that is not a zip archive
        print(errz)
        return None

    with archive:
        members = archive.namelist()
        if not members:
            print(f"Skipped: {filename} contains no files \n")
            return None

        # Regardless of the change in naming conventions, the actual data appears first in every bucket
        datafile = members[0]

        if os.path.exists(TEMP_BIKE_FOLDER + datafile):
            print(f"Skipped: {datafile} already extracted from Citbike S3 Bucket \n")
            return None

        archive.extract(datafile, path=TEMP_BIKE_FOLDER)

    print(f"Extract Success: {datafile} unzipped and uploaded to {TEMP_BIKE_FOLDER} \n")
    return None

In [4]:
# Year and zero-padded month strings that are combined into bucket file names
yearlist = [str(year_number) for year_number in range(2013, 2021)]
monthlist = [f"{month_number:02d}" for month_number in range(1, 13)]

In [77]:
# Note: The first data file that exists is 201306. Expect 404 Client Errors on the first 5 runs
# Build every "<year><month>-citibike-tripdata.zip" name in chronological order, then fetch each one.
tripdata_archives = [
    f"{year}{month}-citibike-tripdata.zip"
    for year in yearlist
    for month in monthlist
]
for tripdata_archive in tripdata_archives:
    pull_citi_data(tripdata_archive)

404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201301-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201302-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201303-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201304-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201305-citibike-tripdata.csv.zip
Request Success: 201306-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract Success: 201306-citibike-tripdata.csv unzipped and uploaded to /root/Citi-Bike-Expansion/TempTripData/ 

Request Success: 201307-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract Success: 2013-07 - Citi Bike trip data.csv unzipped and uploaded to /root/Citi-Bike-Expansion/TempTripData/ 

Request Success: 201308-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract S

## Upload TripData to Personal S3 Bucket
*The purpose of this section is to take the downloaded files and upload them to my own personal S3 bucket, giving me full control over the data*

In [5]:
import boto3
import shutil

In [6]:
# Note: This code can be executed with your own S3 bucket by changing the following values:
# ACCESS_KEY_ID, ACCESS_SECRET_KEY, bucket, prefix (optional)

# Credentials are left blank here and must be filled in before running.
# NOTE(review): avoid saving real keys in the notebook -- consider loading them
# from the environment instead.
ACCESS_KEY_ID = ''
ACCESS_SECRET_KEY = ''

# S3 resource handle used by the upload cells further down
s3 = boto3.resource(
     's3',
     aws_access_key_id = ACCESS_KEY_ID,
     aws_secret_access_key = ACCESS_SECRET_KEY
)

bucket = 'williams-citibike'   # Premade bucket in S3
prefix = 'TripData'   # Premade folder inside the bucket

In [7]:
# Alphabetically sorted names of everything currently in the trip-data staging folder
filenames = sorted(os.listdir(TEMP_BIKE_FOLDER))

In [81]:
# Upload every staged trip-data file to "<prefix>/<file name>" in the bucket.
# S3 object keys always use "/" as the separator, so the key is built explicitly
# rather than with os.path.join (which would use "\" on Windows).
trip_bucket = s3.Bucket(bucket)
for key in filenames:
    trip_bucket.Object(f"{prefix}/{key}").upload_file(os.path.join(TEMP_BIKE_FOLDER, key))

In [8]:
# Delete the local trip-data staging directory and everything inside it
# (run only after the upload cell above has finished).
shutil.rmtree(TEMP_BIKE_FOLDER)

## Scraping Neighborhood Data I - Getting the Neighborhood Codes

In [9]:
from bs4 import BeautifulSoup

In [10]:
# Attempt connection to the URL
HoodURL = "https://furmancenter.org/neighborhoods"
try:
    r2 = requests.get(HoodURL)
    # Turns a 4xx/5xx response into an HTTPError
    r2.raise_for_status()
except requests.exceptions.HTTPError as errh:
    # NOTE(review): the error is only printed; r2 still holds the failed
    # response, so the parsing cell would run on an error page -- confirm
    # this is intended.
    print(errh)

In [27]:
soup = BeautifulSoup(r2.content, "html.parser")

# The website has a dropdown with all the neighborhood codes and names.
# The first <option> is skipped; each remaining label is split into its first
# 4 characters (the code) and everything from character 6 on (the name).
# "/" in a name is swapped for "-" because names are later used in file names.
hood_codes = {
    option.text[:4]: option.text[6:].replace("/", "-")
    for option in soup.find_all('option')[1:]
}

## Scraping Neighborhood Data II - Getting the Neighborhood Data Files

In [39]:
# Local staging directory for the downloaded neighborhood XLSX files.
# The trailing slash matters: paths are built from it by string concatenation.
TEMP_HOOD_FOLDER = "/root/Citi-Bike-Expansion/TempHoodData/"

# exist_ok=True avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(TEMP_HOOD_FOLDER, exist_ok=True)

In [49]:
def pull_hood_data(code: str, name: str) -> None:
    """Uses the inputted neighborhood code to download the xlsx data from Furman Center

    The file is saved as ``{code}_{name}.xlsx`` inside TEMP_HOOD_FOLDER. Nothing
    is requested when that file already exists, and request failures are
    printed rather than raised so a loop over many neighborhoods keeps going.

    Parameters
    ----------
    code: str
        The 4 character neighborhood string
    name: str
        The actual name of the neighborhood
    Returns
    -------
    None:
        If executed properly there should be an XLSX file in the TEMP_HOOD_FOLDER.
    """
    url = f"https://furmancenter.org/files/NDP/{code}_NeighborhoodDataProfile.xlsx"
    # Built once and reused for both the existence check and the write
    target = os.path.join(TEMP_HOOD_FOLDER, f"{code}_{name}.xlsx")

    if os.path.exists(target):
        print(f"Skipped: {code}_{name} already downloaded from Furman Center")
        return None

    try:
        # The (connect, read) timeout keeps a stalled connection from hanging the run
        response = requests.get(url, timeout=(10, 60))
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # RequestException also covers connection errors and timeouts, not
        # just HTTP status errors, so one bad download cannot abort the loop.
        print(err)
        return None
    else:
        print(f"Request Success: {url} from Furman Center")

    with open(target, 'wb') as output:
        output.write(response.content)

    return None

In [48]:
# Download the data profile of every neighborhood found in the dropdown
for hood_code, hood_name in hood_codes.items():
    pull_hood_data(code=hood_code, name=hood_name)

Skipped: BK01_Greenpoint-Williamsburg already downloaded from Furman Center
Skipped: BK02_Fort Greene-Brooklyn Heights already downloaded from Furman Center
Skipped: BK03_Bedford Stuyvesant already downloaded from Furman Center
Skipped: BK04_Bushwick already downloaded from Furman Center
Skipped: BK05_East New York-Starrett City already downloaded from Furman Center
Skipped: BK06_Park Slope-Carroll Gardens already downloaded from Furman Center
Skipped: BK07_Sunset Park already downloaded from Furman Center
Skipped: BK08_Crown Heights-Prospect Heights already downloaded from Furman Center
Skipped: BK09_South Crown Heights-Lefferts Gardens already downloaded from Furman Center
Skipped: BK10_Bay Ridge-Dyker Heights already downloaded from Furman Center
Skipped: BK11_Bensonhurst already downloaded from Furman Center
Skipped: BK12_Borough Park already downloaded from Furman Center
Skipped: BK13_Coney Island already downloaded from Furman Center
Skipped: BK14_Flatbush-Midwood already downloa

In [None]:
prefix2 = "NeighData"

# List the neighborhood staging folder itself: `filenames` holds the trip-data
# listing, whose files do not exist in TEMP_HOOD_FOLDER.
hood_filenames = sorted(os.listdir(TEMP_HOOD_FOLDER))

# S3 object keys always use "/" as the separator, so the key is built explicitly
for key in hood_filenames:
    s3.Bucket(bucket).Object(f"{prefix2}/{key}").upload_file(os.path.join(TEMP_HOOD_FOLDER, key))