## Pulling the Data From the CitiBike S3 Bucket
*The purpose of this section is to connect to the CitiBike S3 bucket, then download, extract, and store all of the trip data files.*

In [3]:
import requests, zipfile, io   # Needed to pull data from CitiBike S3 bucket
import os

In [4]:
# Base URL of the CitiBike trip-data S3 bucket. File names are appended directly
# to this string, so the trailing slash is required.
CITIBIKE_DATA_FOLDER = "https://s3.amazonaws.com/tripdata/"
# Local directory that extracted trip-data files are written to.
# NOTE(review): the name is misspelled ("FIOLDER"), but it is referenced under this
# exact spelling throughout the notebook, so any rename has to be done everywhere at once.
TEMP_DATA_FIOLDER = "/root/Citi-Bike-Expansion/TempTripData/"

In [5]:
# Create the temporary data folder. exist_ok=True keeps the cell safe to re-run
# and avoids the race between checking for the folder and creating it.
os.makedirs(TEMP_DATA_FIOLDER, exist_ok=True)

In [75]:
def pull_citi_data(filename: str) -> bool:
    """Download one trip-data archive from the CitiBike S3 bucket and extract its data file.

    The archive is first requested under ``filename``. If that request comes back
    with an HTTP error status, it is retried once with ``.csv`` inserted before the
    extension (``X.zip`` -> ``X.csv.zip``), because the bucket endings changed
    starting in 2017. The first member of the downloaded archive is then extracted
    into ``TEMP_DATA_FIOLDER``.

    Parameters
    ----------
    filename : str
        Name of an archive in the CitiBike S3 bucket, including its extension
        (e.g. ``"201306-citibike-tripdata.zip"``).

    Returns
    -------
    bool
        True if the data file was extracted. False if the request failed, the
        download was not a readable zip archive, the archive was empty, or the
        data file already exists in ``TEMP_DATA_FIOLDER``. A message describing
        the outcome is printed in every case.
    """

    # Starting in 2017 bucket endings changed from .zip -> .csv.zip, so try both spellings
    stem, extension = os.path.splitext(filename)
    candidates = (filename, f"{stem}.csv{extension}")

    response = None
    http_error = None
    for candidate in candidates:
        try:
            # A timeout keeps a stalled connection from hanging the notebook forever
            response = requests.get(CITIBIKE_DATA_FOLDER + candidate, timeout=30)
            response.raise_for_status()
        except requests.exceptions.HTTPError as errh:
            # Not available under this name: remember the error and try the next one
            http_error = errh
            continue
        except requests.exceptions.RequestException as err:
            # Connection errors, timeouts, etc. will not be fixed by a different name
            print(err)
            return False
        print(f"Request Success: {candidate} requested from Citibike S3 Bucket")
        break
    else:
        # Every candidate name was rejected with an HTTP error status
        print(http_error)
        return False

    # ==============================================================================================================

    # Unzips the file and extracts it to the Temporary Data Folder
    try:
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
            members = archive.namelist()
            if not members:
                print(f"Skipped: archive for {filename} contains no files \n")
                return False

            # Regardless of the change in naming conventions, the actual data appears first in every bucket
            datafile = members[0]

            if os.path.exists(os.path.join(TEMP_DATA_FIOLDER, datafile)):
                print(f"Skipped: {datafile} already extracted from Citbike S3 Bucket \n")
                return False

            archive.extract(datafile, path=TEMP_DATA_FIOLDER)
    except zipfile.BadZipFile as errz:
        # The download succeeded but the payload is not a valid zip archive
        print(errz)
        return False

    print(f"Extract Success: {datafile} unzipped and uploaded to {TEMP_DATA_FIOLDER} \n")
    return True

In [6]:
# Year and zero-padded month strings used to build the archive names to request
yearlist = [str(year_number) for year_number in range(2013, 2021)]
monthlist = [f"{month_number:02d}" for month_number in range(1, 13)]

In [77]:
# Request one archive per (year, month) pair, in chronological order.
# Note: The first data file that exists is 201306, so the first 5 requests
# (201301-201305) are expected to report 404 Client Errors.
for year in yearlist:
    for month in monthlist:
        archive_name = f"{year}{month}-citibike-tripdata.zip"
        pull_citi_data(archive_name)

404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201301-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201302-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201303-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201304-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201305-citibike-tripdata.csv.zip
Request Success: 201306-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract Success: 201306-citibike-tripdata.csv unzipped and uploaded to /root/Citi-Bike-Expansion/TempTripData/ 

Request Success: 201307-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract Success: 2013-07 - Citi Bike trip data.csv unzipped and uploaded to /root/Citi-Bike-Expansion/TempTripData/ 

Request Success: 201308-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract S

## Upload TripData to Personal S3 Bucket
*The purpose of this section is to take the downloaded files and upload them to my own personal S3 bucket, giving me full control over the data.*

In [7]:
import boto3
import shutil

In [8]:
# Note: This code can be executed with your own S3 bucket by changing the following values:
# ACCESS_KEY_ID, ACCESS_SECRET_KEY, bucket, prefix (optional)
# The credentials are intentionally left blank here; fill them in locally and
# avoid saving real keys in the notebook.

ACCESS_KEY_ID = ''
ACCESS_SECRET_KEY = ''

# boto3 resource handle for S3, created with the credentials above
s3 = boto3.resource(
     's3',
     aws_access_key_id = ACCESS_KEY_ID,
     aws_secret_access_key = ACCESS_SECRET_KEY
)

# Destination for the uploads: objects are stored under <prefix>/<file name> in this bucket
bucket = 'williams-citibike'   # Premade bucket in S3
prefix = 'TripData'   # Premade folder inside the bucket

In [9]:
# Names of everything in the temporary data folder, in sorted order
filenames = sorted(os.listdir(TEMP_DATA_FIOLDER))

In [81]:
# Upload every downloaded file to <prefix>/<file name> in the personal bucket.
# S3 object keys always use "/" as the separator, so the key is built explicitly
# rather than with os.path.join, whose separator depends on the operating system.
target_bucket = s3.Bucket(bucket)   # look the bucket up once instead of once per file
key_prefix = f"{prefix.rstrip('/')}/" if prefix else ""   # prefix is optional
for key in filenames:
    target_bucket.Object(key_prefix + key).upload_file(os.path.join(TEMP_DATA_FIOLDER, key))

In [82]:
# Delete the temporary data folder and everything inside it.
# NOTE(review): this runs unconditionally -- confirm the uploads above succeeded before executing.
shutil.rmtree(TEMP_DATA_FIOLDER)

## Scraping Neighborhood Data I - Getting the Codes

In [48]:
from bs4 import BeautifulSoup

In [49]:
# Attempt connection to the URL
HoodURL = "https://furmancenter.org/neighborhoods"
try:
    # A timeout keeps a stalled connection from hanging the notebook forever
    r2 = requests.get(HoodURL, timeout=30)
    r2.raise_for_status()
except requests.exceptions.HTTPError as errh:
    # An HTTP error status is only reported; r2 still holds the error response,
    # so check this output before parsing r2 in the next cell.
    print(errh)

In [50]:
soup = BeautifulSoup(r2.content, "html.parser")

# The website has a dropdown with all the neighborhood codes and names.
# Each <option> text is split by position: the first 4 characters are the code and
# everything from index 6 onward is the name. The first <option> is skipped
# (presumably a placeholder entry -- verify against the page).
hood_codes = {
    option.text[:4]: option.text[6:]
    for option in soup.find_all('option')[1:]
}


## Scraping Neighborhood Data II - Getting the Data

In [54]:
def pull_hood_data(code: str) -> bool:
    """Uses the inputted neighborhood code to download the xlsx data from Furman Center

    NOTE: not implemented yet. The body is only a placeholder, so calling this
    function currently does nothing and returns None rather than a bool.
    
    Parameters
    ----------
    code : str
        The 4 character neighborhood string
    
    Returns
    -------
    bool:
        As a final result the function returns a bool if successful.
        However, there should be an xlsx file in the data_folder.
    """
    
    # TODO: implement the download described in the docstring
    pass

In [55]:
pull_hood_data?

Signature: pull_hood_data(code: str) -> bool
Docstring:
Uses the inputted neighborhood code to download the xlsx data from Furman Center

Parameters
----------
code: str
    The 4 character neighborhood string

Returns
-------
bool:
    As a final result the function returns a bool if successful.
    However, there should be an xlsx file in the data_folder
File:      ~/Citi-Bike-Expansion/<ipython-input-54-633a9b46d0da>
Type:      function
