## Grabing the Data From the CitiBike S3 Bucket

In [1]:
import pandas as pd
import requests, glob, zipfile, io   # Needed to pull data from CitiBike S3 bucket
import os

In [95]:
CITIBIKE_DATA_FOLDER = "https://s3.amazonaws.com/tripdata/"    
DATA_FOLDER = "/root/Citi-Bike-Expansion/TripData/"

In [96]:
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

In [113]:
def pull_data(filename):   
    try:
        r = requests.get(CITIBIKE_DATA_FOLDER + filename, stream=True)   
        r.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        try:
            r = requests.get(CITIBIKE_DATA_FOLDER + filename[:-4] + '.csv.' + filename[-3:])
            r.raise_for_status()
        except requests.exceptions.HTTPError as errh: 
            print(errh)
            return False
        else:
            print(f"Request Success: {filename} requested from Citibike S3 Bucket")       
    except requests.exceptions.ConnectionError as errc:
        print(errc)
        return False
    except requests.exceptions.Timeout as errt:
        print(errt)
        return False
    except requests.exceptions.RequestException as err:
        print(err)
        return False
    else:
        print(f"Request Success: {filename} requested from Citibike S3 Bucket")
    
    
    #.content is an attribute of the request.Response() object that returns the conent of object in bytes
    with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip: 
        datafile = zip.namelist()[0]   # The namelist method returns a list of all files in the directory the actual data is always first
               
        if os.path.exists(DATA_FOLDER + datafile):
            print(f"Skipped: {datafile} already extracted from Citbike S3 Bucket \n")
            return True
        
        zip.extract(datafile, path = DATA_FOLDER)
    
    print(f"Extract Success: {datafile} unzipped and uploaded to {DATA_FOLDER} \n")
    return True

In [94]:
yearlist = ["2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020"]
monthlist = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]

In [84]:
for year in yearlist:
    for month in monthlist:
        if not pull_data(f"{year}{month}-citibike-tripdata.zip"):
            pull_data(f"{year}{month}-citibike-tripdata.csv.zip")

404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201301-citibike-tripdata.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201301-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201302-citibike-tripdata.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201302-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201303-citibike-tripdata.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201303-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201304-citibike-tripdata.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201304-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201305-citibike-tripdata.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201305-citibi

KeyboardInterrupt: 

In [114]:
pull_data("202006-citibike-tripdata.zip");

Request Success: 202006-citibike-tripdata.zip requested from Citibike S3 Bucket
Skipped: 202006-citibike-tripdata.csv already extracted from Citbike S3 Bucket 

