## Grabbing the Data From the CitiBike S3 Bucket

In [72]:
import pandas as pd
import requests, glob, zipfile, io   # Needed to pull data from CitiBike S3 bucket
import os

In [73]:
# Base URL that trip-data archive filenames are appended to when downloading.
CITIBIKE_DATA_FOLDER = "https://s3.amazonaws.com/tripdata/"    
# Local staging directory: archives are extracted here, uploaded from here, and
# the directory is removed at the end of the notebook.
# NOTE(review): the name is misspelled ("FIOLDER") but is referenced throughout
# the notebook, so it must be renamed everywhere at once. The absolute /root path
# also ties the notebook to one machine.
TEMP_DATA_FIOLDER = "/root/Citi-Bike-Expansion/TempTripData/"

In [74]:
# Create the staging directory (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of os.path.exists().
os.makedirs(TEMP_DATA_FIOLDER, exist_ok=True)

In [75]:
def pull_data(filename, timeout=60):
    """Download one trip-data archive and extract its first member locally.

    The archive is requested from CITIBIKE_DATA_FOLDER under `filename`; if that
    request fails with an HTTP error, the same name with ".csv" inserted before
    the extension (e.g. "x.zip" -> "x.csv.zip") is tried once.

    Parameters
    ----------
    filename : str
        Archive name relative to CITIBIKE_DATA_FOLDER, ending in a
        three-character extension such as ".zip".
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout, so a stalled connection
        fails instead of hanging the notebook.

    Returns
    -------
    bool
        True if a file was extracted into TEMP_DATA_FIOLDER. False if the
        download failed, the payload was not a usable zip archive, or the
        extracted file already exists.
    """
    # Candidate object names, tried in order.
    candidates = [filename, filename[:-4] + '.csv.' + filename[-3:]]

    response = None
    last_http_error = None
    for candidate in candidates:
        try:
            attempt = requests.get(CITIBIKE_DATA_FOLDER + candidate, timeout=timeout)
            attempt.raise_for_status()
        except requests.exceptions.HTTPError as errh:
            # The name may simply not exist; remember the error and try the next one.
            last_http_error = errh
        except requests.exceptions.RequestException as err:
            # Connection errors, timeouts, etc. A different name will not help.
            print(err)
            return False
        else:
            print(f"Request Success: {candidate} requested from Citibike S3 Bucket")
            response = attempt
            break

    if response is None:
        print(last_http_error)
        return False

    try:
        # response.content is the full body as bytes; wrap it so ZipFile can seek in it.
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
            members = archive.namelist()
            if not members:
                print(f"Skipped: {candidate} contains no files \n")
                return False

            # Per the original author's note, the actual data file is always the
            # first entry in the archive.
            datafile = members[0]

            if os.path.exists(TEMP_DATA_FIOLDER + datafile):
                print(f"Skipped: {datafile} already extracted from Citbike S3 Bucket \n")
                return False

            archive.extract(datafile, path=TEMP_DATA_FIOLDER)
    except zipfile.BadZipFile as errz:
        print(errz)
        return False

    print(f"Extract Success: {datafile} unzipped and uploaded to {TEMP_DATA_FIOLDER} \n")
    return True

In [76]:
# Years and zero-padded months (as strings) used to build the archive filenames.
yearlist = [str(year_number) for year_number in range(2013, 2021)]
monthlist = [f"{month_number:02d}" for month_number in range(1, 13)]

In [77]:
# One archive name per (year, month) pair, in chronological order.
tripdata_archives = [
    f"{year}{month}-citibike-tripdata.zip"
    for year in yearlist
    for month in monthlist
]

# Months with no published archive just print the HTTP error and are skipped.
for archive_name in tripdata_archives:
    pull_data(archive_name)

404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201301-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201302-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201303-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201304-citibike-tripdata.csv.zip
404 Client Error: Not Found for url: https://s3.amazonaws.com/tripdata/201305-citibike-tripdata.csv.zip
Request Success: 201306-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract Success: 201306-citibike-tripdata.csv unzipped and uploaded to /root/Citi-Bike-Expansion/TempTripData/ 

Request Success: 201307-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract Success: 2013-07 - Citi Bike trip data.csv unzipped and uploaded to /root/Citi-Bike-Expansion/TempTripData/ 

Request Success: 201308-citibike-tripdata.zip requested from Citibike S3 Bucket
Extract S

## Upload Data to Personal S3 Bucket

In [78]:
import boto3
import shutil

In [79]:
# SECURITY: credentials must never be hardcoded in a notebook. The key pair that
# was previously committed here should be treated as compromised and rotated.
# Read them from the environment instead; when a variable is unset the value is
# None and boto3 falls back to its default credential chain (shared credentials
# file, instance role, ...).
ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
ACCESS_SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')

s3 = boto3.resource(
     's3',
     aws_access_key_id = ACCESS_KEY_ID,
     aws_secret_access_key = ACCESS_SECRET_KEY
)

bucket = 'williams-citibike' # the bucket in s3
prefix = 'TripData' # the folder in the bucket

In [80]:
# Every entry in the staging directory, in sorted order.
filenames = sorted(os.listdir(TEMP_DATA_FIOLDER))

In [81]:
# Upload every staged file to s3://<bucket>/<prefix>/<filename>.
# S3 object keys always use "/" as the separator, so the key is built explicitly
# rather than with os.path.join (which would produce "\" on Windows).
tripdata_bucket = s3.Bucket(bucket)
for key in filenames:
    tripdata_bucket.Object(f"{prefix}/{key}").upload_file(os.path.join(TEMP_DATA_FIOLDER, key))

In [82]:
# Delete the local staging directory and everything in it. This is irreversible,
# so run it only after the upload cell above has completed without errors.
shutil.rmtree(TEMP_DATA_FIOLDER)