In [1]:
from io import BytesIO, StringIO
import zipfile
import requests as r 
import pandas as pd
import boto3

# Name of the S3 bucket every cell in this notebook reads from and writes to
bucket = 'team-3-project-data'

# Two handles onto S3: a high-level resource and a low-level client.
# NOTE: despite its name, `s3_session` is a client object, not a boto3 Session.
s3_resource = boto3.resource("s3")
s3_session = boto3.Session().client("s3")

In [7]:
# Per-year codes that appear in each district court archive's file name
district_data_links = {2010: '8XEDGA',
                       2011: '1JEKQB',
                       2012: 'F6G5KD',
                       2013: '5IBSOL',
                       2014: '5IDVZ7',
                       2015: 'WZ57WS',
                       2016: 'C4703U',
                       2017: 'QCO2KE',
                       2018: 'S72VOG',
                       2019: 'I24Z9B',
                       2020: 'PFP3JS'}

# District Base URL (placeholders: the year, then that year's code)
district_base_url = "https://s3.amazonaws.com/virginia-court-data/district_criminal_{}_anon_{}.zip"

# (connect, read) timeouts in seconds, so a stalled download raises instead of
# hanging the kernel forever
request_timeout = (10, 300)

# Temp Storage (unused by this cell; kept in case another cell references it)
data_dict = {}

# Iterate through each year's data
for year, code in district_data_links.items():

    # Fetch the zip archive for this year
    temp_response = r.get(district_base_url.format(year, code), timeout=request_timeout)

    # Fail fast on an HTTP error; otherwise the error body would be handed to
    # ZipFile and surface as a misleading BadZipFile
    temp_response.raise_for_status()

    # Open the archive in memory; the context manager closes it when we are done
    with zipfile.ZipFile(BytesIO(temp_response.content)) as temp_zip_file:

        # Process every member of the archive
        for name in temp_zip_file.namelist():

            # Read the member as CSV and close its handle straight away
            with temp_zip_file.open(name) as member:
                temp_stream = pd.read_csv(member, low_memory=False)

            # Only keep Misdemeanors and Felonies
            filtered = temp_stream[temp_stream.CaseType.isin(['Misdemeanor','Felony'])]

            # Serialise the filtered rows to an in-memory CSV and write it to S3
            with StringIO() as csv_buffer:
                filtered.to_csv(csv_buffer, index=False)
                response = s3_resource.Object(bucket, 'raw-data/{}'.format(name)).put(Body=csv_buffer.getvalue())

            status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

            # Print whether it was successful
            if status == 200:
                print(f"Successful S3 put_object - ({name}) - response. Status - {status}")
            else:
                print(f"Unsuccessful S3 put_object - ({name}) -  response. Status - {status}")


2010
district_criminal_2010_anon_00.csv
Successful S3 put_object - (district_criminal_2010_anon_00.csv) - response. Status - 200
district_criminal_2010_anon_01.csv
Successful S3 put_object - (district_criminal_2010_anon_01.csv) - response. Status - 200
district_criminal_2010_anon_02.csv
Successful S3 put_object - (district_criminal_2010_anon_02.csv) - response. Status - 200
district_criminal_2010_anon_03.csv
Successful S3 put_object - (district_criminal_2010_anon_03.csv) - response. Status - 200
district_criminal_2010_anon_04.csv
Successful S3 put_object - (district_criminal_2010_anon_04.csv) - response. Status - 200
district_criminal_2010_anon_05.csv
Successful S3 put_object - (district_criminal_2010_anon_05.csv) - response. Status - 200
district_criminal_2010_anon_06.csv
Successful S3 put_object - (district_criminal_2010_anon_06.csv) - response. Status - 200
district_criminal_2010_anon_07.csv
Successful S3 put_object - (district_criminal_2010_anon_07.csv) - response. Status - 200
dis

In [2]:
# Get the keys of all non-empty objects whose key contains 'raw-data/'.
# list_objects_v2 returns at most 1,000 keys per call, so page through every
# result; a page with no matching objects has no 'Contents' entry, hence .get().
district_files = [
    file['Key']
    for page in s3_session.get_paginator('list_objects_v2').paginate(Bucket=bucket)
    for file in page.get('Contents', [])
    if ('raw-data/' in file['Key'] and file['Size'] > 0)
]

In [3]:
# Read every raw-data CSV from S3 into its own DataFrame
frames = [
    pd.read_csv(s3_session.get_object(Bucket=bucket, Key=file)['Body'], low_memory=False)
    for file in district_files
]

# Concatenate once: calling pd.concat inside the loop re-copies the accumulated
# frame on every iteration. pd.concat raises on an empty list, so fall back to
# an empty DataFrame when there are no files.
df = pd.concat(frames) if frames else pd.DataFrame()

: 

: 

In [4]:
# Save the combined DataFrame to a local CSV file, leaving out the index column
df.to_csv(path_or_buf='district_misdemeanor_felony.csv', index=False)

In [6]:
# Copy the local CSV into the project bucket under the same object key
s3_session.upload_file(
    Filename='district_misdemeanor_felony.csv',
    Bucket=bucket,
    Key='district_misdemeanor_felony.csv',
)