In [2]:
'''

@Author: Vighnesh Harish Bilgi
@Date: 2022-12-12
@Last Modified by: Vighnesh Harish Bilgi
@Last Modified time: 2022-12-12
@Title : 1 - Upload covid-19 datasets to S3 bucket

'''

'\n\n@Author: Vighnesh Harish Bilgi\n@Date: 2022-12-12\n@Last Modified by: Vighnesh Harish Bilgi\n@Last Modified time: 2022-12-12\n@Title : 1 - Upload covid-19 datasets to S3 bucket\n\n'

In [20]:
import boto3
import pandas as pd
import json
import gzip
import shutil

In [4]:
import os
# Region used by the boto3 calls made in this notebook.
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'

# Copy the credentials from the custom 'test1_*' variables into the standard
# AWS_* variables. os.environ only accepts strings, so a missing source variable
# is reported by name instead of failing with "str expected, not NoneType".
for aws_var, source_var in (
    ('AWS_ACCESS_KEY_ID', 'test1_access_key'),
    ('AWS_SECRET_ACCESS_KEY', 'test1_secret_access_key'),
):
    credential_value = os.environ.get(source_var)
    if credential_value is None:
        raise EnvironmentError(
            f"Environment variable '{source_var}' is not set; cannot populate {aws_var}."
        )
    os.environ[aws_var] = credential_value

# Name of the bucket that every upload cell below writes to.
BUCKET_NAME = "covid19-input-bucket"

### Custom function to connect to S3 service

In [6]:
def connect_to_s3_client():
    """

    Description:
        To create a boto3 client for the AWS S3 service.
    Parameter:
        No parameters
    Return:
        The object returned by boto3.client("s3")
    """
    return boto3.client("s3")


def connect_to_s3_resource():
    """

    Description:
        To create a boto3 resource object for the AWS S3 service.
    Parameter:
        No parameters
    Return:
        ServiceResource s3
    """
    return boto3.resource('s3')

### Custom function to read a file of JSON records into a DataFrame (saved as .csv later)

In [14]:
def file_to_csv(filename):
    """

    Description:
        To read a text file holding JSON objects written one after another
        (separated only by whitespace) and load them into a DataFrame, one row
        per object. No .csv file is written here; the caller saves the
        returned DataFrame.
    Parameter:
        string filename
    Return:
        Dataframe df (empty when the file contains no '}' at all)
    Raises:
        ValueError (json.JSONDecodeError is a subclass) when the text up to the
        last '}' is not a sequence of JSON objects.
    """
    # The encoding name is spelled with ASCII hyphens; the earlier literal
    # contained an en dash (U+2013) between "8859" and "1".
    with open(filename, encoding="ISO-8859-1") as file:
        data = file.read()

    # Whatever follows the final '}' is ignored, as before.
    data = data[:data.rfind('}') + 1]

    # raw_decode parses one complete JSON value starting at a given offset and
    # reports where it ended, so nested objects and '}' characters inside
    # string values no longer break record boundaries.
    decoder = json.JSONDecoder()
    list_of_dict = []
    position = 0
    while position < len(data):
        if data[position] in " \t\n\r":
            # raw_decode does not skip leading whitespace itself.
            position += 1
            continue
        record, position = decoder.raw_decode(data, position)
        if not isinstance(record, dict):
            raise ValueError(
                f"Expected a JSON object in {filename!r}, found {type(record).__name__}"
            )
        list_of_dict.append(record)

    df = pd.DataFrame(list_of_dict)

    return df

### Creating S3 bucket

In [5]:
# Two handles are used below: the client creates the bucket, the resource lists buckets.
client = connect_to_s3_client()
s3 = connect_to_s3_resource()

# creating new bucket
client.create_bucket(Bucket=BUCKET_NAME)

# Listing every bucket lets the reader confirm the new one is present.
print(f"Printing all bucket names to verify if - {BUCKET_NAME} is created:")
for existing_bucket in s3.buckets.all():
    print(existing_bucket.name)

Printing all bucket names to verify if - covid19-input-bucket is created:
auto-load-bucket
aws-cloudtrail-logs-949401335332-4af97cdf
aws-cloudtrail-logs-949401335332-a2ad74b3
covid19-input-bucket


### Uploading each COVID dataset to the S3 bucket

#### Decompressing the 'enigma-jhud' .gz archive to extract the .csv file

In [22]:
# Raw strings keep the Windows-style backslashes literal; the plain literals used
# before relied on "\c" and "\E" not being recognised escape sequences.
gz_path = r'enigma-jhud\csv\Enigma-JHU.csv.gz.014c84bE.gz'
csv_path = r'enigma-jhud\csv\Enigma-JHU.csv.gz.014c84bE.csv'

# Decompress into a temporary ".part" file and promote it only on success, so a
# failed or truncated decompression never leaves a partial .csv at csv_path.
partial_path = csv_path + '.part'
try:
    with gzip.open(gz_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_path, csv_path)
except Exception as e:
    print(e)
    if os.path.exists(partial_path):
        os.remove(partial_path)
        print(f"Decompression of {gz_path} failed; incomplete output was discarded.")

Compressed file ended before the end-of-stream marker was reached


#### Uploading the 'enigma-jhud' file to the bucket

In [None]:
# Local path of the decompressed Enigma-JHU .csv.
file_path = r"enigma-jhud\csv\Enigma-JHU.csv.gz.014c84bE.csv"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

#### Uploading the 'us-county' dataset to the bucket

In [33]:
# Local path of the us_county .csv file.
file_path = r"enigma-nytimes-data-in-usa\csv\us_county\us_county.csv.fDde7E2b"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

#### Uploading the 'us-states' dataset to the bucket

In [34]:
# Local path of the us_states .csv file.
file_path = r"enigma-nytimes-data-in-usa\csv\us_states\us_states.csv"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

In [35]:
# Local path of the us_states .json part file.
file_path = r"enigma-nytimes-data-in-usa\json\us_states\part-00000-ef8ae09e-40cc-486f-a91a-ced8e16a03eb-c000.json"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

#### Uploading the 'states-daily' dataset to the bucket

In [36]:
# Raw string: the backslashes are path separators, not escape sequences. The plain
# literal used before only worked because "\c" and "\s" are not recognised escapes.
file_path = r"rearc-covid-19-testing-data\csv\states_daily\states_daily.csv.DD2BE07f"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = file_path.replace("\\", "/")
client.upload_file(Filename=file_path, Bucket=BUCKET_NAME, Key=file_path_key)

#### Uploading the 'us-daily' dataset to the bucket

In [37]:
# Local path of the us_daily .csv file.
file_path = r"rearc-covid-19-testing-data\csv\us_daily\us_daily.csv"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

#### Uploading the 'us-total-latest' dataset to the bucket

In [38]:
# Local path of the us-total-latest .csv file.
file_path = r"rearc-covid-19-testing-data\csv\us-total-latest\us.csv"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

In [39]:
# Raw string: the backslashes are path separators, not escape sequences. The plain
# literal used before only worked because "\j", "\s" and "\p" are not recognised escapes.
file_path = r"rearc-covid-19-testing-data\json\states_daily\part-00000-349d9bd9-a37f-4ac2-bcad-da29873fdf10-c000.json.93db33f9"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = file_path.replace("\\", "/")
client.upload_file(Filename=file_path, Bucket=BUCKET_NAME, Key=file_path_key)

In [40]:
# Local path of the us_daily .json part file.
file_path = r"rearc-covid-19-testing-data\json\us_daily\part-00000-41160f2e-4b45-479b-ae5e-acbc0a0026a5-c000.json"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

In [41]:
# Local path of the us-total-latest .json part file.
file_path = r"rearc-covid-19-testing-data\json\us-total-latest\part-00000-f2e315e2-6055-4fef-98d1-1e620b265158-c000.json"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

#### Uploading the 'usa-hospital-beds' dataset to the bucket

In [59]:
# Load the hospital-beds records into a DataFrame.
file_path = r"rearc-usa-hospital-beds\usa-hospital-beds.geojson.4cCa297c"
df = file_to_csv(file_path)

# The .csv is written next to the source file, and that .csv is what gets uploaded.
file_path = file_path + ".csv"
df.to_csv(file_path, index=False, header=True)
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = "/".join(file_path.split("\\"))
client.upload_file(file_path, BUCKET_NAME, file_path_key)

#### Uploading the 'us-countrycode' dataset to the bucket

In [43]:
# Raw string: the backslashes are path separators, not escape sequences. The plain
# literal used before only worked because "\c" and "\C" are not recognised escapes.
file_path = r"static-datasets\csv\countrycode\CountryCodeQS.csv"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = file_path.replace("\\", "/")
client.upload_file(Filename=file_path, Bucket=BUCKET_NAME, Key=file_path_key)

#### Uploading the 'countypopulation' dataset to the bucket

In [44]:
# Raw string: the backslashes are path separators, not escape sequences. The plain
# literal used before only worked because "\c" and "\C" are not recognised escapes.
file_path = r"static-datasets\csv\CountyPopulation\County_Population.csv"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = file_path.replace("\\", "/")
client.upload_file(Filename=file_path, Bucket=BUCKET_NAME, Key=file_path_key)

#### Uploading the 'state-abv' dataset to the bucket

In [48]:
# Raw string: the backslashes are path separators, not escape sequences. The plain
# literal used before only worked because "\c" and "\s" are not recognised escapes.
file_path = r"static-datasets\csv\state-abv\states_abv.csv"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = file_path.replace("\\", "/")
client.upload_file(Filename=file_path, Bucket=BUCKET_NAME, Key=file_path_key)

In [49]:
# Raw string: the backslashes are path separators, not escape sequences. The plain
# literal used before only worked because "\j", "\c" and "\p" are not recognised escapes.
file_path = r"static-datasets\json\countrycode\part-00000-eb56cbf7-e81a-463b-99a0-fee413e79475-c000.json"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = file_path.replace("\\", "/")
client.upload_file(Filename=file_path, Bucket=BUCKET_NAME, Key=file_path_key)

In [50]:
# Raw string: the backslashes are path separators, not escape sequences. The plain
# literal used before only worked because "\j", "\s" and "\p" are not recognised escapes.
file_path = r"static-datasets\json\state-abv\part-00000-7354f86c-52dd-4eb7-9ad4-5a4425778ea3-c000.json"
# The object key is the same path with forward slashes instead of backslashes.
file_path_key = file_path.replace("\\", "/")
client.upload_file(Filename=file_path, Bucket=BUCKET_NAME, Key=file_path_key)