In [10]:
'''

@Author: Vighnesh Harish Bilgi
@Date: 2022-11-25
@Last Modified by: Vighnesh Harish Bilgi
@Last Modified time: 2022-11-25
@Title : Ideation Project - 1. Transform JSON dataset to .csv file and upload it to an S3 Bucket

'''

'\n\n@Author: Vighnesh Harish Bilgi\n@Date: 2022-11-25\n@Last Modified by: Vighnesh Harish Bilgi\n@Last Modified time: 2022-11-25\n@Title : Ideation Project - 1. Transform JSON dataset to .csv file and upload it to an S3 Bucket\n\n'

In [11]:
import boto3
import json
import pandas as pd

In [12]:
import os


def _require_env(name):
    """
    Description:
        Returns the value of environment variable 'name'. Raises instead of
        returning None, because assigning None into os.environ fails with an
        opaque "TypeError: str expected, not NoneType".
    Parameter:
        string name
    Return:
        string value
    """
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "export it before running this notebook."
        )
    return value


# Region used by every boto3 client/resource created later in this notebook.
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'

# Copy the notebook-specific credential variables onto the standard AWS_* names.
os.environ['AWS_ACCESS_KEY_ID'] = _require_env('test1_access_key')
os.environ['AWS_SECRET_ACCESS_KEY'] = _require_env('test1_secret_access_key')

In [13]:
def read_from_json(file_name):
    """
    Description:
        Reads the JSON file at path 'file_name' and returns the parsed document.
        The file is always decoded as UTF-8 so the result does not depend on the
        platform's default locale encoding.
    Parameter:
        string file_name
    Return:
        list list_of_dict (whatever json.load produces; this notebook feeds it
        a JSON array of objects, i.e. a list of dictionaries)
    """

    with open(file_name, 'r', encoding='utf-8') as f:
        list_of_dict = json.load(f)

    return list_of_dict

def remove_newline(list_of_dict):
    """
    Description:
        Replaces every newline character in the value of the 'body' key with a
        single space. The dictionaries are modified in place and the same list
        object is returned. Dictionaries without a 'body' key, or whose 'body'
        value is not a string (e.g. a JSON null), are left unchanged.
    Parameter:
        list list_of_dict
    Return:
        list list_of_dict
    """

    for d in list_of_dict:
        # Direct lookup instead of scanning every key for 'body'.
        body = d.get('body')
        # Only strings have .replace(); skip missing or non-text values rather
        # than failing with AttributeError.
        if isinstance(body, str):
            d['body'] = body.replace("\n", " ")

    return list_of_dict


In [14]:
def connect_to_s3_client():
    """

    Description:
        Creates a low-level boto3 client for the AWS S3 service. No credentials
        or region are passed explicitly, so boto3 resolves them itself.
    Parameter:
        No parameters
    Return:
        S3 client object returned by boto3.client
    """
    return boto3.client(service_name="s3")


def connect_to_s3_resource():
    """

    Description:
        Creates a high-level boto3 resource for the AWS S3 service. No
        credentials or region are passed explicitly, so boto3 resolves them
        itself.
    Parameter:
        No parameters
    Return:
        ServiceResource s3
    """
    return boto3.resource("s3")

### 1. Transform the JSON file into a pandas DataFrame and save it as a .csv file

In [15]:
# Read JSON file and transform it into a list of dictionaries.
file_name = "posts.json"
list_of_dict = read_from_json(file_name)

# Removing newline '\n' from values of 'body' key.
list_of_dict = remove_newline(list_of_dict)

# Converting list_of_dict into dataframe
df = pd.DataFrame(list_of_dict)
print("Printing info about the dataframe:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the report.
df.info()
print("\n Printing top 10 rows about the dataframe:")
print(df.head(10))

# Write the frame with a header row and without the row index column.
df.to_csv('dataset.csv', index=False, header=True)

Printing info about the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   userId  100 non-null    int64 
 1   id      100 non-null    int64 
 2   title   100 non-null    object
 3   body    100 non-null    object
dtypes: int64(2), object(2)
memory usage: 3.2+ KB
None

 Printing top 10 rows about the dataframe:
   userId  id                                              title  \
0       1   1  sunt aut facere repellat provident occaecati e...   
1       1   2                                       qui est esse   
2       1   3  ea molestias quasi exercitationem repellat qui...   
3       1   4                               eum et est occaecati   
4       1   5                                 nesciunt quas odio   
5       1   6                 dolorem eum magni eos aperiam quia   
6       1   7                               magnam facilis autem   
7  

### 2. Upload the .csv file to an S3 bucket

In [18]:
s3 = connect_to_s3_resource()
client = connect_to_s3_client()

# Single definition of the target bucket so create and upload cannot drift apart.
bucket_name = 'dataset-input-bucket'

# creating new bucket
# No ACL is passed: the previous 'public-read-write' ACL let anyone on the
# internet read and overwrite the bucket's contents. The bucket is left private;
# grant access to consumers through IAM permissions instead.
client.create_bucket(Bucket=bucket_name)
print("Printing all bucket names to verify if - dataset-input-bucket is created:")
for bucket in s3.buckets.all():
    print(bucket.name)

# uploading csv file to the new bucket to be accessed by redshift
client.upload_file(Filename='dataset.csv', Bucket=bucket_name, Key='data-source/dataset.csv')


Printing all bucket names to verify if - dataset-input-bucket is created:
dataset-input-bucket
redshift-twitter-input-bucket
twitter-streaming-output-bucket
