In [1]:
'''

@Author: Vighnesh Harish Bilgi
@Date: 2022-11-25
@Last Modified by: Vighnesh Harish Bilgi
@Last Modified time: 2022-11-25
@Title : Ideation Project - 3. Fetch output .csv files from S3 bucket and upload them to DynamoDB Tables

'''

'\n\n@Author: Vighnesh Harish Bilgi\n@Date: 2022-11-25\n@Last Modified by: Vighnesh Harish Bilgi\n@Last Modified time: 2022-11-25\n@Title : Ideation Project - 3. Fetch output .csv files from S3 bucket and and upload it to DyanmoDB Table\n\n'

In [2]:
import boto3
import pandas as pd

In [3]:
import os
# Region used by every boto3 client/resource created in this notebook.
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'
# Copy the project-specific credential variables onto the standard AWS names.
# Indexing os.environ (rather than .get) makes a missing variable fail with a
# KeyError that names it, instead of a TypeError from assigning None.
os.environ['AWS_ACCESS_KEY_ID'] = os.environ['test1_access_key']
os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ['test1_secret_access_key']

In [5]:
def connect_to_s3_client():
    """

    Description:
        To create a low-level boto3 client for the AWS S3 service.
    Parameter:
        No parameters
    Return:
        The object returned by boto3.client("s3")
    """
    return boto3.client("s3")


def connect_to_s3_resource():
    """

    Description:
        To create a boto3 resource object for the AWS S3 service.
    Parameter:
        No parameters
    Return:
        The object returned by boto3.resource(service_name = 's3')
    """
    return boto3.resource(service_name = 's3')

In [33]:
def connect_to_dynamoDB():
    """

    Description:
        To create a boto3 resource object for the AWS DynamoDB service.
    Parameter:
        No parameters
    Return:
        The object returned by boto3.resource('dynamodb')

    """
    return boto3.resource('dynamodb')

def create_dataset_items(table,dataset_records):
    """

    Description:
        To write one record of the main dataset as an item of a DynamoDB table.
    Parameter:
        table : object exposing put_item(Item=...), e.g. a dynamodb Table
        dataset_records : indexable sequence holding, in this order,
            id, userId, title, body, title_word_count, body_word_count
    Return:
        No values returned.

    """
    # Attribute names listed in the same order as the positions in dataset_records.
    attribute_names = (
        'id',
        'userId',
        'title',
        'body',
        'title_word_count',
        'body_word_count',
    )
    # Index lookup (not zip) so that a too-short record still raises IndexError.
    item = {
        attribute: dataset_records[position]
        for position, attribute in enumerate(attribute_names)
    }
    table.put_item(Item=item)

def create_word_count_items(table,dataset_records,partition_key):
    """

    Description:
        To write one word/count record as an item of a DynamoDB table.
    Parameter:
        table : object exposing put_item(Item=...), e.g. a dynamodb Table
        dataset_records : indexable sequence; position 0 is stored under
            partition_key and position 1 is stored under 'count'
        partition_key : attribute name used for the value at position 0
    Return:
        No values returned.

    """
    word, occurrences = dataset_records[0], dataset_records[1]
    item = {partition_key: word, 'count': occurrences}
    table.put_item(Item=item)

def create_dataset_table(dyDB,table_name):
    """

    Description:
        To create a dynamoDB table for the main dataset if it doesn't exist.
        The table uses 'id' (number) as HASH key and 'userId' (number) as
        RANGE key, with provisioned throughput of 5 read / 5 write units.
    Parameter:
        ServiceResource dyDB : DynamoDB resource used both to look up the
            existing tables and to create the new one
        str table_name : name of the table to create
    Return:
        no value returned

    """
    # Use the client that backs the given resource, so the existence check
    # and the create_table call go through the same connection settings.
    paginator = dyDB.meta.client.get_paginator('list_tables')

    # A single ListTables call only returns one page of names; walk every
    # page so a table listed on a later page is still detected.
    table_exists = any(
        table_name in page['TableNames']
        for page in paginator.paginate()
    )
    if table_exists:
        return

    # Create the DynamoDB table.
    dyDB.create_table(
        TableName=table_name,
        KeySchema=[
            {
                'AttributeName': 'id',
                'KeyType': 'HASH'
            },
            {
                'AttributeName': 'userId',
                'KeyType': 'RANGE'
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'id',
                'AttributeType': 'N'
            },
            {
                'AttributeName': 'userId',
                'AttributeType': 'N'
            },
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5
        }
    )

def create_word_count_table(dyDB,table_name,partition_key):
    """

    Description:
        To create a dynamoDB table for the word count datasets if it doesn't
        exist. The table has a single string HASH key named partition_key and
        provisioned throughput of 5 read / 5 write units.
    Parameter:
        ServiceResource dyDB : DynamoDB resource used both to look up the
            existing tables and to create the new one
        str table_name : name of the table to create
        str partition_key : attribute name of the string HASH key
    Return:
        no value returned

    """
    # Use the client that backs the given resource, so the existence check
    # and the create_table call go through the same connection settings.
    paginator = dyDB.meta.client.get_paginator('list_tables')

    # A single ListTables call only returns one page of names; walk every
    # page so a table listed on a later page is still detected.
    table_exists = any(
        table_name in page['TableNames']
        for page in paginator.paginate()
    )
    if table_exists:
        return

    # Create the DynamoDB table.
    dyDB.create_table(
        TableName=table_name,
        KeySchema=[
            {
                'AttributeName': partition_key,
                'KeyType': 'HASH'
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': partition_key,
                'AttributeType': 'S'
            }
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5
        }
    )

### 1. Fetching .csv files from S3 Bucket

#### 1a. Fetch dataset-output .csv file

In [28]:
# Open both S3 handles: the resource object and the low-level client.
s3 = connect_to_s3_resource()
client = connect_to_s3_client()

bucket_name = 'dataset-input-bucket'
dataset_output_key = 'data-output/part-00000-4a641a34-ee29-46d3-b598-fe45a0876cd2-c000.csv'

# Download the dataset-output .csv object and parse its body into a DataFrame.
dataset_output_obj = client.get_object(Bucket=bucket_name, Key=dataset_output_key)
dataset_output_df = pd.read_csv(dataset_output_obj['Body'])

# Preview the first five rows.
print(dataset_output_df.head(5))

   userId  id                                              title  \
0       1   1  sunt aut facere repellat provident occaecati e...   
1       1   2                                       qui est esse   
2       1   3  ea molestias quasi exercitationem repellat qui...   
3       1   4                               eum et est occaecati   
4       1   5                                 nesciunt quas odio   

                                                body  title_word_count  \
0  quia et suscipit suscipit recusandae consequun...                 9   
1  est rerum tempore vitae sequi sint nihil repre...                 3   
2  et iusto sed quo iure voluptatem occaecati omn...                 9   
3  ullam et saepe reiciendis voluptatem adipisci ...                 4   
4  repudiandae veniam quaerat sunt sed alias aut ...                 3   

   body_word_count  
0               23  
1               31  
2               26  
3               28  
4               23  


#### 1b. Fetch dataset-title-output .csv file

In [36]:
title_output_key = 'data-title-output/part-00000-81aef290-7783-40ed-984d-d8c715de3d5a-c000.csv'

# Download the dataset-title-output .csv object and parse its body into a DataFrame.
title_output_obj = client.get_object(Bucket=bucket_name, Key=title_output_key)
title_output_df = pd.read_csv(title_output_obj['Body'])

# Preview the first five rows.
print(title_output_df.head(5))

  title_word  count
0         et     29
1        aut     15
2         ut     14
3        qui     13
4       quia     12


#### 1c. Fetch dataset-body-output .csv file

In [37]:
body_output_key = 'data-body-output/part-00000-e9c70918-046e-46cb-9325-1b4e62a3d610-c000.csv'

# Download the dataset-body-output .csv object and parse its body into a DataFrame.
body_output_obj = client.get_object(Bucket=bucket_name, Key=body_output_key)
body_output_df = pd.read_csv(body_output_obj['Body'])

# Preview the first five rows.
print(body_output_df.head(5))

  body_word  count
0        et    110
1       aut     64
2       qui     61
3        ut     54
4       est     45


### 2. Uploading records of .csv files into DynamoDB

#### 2a. Uploading records of the dataset-output .csv file into a DynamoDB Table

In [34]:
dyDB = connect_to_dynamoDB()
dataset_table_name = 'dataset_output'

# Create the table when it is missing, then wait until it exists before writing.
create_dataset_table(dyDB,dataset_table_name)
table = dyDB.Table(dataset_table_name)
table.wait_until_exists()

print(f"DateTime creation of Table : {table.creation_date_time}")

# Column order matches the positions create_dataset_items reads from each record.
dataset_columns = ('id','userId','title','body','title_word_count','body_word_count')
for index,row in dataset_output_df.iterrows():
    record_as_list = [row[column] for column in dataset_columns]
    create_dataset_items(table,record_as_list)

DateTime creation of Table : 2022-11-26 14:02:53.348000+05:30


#### 2b. Uploading records of the dataset-title-output .csv file into a DynamoDB Table

In [38]:
title_table_name = 'title_count_table'
title_word_key = 'title_word'

# Create the table when it is missing, then wait until it exists before writing.
create_word_count_table(dyDB,title_table_name,title_word_key)
table = dyDB.Table(title_table_name)
table.wait_until_exists()

print(f"DateTime creation of Table : {table.creation_date_time}")

# One item per row: the word first, then its count.
for index,row in title_output_df.iterrows():
    record_as_list = [row[title_word_key], row['count']]
    create_word_count_items(table,record_as_list,title_word_key)

DateTime creation of Table : 2022-11-26 14:05:50.036000+05:30


#### 2c. Uploading records of the dataset-body-output .csv file into a DynamoDB Table

In [39]:
body_table_name = 'body_count_table'
body_word_key = 'body_word'

# Create the table when it is missing, then wait until it exists before writing.
create_word_count_table(dyDB,body_table_name,body_word_key)
table = dyDB.Table(body_table_name)
table.wait_until_exists()

print(f"DateTime creation of Table : {table.creation_date_time}")

# One item per row: the word first, then its count.
for index,row in body_output_df.iterrows():
    record_as_list = [row[body_word_key], row['count']]
    create_word_count_items(table,record_as_list,body_word_key)

DateTime creation of Table : 2022-11-26 14:10:43.770000+05:30
