In [1]:
import csv
import io
import os

import boto3
import snowflake.connector

In [2]:
# Configuration cell: credentials, target locations and the shared service
# clients (`conn`, `cur`, `s3`) used by every later cell.
#
# Secrets are read from the environment instead of being hardcoded so they
# never end up in the saved notebook or in version control. A missing variable
# raises KeyError naming the variable, which fails fast before any network call.
aws_access_key = os.environ['AWS_ACCESS_KEY_ID']
aws_secret_key = os.environ['AWS_SECRET_ACCESS_KEY']
aws_s3_bucket_name = os.environ['AWS_S3_BUCKET_NAME']
# Key prefix that receives the (possibly chunked) CSV files.
aws_s3_object = 'chunked_csv/'

# Non-secret Snowflake settings keep their previous values as defaults but can
# be overridden per environment.
snowflake_account = os.environ.get('SNOWFLAKE_ACCOUNT', 'kd66798.ca-central-1.aws')
snowflake_user = os.environ.get('SNOWFLAKE_USER', 'PRAVEEN11001')
snowflake_password = os.environ['SNOWFLAKE_PASSWORD']
snowflake_database = 'DEMO'
snowflake_schema = 'S3_OBJECTS'

# Open the Snowflake session against the target database/schema.
conn = snowflake.connector.connect(
    user=snowflake_user,
    password=snowflake_password,
    account=snowflake_account,
    database=snowflake_database,
    schema=snowflake_schema
)

cur = conn.cursor()
# The keys are passed explicitly because a later cell also embeds them in the
# Snowflake COPY statement.
s3 = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)

In [3]:
# Copy every CSV found under SOURCE_PREFIX into the `aws_s3_object` prefix.
# Objects larger than CHUNK_SIZE_BYTES are split into several CSV files, each
# of which repeats the header row; smaller objects are copied unchanged.

# Objects larger than this many bytes are split into smaller CSV files.
CHUNK_SIZE_BYTES = 10 * 1024
# Key prefix under which the source CSV files are listed.
SOURCE_PREFIX = 'xmls'


def _csv_line(fields):
    """Serialize one row as a CSV line (newline-terminated), quoting where needed."""
    buffer = io.StringIO()
    csv.writer(buffer, lineterminator='\n').writerow(fields)
    return buffer.getvalue()


def _split_csv_into_chunks(file_content, chunk_size):
    """Split CSV text into chunks of at most `chunk_size` UTF-8 bytes.

    Every chunk starts with the header row. Rows are re-serialized with
    `csv.writer`, so fields containing commas, quotes or newlines stay intact.
    A single row that is larger than `chunk_size` is emitted in a chunk of its
    own (together with the header) rather than being dropped or cut.

    Args:
        file_content: Full CSV document as a string.
        chunk_size: Target upper bound, in bytes, for each chunk.

    Returns:
        A list of CSV strings; empty if `file_content` has no header row.
    """
    reader = csv.reader(io.StringIO(file_content, newline=''))
    try:
        header_line = _csv_line(next(reader))
    except StopIteration:
        return []
    header_size = len(header_line.encode('utf-8'))

    chunks = []
    current_lines = [header_line]
    current_size = header_size
    for row in reader:
        row_line = _csv_line(row)
        row_size = len(row_line.encode('utf-8'))
        # Only close the current chunk if it already holds a data row, so a
        # header-only chunk is never produced.
        if current_size + row_size > chunk_size and len(current_lines) > 1:
            chunks.append(''.join(current_lines))
            current_lines = [header_line]
            current_size = header_size
        current_lines.append(row_line)
        current_size += row_size

    chunks.append(''.join(current_lines))
    return chunks


# list_objects_v2 returns a limited number of keys per call, so iterate over
# all pages; pages without a 'Contents' entry simply contribute nothing.
source_pages = s3.get_paginator('list_objects_v2').paginate(
    Bucket=aws_s3_bucket_name, Prefix=SOURCE_PREFIX
)

for page in source_pages:
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.endswith('.csv'):
            continue

        # Use the last path component so keys at any depth are handled.
        file_name = key.split('/')[-1]
        source_object = s3.get_object(Bucket=aws_s3_bucket_name, Key=key)
        file_content = source_object['Body'].read().decode('utf-8')

        if obj['Size'] > CHUNK_SIZE_BYTES:
            # Text before the first '.' is the stem; the table-loading cell
            # derives table names the same way.
            stem = file_name.split('.')[0]
            partitions = _split_csv_into_chunks(file_content, CHUNK_SIZE_BYTES)
            for i, partition_data in enumerate(partitions):
                partition_key = f"{aws_s3_object}{stem}_{i}.csv".replace('-', '_')
                s3.put_object(Bucket=aws_s3_bucket_name, Key=partition_key, Body=partition_data.encode('utf-8'))
                print(f"Uploaded partition {i + 1} of {key} to S3 with key: {partition_key}")
        else:
            target_key = f"{aws_s3_object}{file_name}".replace('-', '_')
            s3.put_object(Bucket=aws_s3_bucket_name, Key=target_key, Body=file_content.encode('utf-8'))
            print(f"Uploaded {target_key} to S3.")

Uploaded chunked_csv/aws_11001_credentials.csv to S3.
Uploaded chunked_csv/my_data.csv to S3.
Uploaded partition 1 of xmls/student-dataset.csv to S3 with key: chunked_csv/student_dataset_0.csv
Uploaded partition 2 of xmls/student-dataset.csv to S3 with key: chunked_csv/student_dataset_1.csv
Uploaded partition 3 of xmls/student-dataset.csv to S3 with key: chunked_csv/student_dataset_2.csv


In [None]:
def delete_s3_objects(bucket_name, prefix, s3_client=None):
    """Delete every object in `bucket_name` whose key starts with `prefix`.

    The listing is paginated, so all matching objects are removed rather than
    only those returned by the first list call.

    Args:
        bucket_name: Name of the S3 bucket to clean up.
        prefix: Key prefix selecting the objects to delete.
        s3_client: Optional S3 client to use; defaults to the notebook-level
            `s3` client.

    Returns:
        None. Progress is printed, and any exception is reported via print
        instead of being raised (best-effort cleanup).
    """
    try:
        client = s3 if s3_client is None else s3_client
        found_any = False
        paginator = client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            # Pages without a 'Contents' entry contribute no keys.
            for obj in page.get('Contents', []):
                found_any = True
                client.delete_object(Bucket=bucket_name, Key=obj['Key'])
                print(f"Deleted object: {obj['Key']}")
        if not found_any:
            print(f"No objects found with prefix: {prefix}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [None]:
# For every CSV under the `aws_s3_object` prefix: create (or replace) a
# Snowflake table named after the file, with one VARCHAR column per CSV header
# field, then bulk-load the file with COPY INTO. The cursor and connection are
# closed when the cell finishes, whether or not a statement failed.


def _snowflake_identifier(name):
    """Turn `name` into a valid unquoted identifier.

    Characters other than ASCII letters, digits and '_' become '_', and a
    leading '_' is added when the result is empty or starts with a digit.
    Names that are already valid are returned unchanged.
    """
    cleaned = ''.join(
        ch if (ch.isascii() and ch.isalnum()) or ch == '_' else '_'
        for ch in name
    )
    if not cleaned or cleaned[0].isdigit():
        cleaned = '_' + cleaned
    return cleaned


try:
    # Iterate over all listing pages; pages without 'Contents' yield nothing.
    staged_pages = s3.get_paginator('list_objects_v2').paginate(
        Bucket=aws_s3_bucket_name, Prefix=aws_s3_object
    )
    for page in staged_pages:
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith('.csv'):
                continue

            table_name = _snowflake_identifier(key.split('/')[-1].split('.')[0])

            staged_object = s3.get_object(Bucket=aws_s3_bucket_name, Key=key)
            # 'utf-8-sig' drops a leading byte-order mark so it does not become
            # part of the first column name.
            csv_text = staged_object['Body'].read().decode('utf-8-sig')
            # Parse the header with the csv module so quoted header fields
            # (e.g. containing commas) are split correctly.
            header_row = next(csv.reader(io.StringIO(csv_text, newline='')), [])
            column_names = [header.strip() for header in header_row]
            if not any(column_names):
                print(f"Skipping {key}: no header row found")
                continue

            print(f"Creating table for {table_name}")
            # Double any embedded '"' so the quoted column identifier stays valid.
            column_defs = ','.join(
                '"{}" VARCHAR'.format(name.replace('"', '""')) for name in column_names
            )
            create_table_query = f"CREATE OR REPLACE TABLE {table_name} ({column_defs})"
            print(create_table_query)
            cur.execute(create_table_query)

            # NOTE: the COPY statement embeds the AWS keys, so it is deliberately
            # not printed. A Snowflake storage integration would avoid sending
            # keys in SQL at all.
            copy_query = (
                f"COPY INTO {table_name} FROM "
                f"'s3://{aws_s3_bucket_name}/{key}' "
                f"CREDENTIALS=(AWS_KEY_ID='{aws_access_key}' AWS_SECRET_KEY='{aws_secret_key}') "
                "FILE_FORMAT=(TYPE=CSV FIELD_OPTIONALLY_ENCLOSED_BY='\"' SKIP_HEADER=1)"
            )
            print(f"Copying data into {table_name}")
            cur.execute(copy_query)
finally:
    # Release the Snowflake session even when a statement above raised.
    cur.close()
    conn.close()

In [None]:
# Clean-up: remove the staged CSV files under the chunk prefix from the bucket.
delete_s3_objects(
    bucket_name=aws_s3_bucket_name,
    prefix=aws_s3_object,
)