## Read multiple s3 objects

Let us understand how we can read multiple S3 objects into a collection.
* Create a client using the appropriate profile.
* List up to 10 objects using the client.
* Read the contents of all the listed objects into a collection.

In [1]:
import boto3

In [2]:
import os
# Use the 'itvgenlogs' profile unless AWS_PROFILE is already set in the environment.
# setdefault returns the value that is in effect, which the notebook displays.
os.environ.setdefault('AWS_PROFILE', 'itvgenlogs')

'itvgenlogs'

In [3]:
# Create the S3 client; it is reused by every listing and download below.
s3_client = boto3.client(service_name='s3')

In [4]:
# List at most 10 objects whose keys start with 'logs/year'.
# The response is a dict; the per-object entries are under 'Contents'.
s3_objects = s3_client.list_objects(
    Bucket='itv-genlogs-mana00', Prefix='logs/year', MaxKeys=10,
)

In [5]:
# Show the metadata entries (key, size, timestamps, ...) of the listed objects.
s3_objects['Contents']

[{'Key': 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-51-20-2b9337a1-7e5c-4a5d-84cc-3587b8d40e07',
  'LastModified': datetime.datetime(2024, 2, 16, 12, 52, 22, tzinfo=tzutc()),
  'ETag': '"d7592fc75dbb74e210dfe695cab29f83"',
  'Size': 24450,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'laiaddara',
   'ID': '709f2485bbc57aa0687e130826e0d8c48d3beaba7e7f08305a5a39db5536f4f3'}},
 {'Key': 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-53-21-23cefb72-3c41-4dd8-8da0-0597a1b0494d',
  'LastModified': datetime.datetime(2024, 2, 16, 12, 54, 22, tzinfo=tzutc()),
  'ETag': '"8eb5281271d1bcfc800c8893036a6a94"',
  'Size': 12509,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'laiaddara',
   'ID': '709f2485bbc57aa0687e130826e0d8c48d3beaba7e7f08305a5a39db5536f4f3'}},
 {'Key': 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-54-22-18f05e40-b7a1-452c-8bf8-d059b0913ef5',
  'LastModified': datetime.datetime(2024, 2, 16, 12, 55, 23, tzinfo=tzutc(

In [8]:
# Method 1 to get keys collection: list comprehension.
# The response has no 'Contents' entry when nothing matches the prefix,
# so fall back to an empty list instead of raising KeyError.
s3_object_keys = [s3_object['Key'] for s3_object in s3_objects.get('Contents', [])]

In [9]:
# Display the collected object keys.
s3_object_keys

['logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-51-20-2b9337a1-7e5c-4a5d-84cc-3587b8d40e07',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-53-21-23cefb72-3c41-4dd8-8da0-0597a1b0494d',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-54-22-18f05e40-b7a1-452c-8bf8-d059b0913ef5',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-55-23-d9eae8b4-a812-4f2e-9f5d-a5ff076d3eec',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-56-24-5d6ec72b-f34e-495d-9635-1b99392e7df4',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-57-25-b22faaa6-8fbf-4f5f-88ea-1b643daf555b',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-58-26-80ac9ca0-1fe2-446b-b0fc-ad0ba6a991a4']

In [10]:
# Method 2 to get keys collection: explicit loop.
s3_object_keys = []

# 'Contents' is absent from the response when no object matches the prefix;
# default to an empty list so the loop simply does nothing in that case.
for s3_object in s3_objects.get('Contents', []):
    s3_object_key = s3_object['Key']
    s3_object_keys.append(s3_object_key)

In [11]:
# Display only the first three keys.
s3_object_keys[:3]

['logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-51-20-2b9337a1-7e5c-4a5d-84cc-3587b8d40e07',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-53-21-23cefb72-3c41-4dd8-8da0-0597a1b0494d',
 'logs/year=2024/month=02/day=16/gen_logs_s3-1-2024-02-16-12-54-22-18f05e40-b7a1-452c-8bf8-d059b0913ef5']

### 1. Read Single Object Content

In [12]:
import os

import boto3

# Keep any AWS_PROFILE that is already set; otherwise use 'itvgenlogs'.
os.environ.setdefault('AWS_PROFILE', 'itvgenlogs')

s3_client = boto3.client('s3')

# Only the first key is used below, so request a single entry rather than
# a full page of listing results.
s3_objects = s3_client.list_objects(
    Bucket='itv-genlogs-mana00',
    Prefix='logs/year',
    MaxKeys=1
)

# Take the key of the first listed object and download that object.
s3_object_key = s3_objects['Contents'][0]['Key']
s3_object = s3_client.get_object(
    Bucket='itv-genlogs-mana00',
    Key=s3_object_key
)

# s3_object['Body'] is of type botocore.response.StreamingBody;
# read() returns the raw bytes, which are then decoded as UTF-8 text.
file_contents = s3_object['Body'].read().decode('utf-8')
# Number of lines in the downloaded object.
len(file_contents.splitlines())

122

### 2. Read Multiple Object Content

In [13]:
# `+` on two lists builds a new list holding the elements of both, in order.
l1, l2 = [1, 2, 3], [3, 4, 5]
l1 + l2

[1, 2, 3, 3, 4, 5]

In [14]:
# Accumulate the text lines of every listed object into one flat list.
data = []
for s3_object_key in s3_object_keys:
    # Download the object, decode its bytes as UTF-8 and split into lines.
    s3_object_contents = (
        s3_client
        .get_object(Bucket='itv-genlogs-mana00', Key=s3_object_key)['Body']
        .read()
        .decode('utf-8')
        .splitlines()
    )
    # Append this object's lines to the running collection.
    data += s3_object_contents

In [15]:
# Total number of lines accumulated in data.
len(data)

488

In [16]:
# Preview the first three lines.
data[:3]

['165.220.71.68 - - [16/Feb/2024:12:50:42 -0800] "GET /department/team%20sports/categories HTTP/1.1" 200 1702 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"',
 '45.171.150.225 - - [16/Feb/2024:12:50:30 -0800] "GET /department/team%20sports/categories HTTP/1.1" 200 2100 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.77.4 (KHTML, like Gecko) Version/7.0.5 Safari/537.77.4"',
 '115.93.215.91 - - [16/Feb/2024:12:50:25 -0800] "GET /categories/indoor/outdoor%20games/products HTTP/1.1" 200 750 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"']

In [None]:
# You can further process the data and store it in a database or a file.