# Generate .lst File
***

This notebook is for generating an lst file. This file is used for some Sagemaker implementations of image classifier training.

Format of an lst is:

Index \t Class \t File Path

In [2]:
import os
from pathlib import Path
import boto3

def get_all_s3_objects(s3, **base_kwargs):
    continuation_token = None
    contents = []
    while True:
        list_kwargs = dict(MaxKeys=1000, **base_kwargs)
        if continuation_token:
            list_kwargs['ContinuationToken'] = continuation_token
        response = s3.list_objects_v2(**list_kwargs)
        contents += (response.get('Contents', []))
        if not response.get('IsTruncated'):  # At the end of the list?
            break
        continuation_token = response.get('NextContinuationToken')
    return contents

In [3]:
s3_client = boto3.client("s3")
bucket_name = "lantern-rd-east2"
slfFiles = get_all_s3_objects(s3_client, Bucket=bucket_name, Prefix="slf")
notSlfFiles = get_all_s3_objects(s3_client, Bucket=bucket_name, Prefix="not-slf")

In [4]:
print(len(slfFiles))
print(len(notSlfFiles))

1339
2812


In [5]:
for file in slfFiles:
    print(f"File_name: {file['Key']}, size: {file['Size']}")

File_name: slf/0.jpg, size: 63676
File_name: slf/10.jpg, size: 31591
File_name: slf/100.jpg, size: 96950
File_name: slf/1000.jpg, size: 34508
File_name: slf/1001.jpg, size: 37173
File_name: slf/1002.jpg, size: 37173
File_name: slf/1003.jpg, size: 35424
File_name: slf/1004.jpg, size: 35424
File_name: slf/1005.jpg, size: 13825
File_name: slf/1006.jpg, size: 13166
File_name: slf/1007.jpg, size: 30758
File_name: slf/1008.jpg, size: 30758
File_name: slf/1009.jpg, size: 88909
File_name: slf/101.jpg, size: 12002
File_name: slf/1010.jpg, size: 88345
File_name: slf/1011.jpg, size: 20835
File_name: slf/1012.jpg, size: 20835
File_name: slf/1013.jpg, size: 2697
File_name: slf/1014.jpg, size: 2697
File_name: slf/1015.jpg, size: 240571
File_name: slf/1016.jpg, size: 240571
File_name: slf/1017.jpg, size: 39503
File_name: slf/1018.jpg, size: 39503
File_name: slf/1019.jpg, size: 8538
File_name: slf/1020.jpg, size: 8538
File_name: slf/1021.jpg, size: 10245
File_name: slf/1022.jpg, size: 10245
File_name:

In [6]:
slfFiles[0]

{'Key': 'slf/0.jpg',
 'LastModified': datetime.datetime(2023, 1, 15, 16, 40, 30, tzinfo=tzlocal()),
 'ETag': '"e624b1cebd6625268504067d3235a355"',
 'Size': 63676,
 'StorageClass': 'STANDARD'}

In [None]:
fileNum = 0
while os.path.exists("lst-file%s.lst" % fileNum):
    fileNum += 1

output = open("lst-file%s.lst" % fileNum, "w")
    

for i in range(0, len(slfFiles)):
    output.write(str(i) + "\t1\t" + slfFiles[i]['Key'] + '\n')
print("Finished slf: " + str(len(slfFiles)))
for i in range(len(slfFiles), len(notSlfFiles)):
    output.write(str(i) + "\t0\t" + notSlfFiles[i]['Key'] + '\n')
print("Finished not-slf: " + str(len(notSlfFiles)))
output.close()


In [None]:
missing = 0
for i in range(0, len(slfFiles)):
    match = True
    for j in range(0, len(slfFiles)):
        if i == int(slfFiles[j]['Key'][4:-4]):
            #print(slfFiles[j]['Key'] + " matches with " + str(i))
            match = True
            break
        else: match = False
    if not match:    
        print("Couldn't find " + str(i))
        missing += 1
print("Couldn't find " + str(missing) + " file")
        