In [None]:
"""
CV4E Project 1: Preparing Data for Machine Learning

This project is a first iteration of preparing data for use in a ResNet machine learning model.
The data are stored as a .json file that was downloaded from BIIGLE and is located at:
/Users/talenrimmer/Desktop/CV4E_Code/data/21209-lazo-1-2024-05-22.json

The goals of this project are to:
- Teach a whole-image classifier to identify images that contain a specific class (forage fish)
(If the above is successful, then):
a) quantify forage fish into a density estimate (small, medium, large schools)
b) identify species of forage fish

"""

In [None]:
# Load every .json export found in the data directory with json.load.
# After the loop, `data` holds the contents of the LAST file processed; the
# cells below build their dataframes from that single dictionary.
import os
import json
directory = "/Users/talenrimmer/Desktop/CV4E_Code/data/"
# os.listdir returns entries in arbitrary order, so sort the listing: this makes
# the file that ends up in `data` the same on every run.
files = sorted(os.listdir(directory))
json_files = [name for name in files if name.endswith(".json")]
if not json_files:
    # Fail here with a clear message rather than with a NameError on `data` later on.
    raise FileNotFoundError(f"No .json files found in {directory}")
# Parse each .json file and print its structure.
for json_name in json_files:
    # Use a separate name for the handle so the loop variable is not shadowed.
    with open(os.path.join(directory, json_name), 'r') as json_file:
        data = json.load(json_file)
    print(json.dumps(data, indent=4))

In [19]:
# Below is the old code, that I use to open a single file:

# Import the json library and load the json file from my local directory
# import json
# with open("/Users/talenrimmer/Desktop/CV4E_Code/data/21209-lazo-1-2024-05-22.json",'r') as file:
#     data = json.load(file)

    # Print the JSON structure
### print(json.dumps(data["annotations"], indent=4))


In [20]:
# The loaded .json is a dictionary with three top-level keys (images, annotations,
# categories). Each one becomes its own pandas dataframe.

import pandas as pd

# Rename the generic 'id' column of every frame so that the column name says which
# id it is. rename() returns a new frame, so no in-place mutation is needed.
imagedf = pd.DataFrame(data["images"]).rename(columns={'id': 'image_id'})

annotationsdf = pd.DataFrame(data["annotations"]).rename(columns={'id': 'annotations_id'})

categoriesdf = pd.DataFrame(data["categories"]).rename(columns={'id': 'category_id'})

# Print the first 5 rows of the categories and the annotation column names.
# These are printed explicitly because a notebook cell only renders its final
# expression; a bare expression in the middle of a cell is evaluated and discarded.
print(categoriesdf.head())
print(annotationsdf.columns)


In [None]:
# Outer-join the annotations with the image table on image_id, so that rows from
# both sides are kept even when they have no partner.
merged_df = annotationsdf.merge(imagedf, how='outer', on='image_id')

# Number of rows in the joined frame (rendered as the cell output).
len(merged_df)

In [None]:
# Distinct image_id values in each of the three frames.
annotations_count, images_count, merged_count = (
    frame['image_id'].nunique() for frame in (annotationsdf, imagedf, merged_df)
)

print(f"Unique image_id in annotationsdf: {annotations_count}")
print(f"Unique image_id in imagedf: {images_count}")
print(f"Unique image_id in merged_df: {merged_count}")

# Row counts, to compare the inputs against the merged result.
annotations_rows, images_rows, merged_rows = map(len, (annotationsdf, imagedf, merged_df))

print(f"Rows in annotationsdf: {annotations_rows}")
print(f"Rows in imagedf: {images_rows}")
print(f"Rows in merged_df: {merged_rows}")

# How many merged rows have no image_id at all.
missing_image_ids = merged_df['image_id'].isnull().sum()
print(f"Missing image_id rows in merged_df: {missing_image_ids}")


In [None]:
# Outer-join images with their annotations on image_id (images first, so the
# image columns lead the result).
all_images = imagedf.merge(annotationsdf, on='image_id', how='outer')

all_images.sample(5)

In [None]:
# Time to check for any NA values!

# Boolean mask: True where the 'annotations_id' column is NA. After the outer join
# above, these are rows that have no matching annotation.
na_values = all_images['annotations_id'].isnull()

# Keep only the rows flagged by the mask and print them.
missing_values = all_images.loc[na_values]

print(missing_values)

In [None]:
# Attach image metadata to every annotation (left join on image_id), then attach
# the category record of each annotation (left join on category_id).

annotations_images = annotationsdf.merge(imagedf, on='image_id', how='left')

merged_df = annotations_images.merge(categoriesdf, on="category_id", how='left')

merged_df.sample(5)

In [None]:
# Subset the data so that we only see the IDs, the file name, and the name of the animal.
# .copy() makes shortened_df an independent frame rather than a selection of
# merged_df; later cells add columns to it, which would otherwise trigger pandas'
# SettingWithCopyWarning (assignment to a frame derived from another frame).
shortened_df = merged_df[["image_id", "category_id", "annotations_id", "file_name", "name"]].copy()
# min_df = merged_df[["annotations_id", "file_name", "name"]]

shortened_df

In [None]:
# Category names that count as forage fish (rows with these names are marked True).
# UPDATE 20/1/2025 - I removed Forage fish from the dataset (attempt to make it easier to identify other forage fish)
# UPDATE 25/1/2025 - removing forage fish left too few annotations to work with, so I'm adding them back in
forage_fish_names = [
    'Forage Fish',
    'Sch. A. personatus',
    'Ex. C. Pallasii',
    'Sch. C. Pallasii',
    'Ex. C. aggregata',
    'Sch. C. aggregata',
    'A. personatus (Pacific Sand Lance)',
    'Ex. A. personatus'
]

#UPDATE: here's one with just 'obvious' forage fish that I made:
# forage_fish_names = [
#     'Obv_ff'
# ]

# Add a boolean column 'is_forage_fish' that is True when the annotation's name is
# in the list above. assign() builds a new frame instead of writing a column into
# a frame that was derived from merged_df, which avoids SettingWithCopyWarning.
shortened_df = shortened_df.assign(
    is_forage_fish=shortened_df['name'].isin(forage_fish_names)
)

shortened_df.sample(5)

In [None]:

# Remove the rows belonging to "GOPR3002.JPG" and "GOPR3270.JPG". These images carry
# annotations of both forage fish and non-forage fish, which we don't want: we are
# classifying each image as a whole, not the individual annotations.
mixed_label_files = ['GOPR3002.JPG', 'GOPR3270.JPG']
shortened_df = shortened_df[~shortened_df['file_name'].isin(mixed_label_files)]

# Fail loudly if any of those rows are still present.
assert not shortened_df['file_name'].isin(mixed_label_files).any()

# Remove rows for specific image_id values (these images are erroneous).
# The ids are written as integers: image_id is compared against integers everywhere
# else in this notebook, and isin() with the string forms ('5993725', ...) matches
# nothing on an integer column, which turned this filter and its assert into no-ops.
# NOTE(review): confirm that the later folder-scan cell does not add these ids back
# as 'Empty' images.
erroneous_image_ids = [5993725, 5992343]
shortened_df = shortened_df[~shortened_df['image_id'].isin(erroneous_image_ids)]
# Fail loudly if any of those rows are still present.
assert not shortened_df['image_id'].isin(erroneous_image_ids).any()

shortened_df.sample(10)

In [29]:
# # Now we plot a histogram of the 'is_forage_fish' column to see the distribution of True and False values

# import matplotlib.pyplot as plt
# # Count the occurrences of True and False in the 'forage fish' column
# counts = shortened_df['is_forage_fish'].value_counts()
# counts

# # Plot a histogram
# counts.plot(kind='bar')

# # Add labels and title
# plt.xlabel('Forage Fish (True or False)')
# plt.ylabel('Frequency')
# plt.title('Proportion of False vs True in Forage Fish Column')

# # Show the plot
# plt.show()


In [None]:
"""
REVISED BELOW 29.01.25
This code chunk is Important! It checks for missing images in the images folder and adds them to the dataframe with some values.
This matters because up until this point, we've only had the images that were annotated in the dataset. We want to add all the images that we know also exist in the images folder,
but that don't have the same name as the ones we already have. These new images will be our 'empty' images.
"""

import os
import pandas as pd

images_dir = '/Users/talenrimmer/Desktop/CV4E_Code/images'

# Ids that already have at least one row in the table (set lookup instead of
# scanning the whole column once per file).
known_ids = set(shortened_df['image_id'].tolist())
# Ids that carry at least one annotation in the export. An annotated image that is
# no longer in shortened_df was removed on purpose in an earlier cell, so it must
# not come back labelled as 'Empty'.
annotated_ids = set(merged_df['image_id'].tolist())

empty_rows = []      # one record per newly found un-annotated image
skipped_files = []   # image files whose name is not a plain numeric id

for image in sorted(os.listdir(images_dir)):
    stem, extension = os.path.splitext(image)
    # Skip hidden files (e.g. .DS_Store) and anything that is not an image.
    if image.startswith('.') or extension.lower() not in ('.png', '.jpg', '.jpeg'):
        continue
    # The image id is the file name without its extension; int() would raise on
    # anything else, so record such files and move on.
    if not stem.isdigit():
        skipped_files.append(image)
        continue
    image_without_ext = int(stem)
    if image_without_ext in known_ids or image_without_ext in annotated_ids:
        continue
    known_ids.add(image_without_ext)
    # Same columns as the dataframe; the 'NaN' placeholders are plain strings.
    empty_rows.append({'image_id': image_without_ext, 'category_id': 'NaN', 'annotations_id': 'NaN', 'file_name': 'NaN', 'name': 'Empty', 'is_forage_fish': False})

if skipped_files:
    print(f"Skipped {len(skipped_files)} image file(s) without a numeric id: {skipped_files}")

# Append all new rows in ONE concat; concatenating inside the loop re-copies the
# whole frame for every image.
if empty_rows:
    shortened_df = pd.concat([shortened_df, pd.DataFrame(empty_rows)], ignore_index=True)

# View the result to check that both True and False values of 'is_forage_fish' are present.
shortened_df

In [None]:
"""
REVISED BELOW 29.01.25
This code chunk is Important! It checks for missing images in the images folder and adds them to the dataframe with some values.
This matters because up until this point, we've only had the images that were annotated in the dataset. We want to add all the images that we know also exist in the images folder, 
but that don't have the same name as the ones we already have. These new images will be our 'empty' images.
"""

import os
import pandas as pd

# # First we find the images from the images folder
# for image in os.listdir('/Users/talenrimmer/Desktop/CV4E_Code/images'):
#     image_without_ext = image.replace('.jpg', '') # remove the file extension from the image name
#     image_without_ext = int(image_without_ext)
#     # print(image_without_ext)
#     # print(type(image_without_ext))
#     # print(type(shortened_df['image_id'].values[0]))
#     if not image_without_ext in shortened_df['image_id'].values: #this says 'if the image id is not in the dataframe'
#         #add it as a row to an object with columns that are the same as the dataframe:
#         new_row = {'image_id': image_without_ext, 'category_id': 'NaN', 'annotations_id': 'NaN', 'file_name': 'NaN', 'name': 'Empty', 'is_forage_fish': False}
#         new_row = pd.DataFrame(new_row, index=[0]) #and call the new object a pandas dataframe
#     # else: 
#     #     print('False')
#     # add new row to dataframe
#         shortened_df = pd.concat([shortened_df, new_row], ignore_index=True) # Now we add the newly created dataframe to the old dataframe!

#      #now we view to make sure we have both 'true' and 'false' values for the 'is_forage_fish' column, to make sure the new rows were added correctly: 
# shortened_df

In [None]:
"""
REVISED BELOW 29.01.25
This code chunk is Important! It checks for missing images in the images folder and adds them to the dataframe with some values.
This matters because up until this point, we've only had the images that were annotated in the dataset. We want to add all the images that we know also exist in the images folder,
but that don't have the same name as the ones we already have. These new images will be our 'empty' images.
"""

import os
import pandas as pd

images_dir = '/Users/talenrimmer/Desktop/CV4E_Code/images'

# Ids that already have at least one row in the table (set lookup instead of
# scanning the whole column once per file).
known_ids = set(shortened_df['image_id'].tolist())
# Ids that carry at least one annotation in the export. An annotated image that is
# no longer in shortened_df was removed on purpose in an earlier cell, so it must
# not come back labelled as 'Empty'.
annotated_ids = set(merged_df['image_id'].tolist())

empty_rows = []      # one record per newly found un-annotated image
skipped_files = []   # image files whose name is not a plain numeric id

for image in sorted(os.listdir(images_dir)):
    # Split off the real extension: the filter below accepts .png/.jpeg/.JPG too,
    # and stripping only a literal '.jpg' would leave those names un-parseable.
    stem, extension = os.path.splitext(image)
    # Skip hidden files and non-image files
    if image.startswith('.') or extension.lower() not in ('.png', '.jpg', '.jpeg'):
        continue
    # The image id is the file name without its extension; int() would raise on
    # anything else, so record such files and move on.
    if not stem.isdigit():
        skipped_files.append(image)
        continue
    image_without_ext = int(stem)
    if image_without_ext in known_ids or image_without_ext in annotated_ids:
        continue
    known_ids.add(image_without_ext)
    # Same columns as the dataframe; the 'NaN' placeholders are plain strings.
    empty_rows.append({'image_id': image_without_ext, 'category_id': 'NaN', 'annotations_id': 'NaN', 'file_name': 'NaN', 'name': 'Empty', 'is_forage_fish': False})

if skipped_files:
    print(f"Skipped {len(skipped_files)} image file(s) without a numeric id: {skipped_files}")

# Append all new rows in ONE concat; concatenating inside the loop re-copies the
# whole frame for every image.
if empty_rows:
    shortened_df = pd.concat([shortened_df, pd.DataFrame(empty_rows)], ignore_index=True)

# View the result to check that both True and False values of 'is_forage_fish' are present.
shortened_df

In [None]:
# Overall balance of True vs False in the 'is_forage_fish' column.
import matplotlib.pyplot as plt

# How many rows are True and how many are False.
counts = shortened_df['is_forage_fish'].value_counts()

# Draw the two counts as bars, then label the axes that were just drawn on.
ax = counts.plot.bar()
ax.set_xlabel('Forage Fish (True or False)')
ax.set_ylabel('Frequency')
ax.set_title('Proportion of False vs True in Forage Fish Column')

"""
Turn on the below code if you want to see this frequency plot of the 'is_forage_fish' column
"""
# Show the plot
# plt.show()

In [None]:
# Count how many rows carry each value of the 'name' column
# (value_counts returns the counts sorted from most to least frequent).
histogram = shortened_df['name'].value_counts()

# Displayed as the cell output.
histogram
# So we can see that there are >2700 empty images in dataset

In [None]:
"""
This code chunk is important! 
It adds a new column to the dataframe that is a numerical representation of the 'is_forage_fish' column.
"""
# Numeric twin of 'is_forage_fish': the boolean is cast to an integer, so True
# becomes 1 and False becomes 0.
shortened_df['forage_num'] = shortened_df['is_forage_fish'].astype(int)

# saving shortened_df to a .csv file
# shortened_df.to_csv('shortened_df.csv')

# Sum 'forage_num' over all rows that share an image_id, and keep the result as a
# one-column dataframe indexed by image_id.
forage_fish_counts = (
    shortened_df
    .groupby('image_id')['forage_num']
    .sum()
    .to_frame()
)

"""
Here's code that can find the 5 largest values in the 'forage_fish_counts' dataframe
It's useful if you want to see the images with the most forage fish (or a different taxa) in them.
"""
# now we view the 5 max values of the forage fish counts:
# forage_fish_counts.nlargest(20)

# now we export the dataframe as a .csv file:
# forage_fish_counts.to_csv('forage_fish_counts.csv')

#and now we add this to our shortened_df
# shortened_df = pd.merge(shortened_df, forage_fish_counts, on='image_id')
# shortened_df.sample(10)

In [None]:
# Visualize the distribution of forage_num across images: drop the images whose
# count is 0, count how often each remaining value occurs, and order the bars from
# the smallest to the largest school.
nonzero_counts = forage_fish_counts.loc[forage_fish_counts['forage_num'].ne(0)]
school_size_frequency = nonzero_counts.value_counts().sort_index()
test = school_size_frequency.plot.bar(figsize=(20, 10))
# plt.xlabel('Number of Forage Fish')
# we can see that our largest school size is 2610, and our smallest is 4 fish


In [None]:
# Now let's look at our largest school size.
# A notebook cell only renders its final expression, so the intermediate lookups are
# printed explicitly; as bare expressions they were evaluated and then discarded.

# Rows of the per-image counts whose forage_num exceeds 2000.
print(forage_fish_counts[forage_fish_counts['forage_num']>2000])

# File name(s) recorded for image_id 5993932 (per the note above, the image with the
# largest school).
print(shortened_df[shortened_df['image_id']==5993932]["file_name"])

# The same lookup for image_id 5992288 (shown as the cell output).
shortened_df[shortened_df['image_id']==5992288]["file_name"]

In [35]:
#Now we look at the image with the most forage fish:
from IPython.display import Image, display

#The code below calls on the image with the largest school. BUT IMPORTANT: do NOT sync with github if you run this code, as the image is too large to upload to github.

# display(Image(filename='/Users/talenrimmer/Desktop/CV4E_Code/images/5993932.jpg'))

In [None]:
# Per-image counts restricted to images whose forage_num is not 0 (the non-empty images).
num_no0 = forage_fish_counts.loc[forage_fish_counts['forage_num'].ne(0)]

# ["image_id"]
# test
# for imagename in test:
#     print(imagename)
num_no0

In [None]:
"""
Based on the above, we saw there are some images that contained very few forage fish that I'd like to inspect. 
So, we're going to find the file names for those images
"""
# Keep only the images whose forage_num is strictly between 0 and 30.
few_fish_mask = forage_fish_counts['forage_num'].lt(30) & forage_fish_counts['forage_num'].gt(0)
forage_fish_counts_30 = forage_fish_counts.loc[few_fish_mask]

# All rows of shortened_df that belong to one of those images (the index of
# forage_fish_counts_30 holds the image_id values).
shortened_df_30 = shortened_df.loc[shortened_df['image_id'].isin(forage_fish_counts_30.index)]
shortened_df_30

#And now we can use this info to discuss with our annotators

In [38]:
# Balance the two classes: keep every image that contains forage fish and downsample
# the images without forage fish to the same number.

# Both image ids flagged as erroneous earlier in the notebook are dropped from the
# pool up front, so neither can end up in the positive OR the negative class
# (errors='ignore' keeps this a no-op when an id is already absent).
erroneous_image_ids = [5993725, 5992343]
usable_counts = forage_fish_counts.drop(index=erroneous_image_ids, errors='ignore')

# Images with at least one forage-fish annotation.
forage_fish_counts_true = usable_counts[usable_counts['forage_num']!=0]

# Downsample the images without forage fish to match the number of positives.
# random_state pins the draw (same seed as the train/val/test split) so the chosen
# images, and therefore the splits, are identical on every run. min() prevents
# sample() from raising when fewer negatives than positives are available.
false_pool = usable_counts[usable_counts['forage_num']==0]
forage_fish_counts_false = false_pool.sample(
    n=min(len(forage_fish_counts_true), len(false_pool)),
    random_state=38,
)

# "training_set" is the balanced pool: all positives plus the sampled negatives.
training_set = pd.concat([forage_fish_counts_true, forage_fish_counts_false])

# Now we concatenate the two dataframes:
# forage_fish_counts_ds = pd.concat([forage_fish_counts_true, forage_fish_counts_false])
# forage_fish_counts_ds





In [None]:
# Display the per-image counts of the images that contain forage fish (the "true" class).
forage_fish_counts_true

In [None]:
# Display the downsampled images without forage fish (the "false" class).
forage_fish_counts_false

In [41]:
#Now we create a training, test, and val set from the forage_fish_counts_true and forage_fish_counts_false datasets:

from sklearn.model_selection import train_test_split


def _split_train_temp_val_test(frame):
    """Split `frame` into train / val / test parts with a fixed seed.

    30% of the rows are held out first; one third of that hold-out becomes the
    test part and the rest the validation part. The intermediate hold-out frame
    is returned as well.
    """
    train_part, temp_part = train_test_split(frame, test_size=0.3, random_state=38)
    val_part, test_part = train_test_split(temp_part, test_size=(1/3), random_state=38)
    return train_part, temp_part, val_part, test_part


# Images with forage fish
true_train, true_temp, true_val, true_test = _split_train_temp_val_test(forage_fish_counts_true)

# Images without forage fish
false_train, false_temp, false_val, false_test = _split_train_temp_val_test(forage_fish_counts_false)

# Combine the corresponding splits from both datasets
# train = pd.concat([true_train, false_train])
# val = pd.concat([true_val, false_val])
# test = pd.concat([true_test, false_test])

# # Verify the sizes
# print(f"Train size: {len(train)}")
# print(f"Validation size: {len(val)}")
# print(f"Test size: {len(test)}")
import shutil
from pathlib import Path

# One [dataframe, split name, class name] entry per split/class combination, in the
# order train, val, test, with "ff" before "empty" inside each split.
# (This list fed the original folder-copy loop, which is commented out below.)
df_list = [
    [frame, split_type, class_type]
    for split_type, ff_frame, empty_frame in (
        ("train", true_train, false_train),
        ("val", true_val, false_val),
        ("test", true_test, false_test),
    )
    for frame, class_type in ((ff_frame, "ff"), (empty_frame, "empty"))
]
# for df, split_type, class_type in df_list:

#     for image_id in df.index.tolist():
#         # shutil.copy(f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg', f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/{split_type}/{class_type}')
#         #split type
#         #class 
#         src_img = f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg'
#         dst_img = f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/{split_type}/{class_type}'
#         Path(dst_img).mkdir(parents=True, exist_ok=True) #Checking if the destination img folder exists, if not, create it (for each ff and empty folder)
#         shutil.copy(src_img, dst_img)

In [None]:
# now create a plot to view the total number of images in the training, validation, and test sets:
# Size of each split = its forage-fish images plus its empty images.
train_size, val_size, test_size = (
    len(ff_part) + len(empty_part)
    for ff_part, empty_part in (
        (true_train, false_train),
        (true_val, false_val),
        (true_test, false_test),
    )
)
# Report the three totals.
print(f"Train size: {train_size}")
print(f"Validation size: {val_size}")
print(f"Test size: {test_size}")


# concatinate .jpg to the image_id column
# Looping through train, and add images that correspond to the image_id to a new folder (called train)
# Looping through val, and add images that correspond to the image_id to a new folder (called val)
# # Looping through test, and add images that correspond to the image_id to a new folder (called test)

# # make the train dataframe into a list
# train_list = train.index.tolist()
# val_list = val.index.tolist()
# test_list = test.index.tolist()

# import shutil

# # Looping through train, and add images that correspond to the image_id to a new folder (called train)
# for image_id in train_list:
#     shutil.copy(f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg', f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/train/ff')
#     shutil.copy(f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg', f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/train/empty')


# for image_id in val_list:
#     shutil.copy(f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg', f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/val/ff')
#     shutil.copy(f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg', f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/val/empty')

# for image_id in test_list:
#     shutil.copy(f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg', f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/test/ff')
#     shutil.copy(f'/Users/talenrimmer/Desktop/CV4E_Code/images/{image_id}.jpg', f'/Users/talenrimmer/Desktop/CV4E_Code/data_split/test/empty')

In [43]:
def _empty_annotation_file():
    """Return a fresh annotation-file skeleton: an info header plus empty lists.

    Every call builds new dict and list objects, so the three splits never share
    their "images" / "annotations" / "categories" containers.
    """
    return {
        "info": {
            "description": "ff_test_data",
            "year": 2025,
            "contributor": "Talen",
            "date_created": "2025-01-10"
        },
        "images": [],
        "annotations": [],
        "categories": []
    }


# One independent skeleton per split; they are filled in by a later cell.
train_data = _empty_annotation_file()
cis_val_data = _empty_annotation_file()
cis_test_data = _empty_annotation_file()

In [45]:

import json
import os

# Category ids used in the annotation files. Images with forage fish are annotated
# with id 1 and empty images with id 0. The categories list now declares exactly
# these two ids; it previously declared "empty" as id 2, an id that no annotation
# used, while the id 0 that the annotations do use was undeclared.
# The annotation values themselves are unchanged.
FF_CATEGORY_ID = 1
EMPTY_CATEGORY_ID = 0

categories = [{
    "id": FF_CATEGORY_ID,
    "name": "ff"
},{
    "id": EMPTY_CATEGORY_ID,
    "name": "empty"
}]


def _fill_split(split_data, ff_ids, empty_ids):
    """Fill one annotation dictionary with its image and annotation records.

    split_data: dictionary with "images", "annotations" and "categories" keys.
    ff_ids:     image ids that contain forage fish (annotated with FF_CATEGORY_ID).
    empty_ids:  image ids without forage fish (annotated with EMPTY_CATEGORY_ID).

    The lists are rebuilt from scratch, so re-running the cell does not append a
    second copy of every record. Returns split_data for convenience.
    """
    split_data["images"] = []
    split_data["annotations"] = []
    for category_id, image_ids in ((FF_CATEGORY_ID, ff_ids), (EMPTY_CATEGORY_ID, empty_ids)):
        for image_id in image_ids:
            # Images are stored on disk under their id, e.g. 5993932.jpg
            split_data["images"].append({
                "id": image_id,
                "file_name": f"{image_id}.jpg",
            })
            split_data["annotations"].append({
                "image_id": image_id,
                "category_id": category_id
            })
    split_data["categories"] = categories
    return split_data


output_dir = '/Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs'
# Create the output folder if it is missing so that open(..., 'w') cannot fail on it.
os.makedirs(output_dir, exist_ok=True)

# (dictionary to fill, frame with forage fish, frame without, output file name)
# index.tolist() yields plain Python ints, which json can serialise.
for split_data, ff_frame, empty_frame, out_name in (
    (train_data, true_train, false_train, 'train_annotations.json'),
    (cis_val_data, true_val, false_val, 'cis_val_annotations.json'),
    (cis_test_data, true_test, false_test, 'cis_test_annotations.json'),
):
    _fill_split(split_data, ff_frame.index.tolist(), empty_frame.index.tolist())
    # Save the filled dictionary as a .json file in the output folder.
    with open(os.path.join(output_dir, out_name), 'w') as outfile:
        json.dump(split_data, outfile)

In [46]:
# This code chunk copies every image listed in the 'train_data', 'cis_val_data', and
# 'cis_test_data' dictionaries into a folder called 'eccv_18_all_images_sm'.

import os
import shutil

source_dir = '/Users/talenrimmer/Desktop/CV4E_Code/images'
target_dir = '/Users/talenrimmer/Desktop/CV4E_Code/ct_classifier_outputs/eccv_18_all_images_sm'

# Create the target folder, and any missing parent folder, if it does not exist yet.
os.makedirs(target_dir, exist_ok=True)

# Remove the files left over from a previous run so the folder only ever holds the
# current selection. Sub-directories are skipped: os.remove cannot delete them.
for entry in os.listdir(target_dir):
    entry_path = os.path.join(target_dir, entry)
    if os.path.isfile(entry_path) or os.path.islink(entry_path):
        os.remove(entry_path)

# Copy each listed image. The file name stored in the record is used, so the copied
# files always agree with what the annotation files reference.
for split_data in (train_data, cis_val_data, cis_test_data):
    for image_record in split_data["images"]:
        shutil.copy(os.path.join(source_dir, image_record["file_name"]), target_dir)
    
    
# Now we rename the 'images_sm' folder to 'eccv_18_all_images_sm':
# mv /Users/talenrimmer/Desktop/CV4E_Code/images_sm /Users/talenrimmer/Desktop/CV4E_Code/eccv_18_all_images_sm