In [1]:
# Imports
import pandas as pd
import zipfile

# Pandas options
pd.set_option('display.max_colwidth', None)

The provided dataset (../data/raw/sunburst_images.zip) contains images. <br>
We want to create our own dataset using the classifications in the dataset. <br>
Since we want to use the uncompressed intensity data, we won't be using the images. <br>
Here we create a list of all the sunbursts in the provided dataset.

In [2]:
zip_file_path = "../../data/raw/sunburst_images.zip"

with zipfile.ZipFile(zip_file_path, "r") as zip_file:
    file_names = zip_file.namelist()

# Create a DataFrame with all the file names
df = pd.DataFrame(file_names, columns=["File Name"])

# Filter out MacOS artifacts and directories
df = df[~df["File Name"].str.contains("__MACOSX")]
df = df[~df["File Name"].str.contains("DS_Store")]
df = df[~df["File Name"].str.endswith("/")]

# Extract the last folder, file name, and file extension
df["Classification"] = df["File Name"].str.split("/").str[-2]
df["File Name"] = df["File Name"].str.split("/").str[-1]
df["Extension"] = df["File Name"].str.split(".").str[-1]
df["File Name"] = df["File Name"].str.split(".").str[0]

# Extract info from the file name
df["Start"] = pd.to_datetime(df["File Name"].str.split("_").str[0], format="%Y-%m-%d %H-%M-%S")
df["End"] = pd.to_datetime(df["File Name"].str.split("_").str[1], format="%Y-%m-%d %H-%M-%S")
df["Instrument"] = df["File Name"].str.split("_").str[2:5].str.join("_")
df["Instrument"] = df["Instrument"].str.replace("-None", "")

# Reorder the columns
df = df[["Classification", "Instrument", "Start", "End", "File Name", "Extension"]]

df

Unnamed: 0,Classification,Instrument,Start,End,File Name,Extension
16,no_burst,australia_assa_02,2021-08-31 08:09:00,2021-08-31 08:10:00,2021-08-31 08-09-00_2021-08-31 08-10-00_australia_assa_02_None_no_burst,png
18,no_burst,alaska_haarp_62,2023-07-06 22:45:00,2023-07-06 22:46:00,2023-07-06 22-45-00_2023-07-06 22-46-00_alaska_haarp_62_None_no_burst,png
20,no_burst,swiss_landschlacht_01,2021-05-12 06:39:00,2021-05-12 06:40:00,2021-05-12 06-39-00_2021-05-12 06-40-00_swiss_landschlacht_01_None_no_burst,png
22,no_burst,humain_59_None,2022-02-28 06:29:00,2022-02-28 06:30:00,2022-02-28 06-29-00_2022-02-28 06-30-00_humain_59_None_no_burst,png
24,no_burst,australia_assa_02,2021-06-08 02:42:00,2021-06-08 02:43:00,2021-06-08 02-42-00_2021-06-08 02-43-00_australia_assa_02_None_no_burst,png
...,...,...,...,...,...,...
172936,2,australia_assa_02,2021-09-08 00:07:00,2021-09-08 00:08:00,2021-09-08 00-07-00_2021-09-08 00-08-00_australia_assa_02_None_2,png
172938,5,australia_assa_02,2021-10-10 02:16:00,2021-10-10 02:17:00,2021-10-10 02-16-00_2021-10-10 02-17-00_australia_assa_02_None_5,png
172940,5,australia_assa_02,2021-10-25 21:57:00,2021-10-25 21:58:00,2021-10-25 21-57-00_2021-10-25 21-58-00_australia_assa_02_None_5,png
172942,5,australia_assa_02,2021-06-03 03:08:00,2021-06-03 03:09:00,2021-06-03 03-08-00_2021-06-03 03-09-00_australia_assa_02_None_5,png


In [3]:
df.to_csv("../../data/processed/sunburst_images_predownload.csv", index=False)