In [28]:
%reset -f
import os
from os import listdir
import re
import pandas as pd

# Input layout, relative to the parent of the current working directory:
#   <root>/src/bhowmik2025_et_al_plots/input_files/{fits_files,frank_profiles}
ROOT_PROJECT = os.path.dirname(os.getcwd())
SOURCE_PROJECT = os.path.join(ROOT_PROJECT, "src/bhowmik2025_et_al_plots")
input_dir = os.path.join(SOURCE_PROJECT, "input_files")
# Both leaf directories live directly under input_dir.
FITS_DIR, RADIAL_PROF_DIR = (
    os.path.join(input_dir, subdir) for subdir in ("fits_files", "frank_profiles")
)

In [29]:
# Fail early, with a clear message, if either input directory is missing.
# Both directories are listed further down, so both are checked here.
# os.path.isdir (rather than os.path.exists) also rejects a regular file that
# happens to sit at the expected path.
for required_dir in (FITS_DIR, RADIAL_PROF_DIR):
    if not os.path.isdir(required_dir):
        # FileNotFoundError is a subclass of Exception, so existing handlers
        # that catch Exception keep working.
        raise FileNotFoundError(f"Directory {required_dir} does not exist.")

# Build one record per FITS image (`rows`) and one per radial-profile text
# file (`rows_rad`). Both listings derive the source ("field") name from the
# file name with the same rule, so the rule lives in a single helper.

def _source_name(filename: str) -> str:
    """Return the source name encoded in the leading tokens of ``filename``.

    The name is split on runs of underscores and the leading tokens are
    re-joined with "_": three tokens when the name contains "ODISEA", one
    when it contains "RA", and two otherwise.

    NOTE(review): these are plain substring tests, so "RA" also matches
    inside a longer token. The original author flagged this rule as needing
    validation by comparing the number of rows of the resulting tables.
    """
    tokens = re.split(r"[_]+", filename)
    if "ODISEA" in filename:
        n_tokens = 3
    elif "RA" in filename:
        n_tokens = 1
    else:
        n_tokens = 2
    return "_".join(tokens[:n_tokens])


# os.listdir returns entries in arbitrary order; sorting the listings makes
# the provisional ids and the row order reproducible between runs.
rows: list = []
counter = 0
for file in sorted(os.listdir(FITS_DIR)):
    if not file.endswith(".fits"):
        continue
    rows.append({
        "id": counter,
        "field": _source_name(file),
        # A file with "frank" in its name is flagged as a model image.
        "is model": "frank" in file,
        "path": os.path.join(FITS_DIR, file),
    })
    counter += 1

rows_rad: list = []
for file_rad in sorted(os.listdir(RADIAL_PROF_DIR)):
    if not file_rad.endswith(".txt"):
        continue
    rows_rad.append({
        "field_rad": _source_name(file_rad),
        "path_rad": os.path.join(RADIAL_PROF_DIR, file_rad),
    })


# ---- Assemble the per-file table --------------------------------------
# Columns: "id" (placeholder, reassigned below), "field", "is model",
# "path" (FITS file) and "path_rad" (radial-profile file).
# The radial-profile path is attached by matching "field" to "field_rad";
# merge defaults to an inner join, so FITS rows without a matching profile
# are dropped.
fits_frame = pd.DataFrame(rows, columns=["id", "field", "is model", "path"])
radial_frame = pd.DataFrame(rows_rad)
table = (
    fits_frame
    .merge(radial_frame, left_on="field", right_on="field_rad")
    .drop("field_rad", axis=1)
    .sort_values(by=["field"])
    .reset_index(drop=True)
)
table["id"] = table.index
# NOTE(review): "field" is normalized only after the sort, so the ids follow
# the case-sensitive ordering of the raw names -- confirm this is the
# ordering the ids in table.csv were built with before changing it.
table["field"] = table["field"].str.strip().str.lower()

# ---- Split into observed data and models, then pair them row by row ----
# Each half loses the "is model" flag and gets its own 0..n-1 "id"; the
# model half stores its FITS path under "path_model".
is_model = table["is model"]
table_realdata_nomodelcol = (
    table[is_model.eq(False)]
    .drop("is model", axis=1)
    .reset_index(drop=True)
)
table_realdata_nomodelcol["id"] = table_realdata_nomodelcol.index

table_model_nomodelcol = (
    table[is_model.eq(True)]
    .drop("is model", axis=1)
    .reset_index(drop=True)
    .rename(columns={"path": "path_model"})
)
table_model_nomodelcol["id"] = table_model_nomodelcol.index

# IMPORTANT: table_nomodelcol is the predecessor of the full table, i.e. the
# state before merging with the table Trisha provided. Do not confuse them.
pairing_keys = ["field", "id", "path_rad"]
table_nomodelcol = table_realdata_nomodelcol.merge(
    table_model_nomodelcol,
    left_on=pairing_keys,
    right_on=pairing_keys,
    validate="1:1",
)

# ---- Load the table Trisha provided ------------------------------------
# "field" is trimmed and lower-cased, then the rows are ordered by it.
table_sizes = pd.read_csv(f"{input_dir}/table.csv", index_col=False)
table_sizes = (
    table_sizes
    .assign(field=table_sizes["field"].str.strip().str.lower())
    .sort_values(by=["field"])
)

########## Check for mismatches before merging ##########
# Left-join each table against the other on ("id", "field") and keep the
# rows that found no partner.

# Rows of table_nomodelcol that have no match in table_sizes
not_in_sizes = table_nomodelcol.merge(
    table_sizes,
    on=["id", "field"],
    how="left",
    indicator=True).query('_merge == "left_only"')

# Rows of table_sizes that have no match in table_nomodelcol
not_in_realdata = table_sizes.merge(
    table_nomodelcol,
    on=["id", "field"],
    how="left",
    indicator=True).query('_merge == "left_only"')

########## Merge the two tables ##########
# Stop on a mismatch. Without this, `full_table` would be left undefined and
# the code below would fail with a NameError -- or, when the cell is re-run,
# silently keep using a `full_table` left over from an earlier run.
if not (not_in_sizes.empty and not_in_realdata.empty):
    raise ValueError(
        "Mismatch between the FITS/profile table and table.csv; cannot merge.\n"
        "Rows with no match in table.csv:\n"
        f"{not_in_sizes[['id', 'field']].to_string(index=False)}\n"
        "Rows of table.csv with no matching files:\n"
        f"{not_in_realdata[['id', 'field']].to_string(index=False)}"
    )

print("There is no mismatch - It is safe to merge!!","\n", 50*"#")
full_table = pd.merge(table_nomodelcol,
                      table_sizes,
                      on=["field", "id"],
                      validate='1:1')
    
#####################################################################

#### Now fix center_x and center_y so they can be used with astropy, before
# exporting the full table:
def fix_center_x(s):
    """Turn the last of three colons in ``s`` into a dot.

    A string that does not contain exactly three colons is returned
    unchanged, e.g. "16:27:10:123" -> "16:27:10.123".
    """
    if s.count(':') != 3:
        return s
    # rpartition splits at the right-most colon only.
    head, _, tail = s.rpartition(':')
    return head + '.' + tail

def fix_center_y(s):
    """Replace the first two dots in ``s`` with colons.

    Example: "-24.30.15.123" -> "-24:30:15.123".

    A string that already contains a colon is returned unchanged. Without
    this guard an already colon-separated value would have its remaining dot
    turned into a colon ("-24:30:15.123" -> "-24:30:15:123"), and applying
    the function twice would corrupt the result.
    """
    if ':' in s:
        return s
    return s.replace('.', ':', 2)

# Rewrite both coordinate columns as strings using the matching fixer.
for coord_col, fixer in (("center_x", fix_center_x), ("center_y", fix_center_y)):
    full_table[coord_col] = full_table[coord_col].astype(str).apply(fixer)

# Classes "I" and "F" are pooled into the single label "I_F".
pooled_classes = dict.fromkeys(("I", "F"), "I_F")
full_table["Class"] = full_table["Class"].replace(pooled_classes)

# Group label = "<Stage>+<Class>", with surrounding whitespace removed.
# (A per-feature variant that split "Features" on "," and exploded it was
# sketched here previously and is not used.)
full_table["Group"] = (
    full_table["Stage"].astype(str) + "+" + full_table["Class"]
).str.strip()

# Export both tables as CSV into the current working directory.
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line instead of being passed to print() (which printed "None").
try:
    table.to_csv(os.path.join(os.getcwd(), "fits_files.csv"), index=False)
    print(50*"#", "\n", "Saved fits_files.csv successfully!")
    table.info(verbose=True)
    print(50*"#")

    full_table.to_csv(os.path.join(os.getcwd(), "full_table.csv"), index=False)
    print(50*"#", "\n", "Saved full_table.csv successfully!")
    full_table.info(verbose=True)
    print(50*"#")

except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print("Error:", e)
# full_table

There is no mismatch - It is safe to merge!! 
 ##################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        202 non-null    int64 
 1   field     202 non-null    object
 2   is model  202 non-null    bool  
 3   path      202 non-null    object
 4   path_rad  202 non-null    object
dtypes: bool(1), int64(1), object(3)
memory usage: 6.6+ KB
################################################## 
 Saved table.csv successfully! None 
 ##################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    101 non-null    int64  
 1   field                 101 non-null    object 
 2   path              