In [None]:
import pandas as pd
import os

# Read the base chromophore table from disk
file_path = 'datasets/DB for chromophore_Sci_Data_rev02.csv'
df = pd.read_csv(file_path)

# Discard rows missing any of the three fields, then keep only those columns
df = df.dropna(subset=["Quantum yield", "Chromophore", "Solvent"])
df = df.loc[:, ["Chromophore", "Solvent", "Quantum yield"]]
# Retain only rows whose quantum yield is at least 0.01
df = df.loc[df["Quantum yield"] >= 0.01]
print("Base dataset size:", len(df))

# Load and process extra dataset
# The extra workbook is optional: if the file is missing, or its headers cannot
# be mapped onto Chromophore / Solvent / Quantum yield, `df` is left unchanged
# and a message is printed instead.
extra_path = 'datasets/dataset2.xlsx'
if os.path.exists(extra_path):
    df_extra_raw = pd.read_excel(extra_path)
    # Normalised header -> original header. Labels go through str() first
    # because a spreadsheet can yield non-string column labels (e.g. numeric
    # header cells), which have no .lower() method.
    cols_lower = {str(c).lower().strip(): c for c in df_extra_raw.columns}

    def pick(col_candidates):
        """Return the original header for the first candidate that is present.

        Candidates are tried in the order given and compared case-insensitively,
        ignoring surrounding whitespace. Returns None when no candidate matches.
        """
        for name in col_candidates:
            key = name.lower().strip()
            if key in cols_lower:
                return cols_lower[key]
        return None

    chrom_col = pick(["chromophore", "smiles"])
    solv_col = pick(["solvent"])
    qy_col = pick(["quantum yield", "plqy", "plq y", "qy"])

    if chrom_col and solv_col and qy_col:
        df_extra = df_extra_raw[[chrom_col, solv_col, qy_col]].rename(
            columns={chrom_col: "Chromophore", solv_col: "Solvent", qy_col: "Quantum yield"}
        )
        # Spreadsheet cells may be stored as text; coerce to numbers so the
        # threshold comparison below cannot raise TypeError on a mixed/object
        # column. Values that cannot be parsed become NaN and are removed by
        # the missing-value filter that follows.
        # NOTE(review): assumes the extra file reports quantum yield on the same
        # scale as the base dataset (not percent) — confirm against the source.
        df_extra["Quantum yield"] = pd.to_numeric(df_extra["Quantum yield"], errors="coerce")
        df_extra = df_extra.dropna(subset=["Quantum yield", "Chromophore", "Solvent"])
        # Same 0.01 lower bound that is applied to the base dataset
        df_extra = df_extra[df_extra["Quantum yield"] >= 0.01]
        print("Extra dataset rows:", len(df_extra))

        # Append the extra rows below the base rows with a fresh 0..n-1 index
        df = pd.concat([df, df_extra], ignore_index=True)
    else:
        print("dataset2.xlsx: required columns not found")
else:
    print("dataset2.xlsx not found")

# Report the size and schema of the combined frame, then preview its first rows
print(
    f"\nTotal combined dataset size: {len(df)}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)
df.head()

In [None]:
# Write the combined frame to one CSV file (row index omitted)
output_path = 'datasets/combined_chromophore_dataset.csv'
df.to_csv(output_path, index=False)

# Echo the destination together with the row count and column names
print(
    f"Saved combined dataset to: {output_path}",
    f"Total rows: {len(df)}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)