In [7]:
import pandas as pd
from google.cloud import storage
import ipywidgets as widgets
from IPython.display import display, clear_output

# Google Cloud Storage setup
bucket_name = "intelli-ana-bucket"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# List all CSV files
blobs = bucket.list_blobs()
csv_files = [blob.name for blob in blobs if blob.name.endswith(".csv")]

# Read all CSVs into DataFrames
dfs = []
for file in csv_files:
    blob = bucket.blob(file)
    data = blob.download_as_bytes()
    df = pd.read_csv(io.BytesIO(data))
    df["source_file"] = file  # Track file name
    dfs.append(df)

# Find common columns dynamically
def find_common_columns(df_list):
    column_sets = [set(df.columns) for df in df_list]
    return list(set.intersection(*column_sets))

common_columns = find_common_columns(dfs)

# Dropdown for column selection
col_selector = widgets.SelectMultiple(
    options=common_columns,
    description="Columns:",
    layout={'width': 'max-content'}
)

# Dropdown for join type
join_type_selector = widgets.Dropdown(
    options=["inner", "left", "right", "outer"],
    value="inner",
    description="Join Type:"
)

# Button to trigger merging
merge_button = widgets.Button(description="Merge Files")
output_area = widgets.Output()

def merge_files(button):
    with output_area:
        clear_output(wait=True)
        selected_columns = list(col_selector.value)
        join_type = join_type_selector.value  # Get user-selected join type
        
        if selected_columns:
            print(f"\n🔄 Performing **{join_type.upper()}** join on: {selected_columns}")

            # Ensure selected columns are of the same type
            for df in dfs:
                for col in selected_columns:
                    if col in df.columns:
                        df[col] = df[col].astype(str).str.strip()

            # Merge DataFrames
            merged_df = dfs[0]
            for df in dfs[1:]:
                merged_df = pd.merge(merged_df, df, on=selected_columns, how=join_type)

            print("\n📊 Shape of Merged DataFrame:", merged_df.shape)

            # Save if merge is successful
            if not merged_df.empty:
                output_filename = f"merged_output_{join_type}.csv"
                output_blob = bucket.blob(output_filename)
                output_blob.upload_from_string(merged_df.to_csv(index=False), content_type="text/csv")
                print(f"\n✅ Merged file saved at gs://{bucket_name}/{output_filename}")
            else:
                print("\n⚠️ Merge resulted in an empty DataFrame. Check selected columns.")
        else:
            print("\n⚠️ No valid column selections made. Operation aborted.")

# Display UI
display(col_selector, join_type_selector, merge_button, output_area)
merge_button.on_click(merge_files)


SelectMultiple(description='Columns:', layout=Layout(width='max-content'), options=('source_file', 'enrollee_i…

Dropdown(description='Join Type:', options=('inner', 'left', 'right', 'outer'), value='inner')

Button(description='Merge Files', style=ButtonStyle())

Output()