# In [2]:
import pandas as pd
from google.cloud import storage
import io

# Set up Google Cloud Storage client
bucket_name = "intelli-ana-bucket"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# List all CSV files in the GCS bucket.
# The merge step further down uploads "merged_output.csv" into this same
# bucket, so that object is skipped here; otherwise a re-run would read the
# previous merge result back in as if it were one of the inputs.
blobs = bucket.list_blobs()
csv_files = [
    blob.name
    for blob in blobs
    if blob.name.endswith(".csv") and blob.name != "merged_output.csv"
]

# Read all CSVs into DataFrames (each object is downloaded fully into memory
# and parsed from an in-memory buffer).
dfs = []
for file in csv_files:
    data = bucket.blob(file).download_as_bytes()
    df = pd.read_csv(io.BytesIO(data))
    df["source_file"] = file  # Add filename as a column for tracking
    dfs.append(df)

# Function to find common columns dynamically
def find_common_columns(df_list):
    """Return the column labels shared by every frame in *df_list*.

    Args:
        df_list: Iterable of objects exposing a ``columns`` attribute
            (pandas DataFrames in this script).

    Returns:
        A ``set`` of the labels present in every frame, or ``None`` when the
        frames share no column or when *df_list* is empty.
    """
    column_sets = [set(df.columns) for df in df_list]
    if not column_sets:
        # set.intersection() called with no arguments raises TypeError;
        # an empty input simply has no common columns.
        return None
    common_cols = set.intersection(*column_sets)  # Intersection of all column sets
    return common_cols if common_cols else None

# Merge only if common columns exist
if len(dfs) > 1:
    common_columns = find_common_columns(dfs)

    # "source_file" is added to every frame when it is loaded, so it is always
    # reported as common. Its value differs per file, so joining on it can
    # never match any rows. Leave it out of the join keys, keeping the column
    # order of the first frame so the key list is deterministic.
    shared = common_columns or set()
    join_columns = [
        col for col in dfs[0].columns if col in shared and col != "source_file"
    ]

    if join_columns:
        # Give each frame's tracking column a distinct name so the provenance
        # of every input survives the merge without suffix collisions.
        # rename() returns a new frame, so the entries of dfs are not modified.
        merged_df = dfs[0].rename(columns={"source_file": "source_file_0"})
        for idx, df in enumerate(dfs[1:], start=1):
            right = df.rename(columns={"source_file": f"source_file_{idx}"})
            merged_df = pd.merge(merged_df, right, on=join_columns, how="inner")  # Join on common columns

        # Save merged output back to GCS
        output_filename = "merged_output.csv"
        output_blob = bucket.blob(output_filename)
        output_blob.upload_from_string(merged_df.to_csv(index=False), content_type="text/csv")

        print(f"Merged file saved at gs://{bucket_name}/{output_filename}")
    else:
        print("No common columns found, skipping merge.")
else:
    print("Not enough CSV files to merge.")


# Output: Not enough CSV files to merge.
