In [39]:
import pandas as pd
df1  = pd.read_csv('../data/Customer_Master.csv')
df2 = pd.read_csv('../data/Customer_Updates.csv')


In [40]:
# ----------------------------
# Step 1: Prepare current df1 only
# ----------------------------
df1 = df1.copy()
df2 = df2.copy()

current_df1 = df1[df1["CurrentFlag"] == 1]

# ----------------------------
# Step 2: Find existing vs new customers
# ----------------------------
existing_ids = set(current_df1["CustomerID"])
incoming_ids = set(df2["CustomerID"])

new_ids = incoming_ids - existing_ids
existing_ids_to_update = incoming_ids & existing_ids

# ----------------------------
# Step 3: Handle updates for existing customers
# ----------------------------
merged = current_df1.merge(df2, on="CustomerID", suffixes=("_old", "_new"), how="inner")

new_records = []

for _, row in merged.iterrows():
    cust_id = row["CustomerID"]

    # --- SCD2 for Address ---
    if row["Address_old"] != row["Address_new"]:
        # Close old record
        df1.loc[(df1.CustomerID == cust_id) & (df1.CurrentFlag == 1), "CurrentFlag"] = 0

        # Insert new record with incremented version
        new_version = row["Version"] + 1
        new_records.append({
            "CustomerID": cust_id,
            "Address": row["Address_new"],
            "Email": row["Email_new"],  # keep email updated (SCD1 applied here)
            "Version": new_version,
            "CurrentFlag": 1
        })

    # --- SCD1 for Email ---
    elif row["Email_old"] != row["Email_new"]:
        df1.loc[(df1.CustomerID == cust_id) & (df1.CurrentFlag == 1), "Email"] = row["Email_new"]

# ----------------------------
# Step 4: Handle completely new customers
# ----------------------------
if new_ids:
    new_customers = df2[df2["CustomerID"].isin(new_ids)].copy()
    new_customers["Version"] = 1
    new_customers["CurrentFlag"] = 1
    new_records.extend(new_customers.to_dict(orient="records"))
# ----------------------------
# Step 5: Append new records
# ----------------------------
if new_records:
    df1 = pd.concat([df1, pd.DataFrame(new_records)], ignore_index=True)

# ----------------------------
# Final Result
# ----------------------------
df1 = df1.sort_values(["CustomerID", "Version"]).reset_index(drop=True)
print(df1)


    CustomerID FirstName LastName                 Email         Phone  \
0          101      John      Doe     john.doe@mail.com  1.234568e+09   
1          101       NaN      NaN  john.doe@newmail.com           NaN   
2          102      Jane    Smith   jane.smith@mail.com  9.876543e+09   
3          103     Alice  Johnson      alice.j@mail.com  5.551235e+09   
4          104       Bob    Brown     bob.b@newmail.com  4.449877e+09   
5          105     Carol    White      carol.w@mail.com  3.335559e+09   
6          106     David    Green      david.g@mail.com  2.224447e+09   
7          107      Emma    Black       emma.b@mail.com  1.113336e+09   
8          108     Frank     Blue      frank.b@mail.com  7.778890e+09   
9          108       NaN      NaN   frank.b@newmail.com           NaN   
10         109     Grace      Red      grace.r@mail.com  6.667779e+09   
11         110     Henry     Gold      henry.g@mail.com  5.556668e+09   
12         111       Ivy     Gray        ivy.g@mail

ValueError: Can only compare identically-labeled Series objects