<a href="https://colab.research.google.com/github/P-Brundha/info/blob/main/23BIT012_Incrementaldataprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# 🔁 Change Data Capture (CDC) + Incremental Learning Pipeline
# ============================================================

!pip install -q pandas scikit-learn joblib

# ------------------------------------------------------------
# 1️⃣ Step 1: Create Initial Dataset
# ------------------------------------------------------------
import pandas as pd
import random

# Generate synthetic customer data
records = 100
dataset = pd.DataFrame({
    "cust_id": range(1, records + 1),
    "txn_amount": [round(random.uniform(10, 500), 2) for _ in range(records)],
    "age": [random.randint(18, 70) for _ in range(records)],
    "loyalty": [random.randint(0, 100) for _ in range(records)]
})

base_file = "/content/customers_base.csv"
dataset.to_csv(base_file, index=False)
print("✅ Base dataset created and stored at:", base_file)
print(dataset.head())

# ------------------------------------------------------------
# 2️⃣ Step 2: Train Initial Incremental Model
# ------------------------------------------------------------
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from joblib import dump, load

# Split features and label
X = dataset[["age", "loyalty"]]
y = dataset["txn_amount"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Initialize and train incremental model
regressor = SGDRegressor(max_iter=1000, tol=1e-3, random_state=10)
regressor.partial_fit(X_train, y_train)

print("\n📘 Initial training complete.")
print("🔹 Model test accuracy:", round(regressor.score(X_test, y_test), 4))

# Save the model
dump(regressor, "/content/regressor_v1.joblib")
print("💾 Model saved successfully as /content/regressor_v1.joblib")

# ------------------------------------------------------------
# 3️⃣ Step 3: Simulate CDC Events (Insert, Update, Delete)
# ------------------------------------------------------------
import os

cdc_path = "/content/cdc_updates"
os.makedirs(cdc_path, exist_ok=True)

# Create CDC event data
cdc_events = [
    {"cust_id": 101, "txn_amount": 220.0, "age": 32, "loyalty": 55},  # new insert
    {"cust_id": 8, "txn_amount": 410.0, "age": 28, "loyalty": 65},   # update existing
    {"cust_id": 15, "txn_amount": 0.0, "age": 45, "loyalty": 12},    # delete record
]

# Save each CDC event as a separate file
for idx, ev in enumerate(cdc_events, start=1):
    pd.DataFrame([ev]).to_csv(f"{cdc_path}/cdc_event_{idx}.csv", index=False)
    print(f"📄 Event file generated: cdc_event_{idx}.csv")

# ------------------------------------------------------------
# 4️⃣ Step 4: Apply CDC Changes and Retrain Model Incrementally
# ------------------------------------------------------------
import glob

# Reload dataset and model
data = pd.read_csv(base_file)
model = load("/content/regressor_v1.joblib")

# Process CDC files sequentially
cdc_files = sorted(glob.glob(f"{cdc_path}/*.csv"))

for event_file in cdc_files:
    cdc_data = pd.read_csv(event_file)
    print(f"\n🌀 Processing {os.path.basename(event_file)}")

    for _, rec in cdc_data.iterrows():
        # DELETE operation
        if rec["txn_amount"] == 0:
            data = data[data.cust_id != rec["cust_id"]]
            print(f"❌ Removed cust_id {rec['cust_id']}")
        # UPDATE operation
        elif rec["cust_id"] in data["cust_id"].values:
            data.loc[data.cust_id == rec["cust_id"], ["txn_amount","age","loyalty"]] = \
                rec[["txn_amount","age","loyalty"]].values
            print(f"♻️ Updated cust_id {rec['cust_id']}")
        # INSERT operation
        else:
            data = pd.concat([data, pd.DataFrame([rec])], ignore_index=True)
            print(f"🆕 Inserted cust_id {rec['cust_id']}")

    # Incrementally retrain model
    X_new = cdc_data[["age", "loyalty"]]
    y_new = cdc_data["txn_amount"]
    model.partial_fit(X_new, y_new)
    print(f"✅ Model updated with {os.path.basename(event_file)}")

# ------------------------------------------------------------
# 5️⃣ Step 5: Save Updated Model & Dataset
# ------------------------------------------------------------
updated_data_file = "/content/customers_updated.csv"
updated_model_file = "/content/regressor_v2.joblib"

data.to_csv(updated_data_file, index=False)
dump(model, updated_model_file)

print("\n🚀 Incremental learning process completed.")
print(f"📂 Updated dataset saved at: {updated_data_file}")
print(f"🧠 Updated model saved at: {updated_model_file}")

# ------------------------------------------------------------
# 6️⃣ Step 6: Display Final Output Summary
# ------------------------------------------------------------
print("\n📊 Final Dataset (last 10 entries):")
print(data.tail(10))

final_model = load(updated_model_file)
print("\n⚙️ Model coefficients:", final_model.coef_)


✅ Base dataset created and stored at: /content/customers_base.csv
   cust_id  txn_amount  age  loyalty
0        1      212.33   28       81
1        2      142.28   30       45
2        3      455.66   43       46
3        4      186.14   31       98
4        5      433.72   65       85

📘 Initial training complete.
🔹 Model test accuracy: -3.335950266043837e+22
💾 Model saved successfully as /content/regressor_v1.joblib
📄 Event file generated: cdc_event_1.csv
📄 Event file generated: cdc_event_2.csv
📄 Event file generated: cdc_event_3.csv

🌀 Processing cdc_event_1.csv
🆕 Inserted cust_id 101.0
✅ Model updated with cdc_event_1.csv

🌀 Processing cdc_event_2.csv
♻️ Updated cust_id 8.0
✅ Model updated with cdc_event_2.csv

🌀 Processing cdc_event_3.csv
❌ Removed cust_id 15.0
✅ Model updated with cdc_event_3.csv

🚀 Incremental learning process completed.
📂 Updated dataset saved at: /content/customers_updated.csv
🧠 Updated model saved at: /content/regressor_v2.joblib

📊 Final Dataset (last 10 en