In [None]:
import getpass
import os
import urllib
import urllib.parse
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine

# --- SQL Server connection settings ---------------------------------------
# Every setting can be overridden through an environment variable so the
# notebook runs on another machine without editing this cell.
# Raw string: the backslash separates the host from the instance name and
# must not be read as an escape sequence.
server = os.environ.get('MSSQL_SERVER', r'DESKTOP-HJVSCEN\MSSQLSERVER1')
database = os.environ.get('MSSQL_DATABASE', 'Python ETL')
username = os.environ.get('MSSQL_USERNAME', 'sa')
# The password is never stored in the notebook: it is read from the
# environment, with an interactive prompt as the fallback.
password = os.environ.get('MSSQL_PASSWORD') or getpass.getpass(
    f'SQL Server password for {username}: '
)

# ODBC connection string. The doubled braces render as literal { } around the
# driver name. TrustServerCertificate=yes is kept from the original setup;
# NOTE(review): confirm this is only used against a local/dev instance.
ConnectionString = f"""
    DRIVER={{ODBC Driver 18 for SQL Server}};
    SERVER={server};
    DATABASE={database};
    UID={username};
    PWD={password};
    TrustServerCertificate=yes;
"""
# URL-encode the connection string so it can be passed as the
# `odbc_connect` query parameter of the SQLAlchemy URL.
params = urllib.parse.quote_plus(ConnectionString)

# Engine shared by every read_sql / to_sql call in this notebook.
engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")


## Step 1: Create and Upload Initial Dimension Table

In this step, we will create a simple customer dimension dataset which we will later use to apply all Slowly Changing Dimension (SCD) types.

### Table Name:
`customers_base`

### Columns:
- **CustomerID** (int): Unique customer identifier (used as the business key; note that `to_sql` does not create a primary-key constraint on the table)
- **Name** (varchar): Name of the customer
- **City** (varchar): Current city of the customer
- **Email** (varchar): Customer’s email address
- **LastUpdated** (datetime): The date when the record was last updated

### Objective:
We will:
1. Create a dummy `DataFrame` in Python representing our dimension table.
2. Insert it into SQL Server using SQLAlchemy.
3. Treat this as our **original dimension table**.

This table will remain **unchanged** throughout all SCD operations so that we can reuse it for comparison and versioning in each SCD implementation.


In [5]:
# Seed rows for the customer dimension: one tuple per customer, with values
# in the same order as `_customer_fields`.
_customer_fields = ('customerid', 'name', 'city', 'email', 'lastupdated')
_customer_rows = (
    (101, 'Tanuj', 'Hyderabad', 'rangatanuj@gmail.com', datetime(2025, 1, 20)),
    (102, 'Meenu', 'Hyderabad', 'meenu@gmail.com', datetime(2025, 2, 22)),
    (103, 'John', 'Pune', 'john@gmail.com', datetime(2025, 3, 24)),
    (104, 'Smrithi', 'Mumbai', 'smrithi@gmail.com', datetime(2025, 4, 26)),
    (105, 'Chiru', 'Banglore', 'chiru@gmail.com', datetime(2025, 5, 28)),
)

# Turn each tuple into a {column: value} record and build the frame from them.
data = pd.DataFrame([dict(zip(_customer_fields, row)) for row in _customer_rows])

# Plain-text dump of the column dtypes, then a preview of the frame.
print(data.dtypes)
data.head()

customerid              int64
name                   object
city                   object
email                  object
lastupdated    datetime64[ns]
dtype: object


Unnamed: 0,customerid,name,city,email,lastupdated
0,101,Tanuj,Hyderabad,rangatanuj@gmail.com,2025-01-20
1,102,Meenu,Hyderabad,meenu@gmail.com,2025-02-22
2,103,John,Pune,john@gmail.com,2025-03-24
3,104,Smrithi,Mumbai,smrithi@gmail.com,2025-04-26
4,105,Chiru,Banglore,chiru@gmail.com,2025-05-28


In [6]:
# Write the seed frame to the `customers_base` table, replacing the table if
# it already exists; the DataFrame index is not written as a column.
data.to_sql(
    name='customers_base',
    con=engine,
    if_exists='replace',
    index=False,
)

5

## SCD Type 0 – Retain Original Records (No Changes Allowed)

**Definition:**  
SCD Type 0 is the simplest form of slowly changing dimension where **no changes are ever applied** to the existing records in the dimension table. The original data is considered historically accurate and must remain **unchanged** regardless of incoming data updates.

### Use Case:
This method is useful when a particular column must never change—typically identifiers or historical facts such as:
- Date of Birth
- Original Customer Signup Location
- Social Security Numbers

### Business Rule:
- If any changes are detected in the incoming dataset for a record already present in the dimension table, the changes are **ignored completely**.
- Only **new records** (i.e., with new primary keys) are inserted.

### Technique:
1. Load the existing dimension data from the table `customers_base`.
2. Load a new or changed dataset representing updated customer information.
3. Compare the incoming data against the existing data on `CustomerID`.
4. Identify records in the incoming dataset where `CustomerID` exists **but other fields have changed**.
5. **Do nothing** for those matched-but-different records.
6. **Insert** only the new records (with `CustomerID` not present in `customers_base`).

### Summary:
- Existing records: **Retained as-is**
- Changed records: **Ignored**
- New records: **Inserted**

This ensures the integrity of the original dataset without overwriting or archiving historical changes.


In [7]:
# Load the current contents of the dimension table into a DataFrame.
base_table_query = 'select * from customers_base'
existing_df = pd.read_sql(base_table_query, con=engine)
existing_df

Unnamed: 0,customerid,name,city,email,lastupdated
0,101,Tanuj,Hyderabad,rangatanuj@gmail.com,2025-01-20
1,102,Meenu,Hyderabad,meenu@gmail.com,2025-02-22
2,103,John,Pune,john@gmail.com,2025-03-24
3,104,Smrithi,Mumbai,smrithi@gmail.com,2025-04-26
4,105,Chiru,Banglore,chiru@gmail.com,2025-05-28


In [9]:
# A list of booleans can be used as a row mask: the rows whose flag is True
# are kept, the others are dropped. The list needs one flag per row.
keep_row_flags = [True, False, True, False, True]
existing_df[keep_row_flags]

Unnamed: 0,customerid,name,city,email,lastupdated
0,101,Tanuj,Hyderabad,rangatanuj@gmail.com,2025-01-20
2,103,John,Pune,john@gmail.com,2025-03-24
4,105,Chiru,Banglore,chiru@gmail.com,2025-05-28


In [25]:
# Incoming batch used to exercise SCD Type 0:
#   101 - email changed        -> change is ignored, stored row stays as-is
#   102 - identical to stored  -> nothing to do
#   103 - lastupdated changed  -> change is ignored, stored row stays as-is
#   106 - new customerid       -> the only row that gets inserted
_incoming_fields = ('customerid', 'name', 'city', 'email', 'lastupdated')
_incoming_rows = (
    (101, 'Tanuj', 'Hyderabad', 'rangatanuj50003@gmail.com', datetime(2025, 1, 20)),
    (102, 'Meenu', 'Hyderabad', 'meenu@gmail.com', datetime(2025, 2, 22)),
    (103, 'John', 'Pune', 'john@gmail.com', datetime(2029, 3, 24)),
    (106, 'Jack', 'Kolkata', 'jack@email.com', datetime(2025, 6, 23)),
)

# Turn each tuple into a {column: value} record and build the frame from them.
incoming_df = pd.DataFrame([dict(zip(_incoming_fields, row)) for row in _incoming_rows])

incoming_df

Unnamed: 0,customerid,name,city,email,lastupdated
0,101,Tanuj,Hyderabad,rangatanuj50003@gmail.com,2025-01-20
1,102,Meenu,Hyderabad,meenu@gmail.com,2025-02-22
2,103,John,Pune,john@gmail.com,2029-03-24
3,106,Jack,Kolkata,jack@email.com,2025-06-23


In [26]:
# SCD Type 0: flag the incoming rows whose customerid is already stored, then
# keep only the rows that are NOT flagged, i.e. the brand-new customers.
is_known_customer = incoming_df['customerid'].isin(existing_df['customerid'])
only_new_df = incoming_df[~is_known_customer]
only_new_df

Unnamed: 0,customerid,name,city,email,lastupdated
3,106,Jack,Kolkata,jack@email.com,2025-06-23


In [27]:
# Stack the untouched existing rows on top of the new-only rows.
# Row labels are carried over from both inputs, so a label can appear twice.
frames_to_stack = [existing_df, only_new_df]
scd_0_df = pd.concat(frames_to_stack, axis=0)
scd_0_df

Unnamed: 0,customerid,name,city,email,lastupdated
0,101,Tanuj,Hyderabad,rangatanuj@gmail.com,2025-01-20
1,102,Meenu,Hyderabad,meenu@gmail.com,2025-02-22
2,103,John,Pune,john@gmail.com,2025-03-24
3,104,Smrithi,Mumbai,smrithi@gmail.com,2025-04-26
4,105,Chiru,Banglore,chiru@gmail.com,2025-05-28
3,106,Jack,Kolkata,jack@email.com,2025-06-23


In [28]:
# Persist the SCD Type 0 result to the `scd_0` table, replacing the table if
# it already exists; the DataFrame index is not written as a column.
scd_0_df.to_sql(
    name="scd_0",
    con=engine,
    if_exists='replace',
    index=False,
)

6

In [29]:
# Read the `scd_0` table back from SQL Server to check what was stored.
scd_0_query = 'select * from scd_0'
scd_0 = pd.read_sql(scd_0_query, con=engine)

In [30]:
# Display the `scd_0` frame so the stored rows can be inspected.
scd_0

Unnamed: 0,customerid,name,city,email,lastupdated
0,101,Tanuj,Hyderabad,rangatanuj@gmail.com,2025-01-20
1,102,Meenu,Hyderabad,meenu@gmail.com,2025-02-22
2,103,John,Pune,john@gmail.com,2025-03-24
3,104,Smrithi,Mumbai,smrithi@gmail.com,2025-04-26
4,105,Chiru,Banglore,chiru@gmail.com,2025-05-28
5,106,Jack,Kolkata,jack@email.com,2025-06-23
