## Defining Data Quality SLAs
### Data Completeness
**Description**: Set an SLA that ensures at least 95% of the data fields in your dataset are filled (non-null values). Practice by checking a dataset of your choice and calculating its completeness.

In [9]:
# write your code from here
import pandas as pd
# Sample transactions. Four cells are deliberately left empty (one each in
# CustomerID, ProductName, Price and ShippingAddress) so the check has
# something to find.
data = {
    'TransactionID': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'CustomerID': [1, 2, None, 4, 5, 6, 7, 8, 9, 10],
    'ProductName': [
        'Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam',
        'Speakers', 'Headphones', 'Microphone', 'Printer', None,
    ],
    'Price': [1200.00, 25.00, 75.00, 300.00, 50.00, 150.00, 80.00, 60.00, None, 250.00],
    'Quantity': [1, 2, 1, 1, 3, 1, 2, 1, 1, 1],
    'OrderDate': [
        '2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03',
        '2023-01-03', '2023-01-04', '2023-01-04', '2023-01-05', '2023-01-05',
    ],
    'ShippingAddress': [
        'Address A', 'Address B', 'Address C', 'Address D', 'Address E',
        'Address F', 'Address G', None, 'Address I', 'Address J',
    ],
}
df = pd.DataFrame(data)

# SLA target: minimum share of filled (non-null) cells, in percent.
SLA_COMPLETENESS_PERCENTAGE = 95.0

# Completeness is measured over every cell of the frame: rows x columns.
total_cells = df.size
non_null_cells = df.notna().sum().sum()
completeness_percentage = (non_null_cells / total_cells) * 100
print(f"Calculated Data Completeness: {completeness_percentage:.2f}%")

# Report the verdict; on failure also show how far below the target we are.
sla_met = completeness_percentage >= SLA_COMPLETENESS_PERCENTAGE
if not sla_met:
    print(f"FAILURE: The dataset DOES NOT meet the Data Completeness SLA of {SLA_COMPLETENESS_PERCENTAGE}%.")
    print(f"Missing {SLA_COMPLETENESS_PERCENTAGE-completeness_percentage:.2f}% to meet the SLA.")
else:
    print(f"SUCCESS: The dataset meets the Data Completeness SLA of {SLA_COMPLETENESS_PERCENTAGE}%!")

Calculated Data Completeness: 94.29%
FAILURE: The dataset DOES NOT meet the Data Completeness SLA of 95.0%.
Missing 0.71% to meet the SLA.


### Data Timeliness
**Description**: Establish an SLA that specifies that data should be integrated and processed within 24 hours of acquisition. Monitor the data pipeline for timeliness.

In [10]:
# write your code from here
import pandas as pd
from datetime import datetime,timedelta
import pytz
# Every timestamp in this cell is timezone-aware and shares one zone, so the
# lag below is a plain difference of aware datetimes.
DATA_TIMEZONE = pytz.timezone('Asia/Kolkata')
current_processing_time = datetime.now(DATA_TIMEZONE)

# Simulated feed: how long before "now" each record was acquired. Ages on both
# sides of the 24-hour boundary are included on purpose.
record_ages = [
    timedelta(hours=12),
    timedelta(hours=23),
    timedelta(hours=24, minutes=1),
    timedelta(hours=5),
    timedelta(days=2),
    timedelta(hours=18),
    timedelta(hours=23, minutes=59),
    timedelta(days=1, hours=2),
    timedelta(hours=1),
    timedelta(hours=48),
]
data = {
    'RecordID': list(range(1, 11)),
    'Value': list('ABCDEFGHIJ'),
    'acquisition_time': [current_processing_time - age for age in record_ages],
}
df = pd.DataFrame(data)
# The values are already timezone-aware; this just guarantees a datetime column.
df['acquisition_time'] = pd.to_datetime(df['acquisition_time'])

# SLA: a record must be processed no later than this long after acquisition.
SLA_TIMELINESS_HOURS = 24
SLA_TIMELINESS_THRESHOLD = timedelta(hours=SLA_TIMELINESS_HOURS)

# Lag per record, and whether it is within the allowed window (inclusive).
df['processing_time'] = current_processing_time
df['processing_lag'] = df['processing_time'] - df['acquisition_time']
df['is_compliant'] = df['processing_lag'] <= SLA_TIMELINESS_THRESHOLD

compliant_records_count = df['is_compliant'].sum()
total_records = len(df)
compliance_percentage = (compliant_records_count / total_records) * 100

# Three-way verdict: full compliance, high-but-imperfect compliance, or failure
# (in which case the offending records are listed).
if compliance_percentage >= 100:
    print(f"SUCCESS: All records meet the Data Timeliness SLA of {SLA_TIMELINESS_HOURS} hours!")
elif compliance_percentage > 95:
    print(f"WARNING: Some records ({100-compliance_percentage:.2f}%) are non-compliant, but overall compliance is high.")
else:
    print(f"FAILURE: The Data Timeliness SLA of {SLA_TIMELINESS_HOURS} hours is NOT met.")
    print("Non-compliant records details:")
    report_columns = ['RecordID', 'acquisition_time', 'processing_time', 'processing_lag']
    print(df.loc[~df['is_compliant'], report_columns])

FAILURE: The Data Timeliness SLA of 24 hours is NOT met.
Non-compliant records details:
   RecordID                 acquisition_time                  processing_time  \
2         3 2025-05-23 13:20:43.911572+05:30 2025-05-24 13:21:43.911572+05:30   
4         5 2025-05-22 13:21:43.911572+05:30 2025-05-24 13:21:43.911572+05:30   
7         8 2025-05-23 11:21:43.911572+05:30 2025-05-24 13:21:43.911572+05:30   
9        10 2025-05-22 13:21:43.911572+05:30 2025-05-24 13:21:43.911572+05:30   

   processing_lag  
2 1 days 00:01:00  
4 2 days 00:00:00  
7 1 days 02:00:00  
9 2 days 00:00:00  


### Data Consistency
**Description**: Define an SLA for maintaining consistency across various related datasets. Implement a check to ensure that at least 99% of data entries are consistent.

In [None]:
# write your code from here
import pandas as pd

# --- 1. Simulate Related Datasets ---
print("1. Simulating Related Datasets:")

# Dataset 1: Products (Master Data)
products_data = {
    'product_id': [101, 102, 103, 104, 105],
    'product_name': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Peripherals']
}
df_products = pd.DataFrame(products_data)
# Print the frame on its own: passing it as a second argument to print() puts a
# separator space in front of the header row and misaligns it with the data rows.
print("Products Dataset (df_products):")
print(df_products)
print("-" * 30)
# Dataset 2: Orders (Transactional Data - dependent on Products)
# We'll intentionally introduce some inconsistent product_ids
orders_data = {
    'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'product_id': [101, 102, 103, 101, 104, # Consistent
                   105, 102, 103, 999, 101, # 999 is inconsistent
                   104, 105, 102, 103, 101, # Consistent
                   888, 104, 105, 102, 103], # 888 is inconsistent
    'quantity': [1, 2, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1],
    'customer_id': [1, 1, 2, 3, 2, 4, 1, 5, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
}
df_orders = pd.DataFrame(orders_data)
print("Orders Dataset (df_orders):")
print(df_orders)
print(f"\nTotal records in Orders Dataset: {len(df_orders)}")
print("-" * 50)
# --- 2. Define the Data Consistency SLA ---
SLA_CONSISTENCY_PERCENTAGE = 99.0
print(f"2. Defined Data Consistency SLA: {SLA_CONSISTENCY_PERCENTAGE}% of 'product_id' in orders must exist in products.")
print("-" * 50)

# --- 3. Implement Consistency Check ---


def check_referential_consistency(keys, valid_keys):
    """Measure how many entries of ``keys`` appear in ``valid_keys``.

    Args:
        keys: pandas Series holding the key values to validate.
        valid_keys: list-like of values that count as valid (the master list).

    Returns:
        A tuple ``(is_consistent, percentage)``. ``is_consistent`` is a boolean
        Series aligned with ``keys`` that is True where the value is found in
        ``valid_keys``. ``percentage`` is the share of True entries, as a float
        between 0 and 100. When ``keys`` is empty the result is 100.0: no entry
        violates the rule, and the ratio would otherwise divide by zero.
    """
    is_consistent = keys.isin(valid_keys)
    total = len(is_consistent)
    if total == 0:
        return is_consistent, 100.0
    return is_consistent, (int(is_consistent.sum()) / total) * 100


print("3. Implementing Consistency Check: Checking if 'product_id' in orders exists in products.")

# Get all unique product_ids from the products dataset (master list)
valid_product_ids = df_products['product_id'].unique()

# Flag each order: True when its product_id exists in the master list
df_orders['is_consistent'], consistency_percentage = check_referential_consistency(
    df_orders['product_id'], valid_product_ids
)

consistent_records_count = int(df_orders['is_consistent'].sum())
total_records_to_check = len(df_orders)

# Orders whose product_id has no match in the products dataset
inconsistent_records = df_orders[~df_orders['is_consistent']]

print(f"Total records in Orders to check: {total_records_to_check}")
print(f"Consistent records found: {consistent_records_count}")
print(f"Inconsistent records found: {total_records_to_check - consistent_records_count}")
print(f"Calculated Data Consistency: {consistency_percentage:.2f}%")
print("-" * 50)

# --- 4. Check Compliance with SLA ---
print("4. Checking Compliance with SLA:")
if consistency_percentage >= SLA_CONSISTENCY_PERCENTAGE:
    print(f"SUCCESS: The dataset meets the Data Consistency SLA of {SLA_CONSISTENCY_PERCENTAGE}%!")
else:
    print(f"FAILURE: The dataset DOES NOT meet the Data Consistency SLA of {SLA_CONSISTENCY_PERCENTAGE}%.")
    print(f"  Calculated consistency: {consistency_percentage:.2f}%")
    print(f"  Missing {SLA_CONSISTENCY_PERCENTAGE - consistency_percentage:.2f}% to meet the SLA.")
    print("\nDetails of Inconsistent Records (Orders with invalid product_id):")
    print(inconsistent_records[['order_id', 'product_id', 'is_consistent']])

print("-" * 50)

1. Simulating Related Datasets:
Products Dataset (df_products):
    product_id product_name     category
0         101       Laptop  Electronics
1         102        Mouse  Electronics
2         103     Keyboard  Electronics
3         104      Monitor  Electronics
4         105       Webcam  Peripherals
------------------------------
Orders Dataset (df_orders):
     order_id  product_id  quantity  customer_id
0          1         101         1            1
1          2         102         2            1
2          3         103         1            2
3          4         101         1            3
4          5         104         3            2
5          6         105         1            4
6          7         102         2            1
7          8         103         1            5
8          9         999         1            3
9         10         101         1            4
10        11         104         1            1
11        12         105         1            2
12        1