<a href="https://colab.research.google.com/github/SamudralaAjaykumarrr/Dataset-Profiler-Automated-Report/blob/main/Dataset_Profiler_%2B_Automated_Report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
import json

# Every artifact this notebook writes (the two CSV snapshots and the JSON
# report) is placed under this directory.
OUTPUT_DIR = "/content/data_quality_outputs"
# exist_ok=True keeps the cell re-runnable once the directory already exists.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

print("Environment ready. No installations needed.")


Environment ready. No installations needed.


In [None]:
import numpy as np
import pandas as pd

# Draw all synthetic data from one explicitly seeded generator so that
# "Restart & Run All" reproduces the same CSVs and report numbers every time.
SEED = 42
N_ROWS = 1000
rng = np.random.default_rng(SEED)

# Baseline ("old") dataset
old_df = pd.DataFrame({
    "feature1": rng.normal(50, 10, N_ROWS),
    "feature2": rng.uniform(0, 1, N_ROWS),
    "feature3": rng.integers(0, 100, N_ROWS),  # upper bound is exclusive
})

# Comparison ("new") dataset: only feature1 is shifted, the other two columns
# are drawn from the same distributions as the baseline.
new_df = pd.DataFrame({
    "feature1": rng.normal(52, 11, N_ROWS),   # small drift: mean 50 -> 52, std 10 -> 11
    "feature2": rng.uniform(0, 1, N_ROWS),
    "feature3": rng.integers(0, 100, N_ROWS),
})

# index=False keeps the row index out of the files so they contain only the features.
old_df.to_csv(f"{OUTPUT_DIR}/old_data.csv", index=False)
new_df.to_csv(f"{OUTPUT_DIR}/new_data.csv", index=False)

print("Generated old_data.csv and new_data.csv successfully.")
old_df.head()


Generated old_data.csv and new_data.csv successfully.


Unnamed: 0,feature1,feature2,feature3
0,56.031232,0.965909,59
1,53.059681,0.560743,55
2,59.856432,0.634756,98
3,54.725051,0.18348,83
4,50.933473,0.471874,65


In [None]:
# Read both snapshots back from disk so the checks below run on exactly what
# was persisted.
file_old = f"{OUTPUT_DIR}/old_data.csv"
file_new = f"{OUTPUT_DIR}/new_data.csv"

old_df, new_df = (pd.read_csv(csv_path) for csv_path in (file_old, file_new))

print("Files loaded successfully.\nShapes:")
for label, frame in (("Old:", old_df), ("New:", new_df)):
    print(label, frame.shape)


Files loaded successfully.
Shapes:
Old: (1000, 3)
New: (1000, 3)


In [None]:
# Accumulates every data-quality result; keys are the check names.
report = {}

# 1. Schema check: True only when both frames have the same column names in
#    the same order.
schema_match = list(old_df.columns) == list(new_df.columns)
report["schema_match"] = schema_match

# 2. Missing values: per-column null counts for each snapshot.
report["missing_values_old"] = old_df.isnull().sum().to_dict()
report["missing_values_new"] = new_df.isnull().sum().to_dict()

# 3. Mean drift check
# numeric_only=True: a mean is undefined for text columns, and recent pandas
# raises on them instead of silently skipping.
mean_old = old_df.mean(numeric_only=True).to_dict()
mean_new = new_df.mean(numeric_only=True).to_dict()

# Drift (new - old) is only defined for columns present in both snapshots.
# A column missing from one side is already surfaced by schema_match, so it
# must not abort the report with a KeyError here.
mean_drift = {
    col: mean_new[col] - mean_old[col]
    for col in mean_old
    if col in mean_new
}

report["mean_old"] = mean_old
report["mean_new"] = mean_new
report["mean_drift"] = mean_drift

# 4. Outlier detection
def detect_outliers(df):
    """Count IQR-rule outliers in each numeric column of ``df``.

    A value counts as an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the column's 25th/75th percentiles.
    Missing values are never counted because comparisons against NaN are False.

    Non-numeric columns are skipped: quantiles are not defined for them and
    asking for one raises instead of returning a count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    dict
        Maps each numeric column name to its outlier count as a plain ``int``
        (so the result is JSON-serializable).
    """
    outlier_report = {}
    numeric_df = df.select_dtypes(include="number")
    for col, values in numeric_df.items():
        q1, q3 = values.quantile([0.25, 0.75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        outliers = ((values < lower) | (values > upper)).sum()
        # Cast the numpy integer to a built-in int for clean JSON output.
        outlier_report[col] = int(outliers)
    return outlier_report

# Apply the same outlier count to both snapshots and store the per-column
# results next to the other checks.
report.update(
    outliers_old=detect_outliers(old_df),
    outliers_new=detect_outliers(new_df),
)

print("Data quality tests completed.")
report


Data quality tests completed.


{'schema_match': True,
 'missing_values_old': {'feature1': 0, 'feature2': 0, 'feature3': 0},
 'missing_values_new': {'feature1': 0, 'feature2': 0, 'feature3': 0},
 'mean_old': {'feature1': 50.249657558649965,
  'feature2': 0.49648717223379074,
  'feature3': 50.482},
 'mean_new': {'feature1': 52.18208769177745,
  'feature2': 0.5086266114114796,
  'feature3': 49.546},
 'mean_drift': {'feature1': 1.9324301331274825,
  'feature2': 0.012139439177688838,
  'feature3': -0.9359999999999999},
 'outliers_old': {'feature1': 6, 'feature2': 0, 'feature3': 0},
 'outliers_new': {'feature1': 10, 'feature2': 0, 'feature3': 0}}

In [None]:
report_path = f"{OUTPUT_DIR}/dq_report.json"

with open(report_path, "w") as f:
    json.dump(report, f, indent=4)

print("Saved report to:", report_path)

# List generated files
!ls -lh /content/data_quality_outputs


Saved report to: /content/data_quality_outputs/dq_report.json
total 84K
-rw-r--r-- 1 root root 842 Nov 16 15:42 dq_report.json
-rw-r--r-- 1 root root 40K Nov 16 15:41 new_data.csv
-rw-r--r-- 1 root root 40K Nov 16 15:41 old_data.csv
