In [1]:
from data_integrity import *



In [2]:
# Input files and the name of the target column used by the cells below.
train_path, test_path = "train.csv", "test.csv"
label = "label"

# Optional settings, currently disabled:
# model_file = "model.pkl"
# prediction_label_column = ""

In [3]:
# List every registered check with the 1-based number the user types
# in the selection cell.
print("\nAvailable Data Integrity Checks:")
for position, check_name in enumerate(check_options, start=1):
    print(f"{position}. {check_name}")


Available Data Integrity Checks:
1. Is Single Value
2. Special Characters
3. Mixed Nulls
4. Mixed Data Types
5. String Mismatch
6. Data Duplicates
7. String Length Out Of Bounds
8. Conflicting Labels
9. Outlier Sample Detection
10. Feature Label Correlation
11. Feature Feature Correlation
12. Identifier Label Correlation


In [4]:
# User selects which checks to run (1-based numbers, comma-separated).
available_checks = list(check_options.keys())
raw_selection = input("Enter check numbers to run (comma-separated, e.g., 1,3,5): ")

selected_indices = []
ignored_entries = []
for token in raw_selection.split(','):
    token = token.strip()
    if not token:
        # Tolerate stray commas such as "1,,2" or a trailing comma.
        continue
    # isdecimal() only accepts characters int() can parse. The range check
    # rejects 0 (which would become index -1 and silently pick the LAST
    # check) and numbers past the end of the menu (IndexError).
    if token.isdecimal() and 1 <= int(token) <= len(available_checks):
        selected_indices.append(int(token) - 1)
    else:
        ignored_entries.append(token)

if ignored_entries:
    print(f"Ignoring invalid selections: {ignored_entries}")

# Map zero-based indices to selected check names
selected_tests = [available_checks[i] for i in selected_indices]
selected_tests

Enter check numbers to run (comma-separated, e.g., 1,3,5):  1,2


['Is Single Value', 'Special Characters']

In [5]:
# Data integrity checks that take user-supplied condition arguments,
# keyed by the check's display name.
# In run_deepchecks each inner dict is unpacked as keyword arguments into the
# matching check's add_condition_* call, so the inner keys must be valid
# argument names for that call. "Is Single Value" takes no arguments and
# therefore has no entry here.
checks_with_params = {
    "Special Characters": {"max_ratio": 0.001},
    "Mixed Nulls": {"max_allowed_null_types": 1},
    "Mixed Data Types": {"ratio_range": (0.01, 0.1)},
    "String Mismatch": {"num_max_variants": 5},
    "Data Duplicates": {"max_ratio": 0.05},
    "String Length Out Of Bounds": {"max_outliers": 0},
    "Conflicting Labels": {"max_ratio": 0.0},
    "Outlier Sample Detection": {
        "max_outliers_ratio": 0.1,
        "outlier_score_threshold": 5,
    },
    "Feature Label Correlation": {"threshold": 0.8},
    "Feature Feature Correlation": {"threshold": 0.6},
    "Identifier Label Correlation": {"max_pps": 0},
}



In [6]:
print("Loading data...")
# Read the train split first, then the test split.
train_df, test_df = load_csv(train_path), load_csv(test_path)

# Run the user-selected checks with their configured condition arguments.
# The call is the cell's last expression, so its return value is displayed.
run_data_integrity(
    train_df,
    test_df,
    label,
    selected_tests,
    checks_with_params,
)



Loading data...
Loaded train.csv with shape: (1000, 6)
Loaded test.csv with shape: (1000, 6)


Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_PBCR8Z9PU9VG0QBYP0WA1ULDM">Custom Data Integr…

In [None]:
import deepchecks
import pandas as pd
from deepchecks.tabular import Suite
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import (
    IsSingleValue, SpecialCharacters, MixedNulls, MixedDataTypes, StringMismatch, DataDuplicates,
    StringLengthOutOfBounds, ConflictingLabels, OutlierSampleDetection, FeatureLabelCorrelation,
    FeatureFeatureCorrelation, IdentifierLabelCorrelation
)

# Registry of available checks: display name -> default-constructed check
# (no conditions attached). Insertion order is significant: the menu cell
# numbers the checks in this order and the selection cell indexes into it
# by position.
check_options = {
    display_name: check_class()
    for display_name, check_class in (
        ("Is Single Value", IsSingleValue),
        ("Special Characters", SpecialCharacters),
        ("Mixed Nulls", MixedNulls),
        ("Mixed Data Types", MixedDataTypes),
        ("String Mismatch", StringMismatch),
        ("Data Duplicates", DataDuplicates),
        ("String Length Out Of Bounds", StringLengthOutOfBounds),
        ("Conflicting Labels", ConflictingLabels),
        ("Outlier Sample Detection", OutlierSampleDetection),
        ("Feature Label Correlation", FeatureLabelCorrelation),
        ("Feature Feature Correlation", FeatureFeatureCorrelation),
        ("Identifier Label Correlation", IdentifierLabelCorrelation),
    )
}


def run_deepchecks(data1, data2, selected_tests=selected_tests, label=None):
    """Run the selected deepchecks data integrity checks on two datasets.

    Parameters
    ----------
    data1, data2
        Data wrapped as ``Dataset(data, label=label)`` and passed, in this
        order, to ``Suite.run``.
    selected_tests : list of str
        Display names of the checks to run. NOTE: the default is the
        module-level ``selected_tests`` *as bound when this ``def`` executes*;
        that name must already exist, and rebinding it later does not change
        the default. Pass the list explicitly to avoid surprises.
    label : str or None
        Name of the label column, or None when there is none.

    Returns
    -------
    The object returned by ``Suite.run``. It is also shown via ``.show()``
    and written to ``data_integrity_results.html``.

    Raises
    ------
    KeyError
        If a selected check needs arguments that are missing from the
        module-level ``checks_with_params``, or if a name is neither handled
        below nor present in ``check_options``.
    """
    dataset1 = Dataset(data1, label=label)
    dataset2 = Dataset(data2, label=label)

    # Build each selected check with its condition attached. Condition
    # arguments come from the module-level checks_with_params.
    selected_checks = []
    for test in selected_tests:
        if test == "String Mismatch":
            check = StringMismatch().add_condition_number_variants_less_or_equal(**checks_with_params[test])
        elif test == "Outlier Sample Detection":
            # Ignore the label column only when one is given. Previously
            # [label] was always passed, i.e. [None] when label was None.
            # NOTE(review): None is assumed to be the constructor's
            # "ignore nothing" default — confirm against the deepchecks docs.
            ignore_columns = [label] if label is not None else None
            check = OutlierSampleDetection(ignore_columns=ignore_columns).add_condition_outlier_ratio_less_or_equal(
                **checks_with_params[test]
            )
        elif test == "Feature Feature Correlation":
            # n_pairs=0 is fixed here; only the remaining arguments are
            # user-configurable through checks_with_params.
            check = FeatureFeatureCorrelation().add_condition_max_number_of_pairs_above_threshold(
                **checks_with_params[test], n_pairs=0
            )
        elif test == "Is Single Value":
            check = IsSingleValue().add_condition_not_single_value()
        elif test == "Special Characters":
            check = SpecialCharacters().add_condition_ratio_of_special_characters_less_or_equal(**checks_with_params[test])
        elif test == "Mixed Nulls":
            check = MixedNulls().add_condition_different_nulls_less_equal_to(**checks_with_params[test])
        elif test == "Mixed Data Types":
            check = MixedDataTypes().add_condition_rare_type_ratio_not_in_range(**checks_with_params[test])
        elif test == "Data Duplicates":
            check = DataDuplicates().add_condition_ratio_less_or_equal(**checks_with_params[test])
        elif test == "Conflicting Labels":
            check = ConflictingLabels().add_condition_ratio_of_conflicting_labels_less_or_equal(**checks_with_params[test])
        elif test == "Feature Label Correlation":
            check = FeatureLabelCorrelation().add_condition_feature_pps_less_than(**checks_with_params[test])
        elif test == "Identifier Label Correlation":
            check = IdentifierLabelCorrelation().add_condition_pps_less_or_equal(**checks_with_params[test])
        elif test == "String Length Out Of Bounds":
            check = StringLengthOutOfBounds().add_condition_number_of_outliers_less_or_equal(**checks_with_params[test])
        else:
            # Any other name falls back to the condition-free instance
            # registered in check_options.
            check = check_options[test]

        selected_checks.append(check)

    # Create and run suite
    suite = Suite("Custom Data Integrity Suite", *selected_checks)
    result = suite.run(dataset1, dataset2)

    # Show results inline and keep an HTML copy on disk
    result.show()
    result.save_as_html('data_integrity_results.html')
    return result


def load_csv(file_path):
    """Read ``file_path`` as CSV and return it as a pandas DataFrame.

    On success the file name and the frame's shape are printed. If reading
    fails for any reason, or the resulting frame is empty, an error message
    is printed and ``None`` is returned instead of raising.
    """
    loaded = None
    try:
        frame = pd.read_csv(file_path)
        if frame.empty:
            # Handled by the except clause below like any other load failure.
            raise ValueError("Uploaded file is empty.")
        print(f"Loaded {file_path} with shape: {frame.shape}")
        loaded = frame
    except Exception as err:
        print(f"Error loading {file_path}: {err}")
    return loaded




In [None]:
from deepchecks.tabular.suites import data_integrity

# Load both splits and wrap each one as a Dataset with the label column.
train_df = load_csv(train_path)
test_df = load_csv(test_path)

dataset1 = Dataset(train_df, label=label)
dataset2 = Dataset(test_df, label=label)

# Run the suite returned by data_integrity() on both datasets and display
# the result inline.
suite = data_integrity()
result = suite.run(dataset1, dataset2)
result.show()

####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################

In [None]:
You are an expert Python programmer. Create a UI in Python as per the following instructions:
1. Let the user select tests from check_options and print the list of selected options.
2. Based on the selected check_options, provide a parameter input box for each parameter defined in checks_with_params.
3. Based on the input provided by the user, print the selection in dictionary format.

# Define available checks (without parameters)
# Maps each check's display name to a default-constructed check instance.
# NOTE(review): none of the check classes named below are imported in the
# visible part of this notebook — add the import before running this cell.
# NOTE(review): this assignment rebinds check_options, replacing the data
# integrity registry defined earlier in the notebook.
check_options = {
    "String Mismatch Comparison": StringMismatchComparison(),
    "Train Test Samples Mix": TrainTestSamplesMix(),
    "New Label Train Test": NewLabelTrainTest(),
    "New Category Train Test": NewCategoryTrainTest(),
    "Label Drift": LabelDrift(), 
    "Feature Drift": FeatureDrift(),
}

# User-supplied parameters for the checks that need them, keyed by the
# check's display name. "String Mismatch Comparison" has no entry here.
checks_with_params = {
    "Train Test Samples Mix": {"max_ratio": 0.05},
    "New Label Train Test": {"max_new": 0},
    "New Category Train Test": {"max_new": 0},
    "Label Drift": {"max_allowed_drift_score": 0.15},
    "Feature Drift": {
        "max_allowed_categorical_score": 0.2,
        "max_allowed_numeric_score": 0.2,
        "allowed_num_features_exceeding_threshold": 0,
    },
}