Library Imports


In [3]:
import pandas as pd
from openai import OpenAI
import os
import time
from pydantic import BaseModel
import json
from collections import Counter, defaultdict
import datetime
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from typing import Dict, Any
from datasketch import MinHash, MinHashLSH

GPT API setup


In [2]:
client = OpenAI(
    organization="org-Efj3WwiBs01tiD9ogyAb1vgz",
    project="proj_ItFIKb0eOHXEFM65qPMVLpHt",
    api_key="sk-proj-waDJ9nwjNNOcQa6Ol0epT3BlbkFJbwwL9qZnqZmBMVCwtvOX",
)

Prompting Utilities


In [3]:
# Write the prompts used, and the GPT output to a folder for later inspection
def write_output(
    gpt_output: Dict[str, Any],
    user_prompt: str,
    system_prompt: str,
    prompt_title: str,
    directory: str,
    input_data_dict: pd.DataFrame,
) -> None:

    try:
        # Create the subdirectory for the pompt_title if it doesn't exist
        prompt_directory = os.path.join(
            directory, prompt_title.strip()
        )  # Strip whitespace
        os.makedirs(prompt_directory, exist_ok=True)

        # Define the file paths
        output_text_file = os.path.join(prompt_directory, "output.json")
        user_prompt_text_file = os.path.join(prompt_directory, "user_prompt.txt")
        sys_prompt_text_file = os.path.join(prompt_directory, "system_prompt.txt")
        input_data_dict_file = os.path.join(prompt_directory, "dict.csv")

        # Write the output
        with open(output_text_file, "w") as f:
            f.write(gpt_output)

        # Write the prompt
        with open(user_prompt_text_file, "w") as f:
            f.write(user_prompt)

        with open(sys_prompt_text_file, "w") as f:
            f.write(system_prompt)

        input_data_dict.to_csv(
            input_data_dict_file, index=False, sep=",", lineterminator="\n"
        )

        print(f"Output and prompt saved in directory: {prompt_directory}")

    except FileNotFoundError as e:
        print(f"Error: {e}")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [4]:
# Write the number of tokens usd, the time to retrieve the output and other metadata to a CSV
def write_prompt_metadata(
    completion_tokens: int,
    prompt_tokens: int,
    total_tokens: int,
    elapsed_time: str,
    directory: str,
    prompt_title: str,
) -> None:
    current_data = pd.DataFrame(
        [
            {
                "Completion Tokens": completion_tokens,
                "Prompt Tokens": prompt_tokens,
                "Total Tokens": total_tokens,
                "Elapsed Time": elapsed_time,
                "Prompt Name": prompt_title,
            }
        ]
    )

    prompt_metadata_path = os.path.join(directory, "prompt_metadata.csv")

    # Check if the file exists to determine whether to write the header
    if os.path.exists(prompt_metadata_path):
        current_data.to_csv(prompt_metadata_path, mode="a", header=False, index=False)
    else:
        current_data.to_csv(prompt_metadata_path, mode="w", header=True, index=False)

    print("Processing complete. Results and token/time data have been saved.")

In [5]:
# Define the pompt of the GPT, providing one system prompt, a user prompt, and the expected schema for the output
def prompt_gpt(
    system_prompt: str,
    user_prompt: str,
    prompt_title: str,
    response_format: dict[str],
    output_directory: str,
    input_data_dict: pd.DataFrame,
) -> None:

    class Output(BaseModel):
        output: list[str]

    try:
        start_time = time.time()

        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=16000,
            response_format=response_format,
        )

        completion_tokens = completion.usage.completion_tokens
        prompt_tokens = completion.usage.prompt_tokens
        total_tokens = completion.usage.total_tokens

        gpt_output = completion.choices[0].message.content

        end_time = time.time()
        elapsed_time = end_time - start_time
        # Calculate hours, minutes, seconds, and milliseconds

        minutes = int((elapsed_time % 3600) // 60)
        seconds = int(elapsed_time % 60)
        milliseconds = int((elapsed_time % 1) * 1000)

        # Format the output as mm:ss.ms
        formatted_time = f"{minutes:02}:{seconds:02}.{milliseconds:03}"

        # Write the output of the prompting
        write_output(
            gpt_output=gpt_output,
            user_prompt=user_prompt,
            system_prompt=system_prompt,
            prompt_title=prompt_title,
            directory=output_directory,
            input_data_dict=input_data_dict,
        )

        # Write the prompt metadata
        write_prompt_metadata(
            completion_tokens=completion_tokens,
            prompt_tokens=prompt_tokens,
            total_tokens=total_tokens,
            elapsed_time=formatted_time,
            directory=output_directory,
            prompt_title=prompt_title,
        )
    except Exception as e:
        # Return zeros and the error message in case of an exception
        print(str(e))
        return 0, 0, 0, f"Error: {str(e)}"

In [6]:
def create_attribute_dict(attribute, column_name: str) -> pd.DataFrame:

    attribute = pd.DataFrame(attribute)
    attribute["original_index"] = attribute.index

    # Retreive unique values
    attribute_unique = pd.DataFrame(
        attribute[column_name].unique(), columns=[column_name]
    )

    # Create an index for unique values
    attribute_unique["unique_index"] = attribute_unique.index

    # Join unique indexs to the original values, creating a dictionary of original index, value, and unique index
    attribute_dict = pd.merge(
        attribute, attribute_unique, on=column_name, how="left", suffixes=("", "_df2")
    )

    attribute_dict.rename(columns={"index_df2": "unique_index"}, inplace=True)

    return attribute_dict, attribute_unique

Load Dataset


In [7]:
dataset = "hospital"

dataset = pd.read_csv(f"./datasets/{dataset}/dirty.csv")

Attribute Level Prompting


In [8]:
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "math_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "output": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "explanation": {
                                "type": "string",
                                "description": "The explanation for why a value is considered an error",
                            },
                            "index": {
                                "type": "number",
                                "description": "The index in the list where the value occurs",
                            },
                            "annotation": {
                                "type": "number",
                                "description": "The annotation denoting whether a value is an error or not",
                                "enum": [1, 0],
                            },
                            "possible_repair": {"type": "string"},
                        },
                        "required": [
                            "explanation",
                            "index",
                            "annotation",
                            "possible_repair",
                        ],
                        "additionalProperties": False,
                    },
                },
            },
            "required": ["output"],
            "additionalProperties": False,
        },
    },
}

In [9]:
numeric_prompt = """You are given a list of numeric values with their corresponding index. You have to identify all errors in the list.
"""

In [10]:
text_prompt = """You are given a list of unique values in a column with their corresponding index. You have to find all syntactic errors in the dataset and recommend a possible repair.
A syntactic error occurs when a value does not conform to the structure or domain of correct values. The domain and structure of correct values have to be derived from the values themselves.
A semantic error occurs when a value falls outside of the reasonable context of a column. Use the context description to determine if a value is a semantic error.
You have to annotate an error with a '1' and a correct value of the '0' in the output.
For the possible repair only provide the reparied value is output.
You also have to provide a brief explanation referencing the examples a proof for each annotation.
Evaluate each value and provide an annotation and explanation regardless of error status.
Values denoting empty or null values can be found in any given context, and are considered correct.
Note! Only check for syntactic errors. Do not check for language errors.

Syntactic deviations can be one of the following examples
1. Invalid characters
- Characters appear in values that do not often appear in others or make them uninterpretable
Example 1
1, John = 0
2, Greg = 0
3, Frank15 = 1

Example 2 
15, Apple = 0
16, Pxar = 1
17, Banana = 0

2. Misspelling
- Words that are misspelled. Values that are considered names are less likely to be misspelt.
Example 1
2, Blue = 0
3, Green = 0
4, Orage = 1

Example 2 
67, Reservation for two people near the window = 0
68, Reservatton for five poeple at the entrance = 1
69, Three humans arriving at 9 for drinks = 0
70, A single peersonn at three = 1

3. Pattern non-conformity
- Some values may have a common pattern with certain values deviating from this pattern. There might exist more than one valid pattern in a single attribute.
Possible repairs should attempt to conform with the most prevalent patterns. Removing a pattern does not constitute a soluition.
Example 1
32, 2024/03/12 19:00 = 0
33, 2024-12-31 12:00 = 1
34, 1994/01/13 15:15 = 0
35, 12:00am 3 January 2024 = 1

Example 2
70, Admin123 = 0
71, Bob443 = 0
72, 99Alex = 1
"""

# And a context description of the attribute.
# 4. Contextual meaning
# - The contextual meaning of a column could provide some clues as to the expected values in a column.
# Example 1
# Contextual meaning: A column possibly storing state codes
# 22, MA = 0
# 23, TX = 0
# 24, ZA = 1

In [11]:
def generate_attribute_prompt_string(attribute_unique) -> str:
    attribute_delimited = attribute_unique[
        [attribute_unique.columns[1], attribute_unique.columns[0]]
    ]

    if attribute_delimited[attribute_unique.columns[0]].dtype != "int64":
        attribute_delimited.loc[:, attribute_unique.columns[0]] = attribute_delimited[
            attribute_unique.columns[0]
        ].apply(lambda x: x.strip())

    attribute_delimited.loc[:, attribute_unique.columns[0]] = attribute_delimited[
        attribute_unique.columns[0]
    ].apply(lambda x: f"|{x}|")

    attribute_delimited.loc[:, attribute_unique.columns[1]] = attribute_delimited[
        attribute_unique.columns[1]
    ].apply(lambda x: f"[{x}]")

    attribute_unique_string = attribute_delimited.to_csv(
        index=False, header=False, lineterminator="\n", sep=">"
    )

    return attribute_unique_string

In [12]:
for col in dataset.columns:
    attribute = pd.DataFrame(dataset[col].copy())

    attribute_dict, attribute_unique = create_attribute_dict(attribute, col)

    if attribute[attribute.columns[0]].dtype == object:
        system_prompt = text_prompt
    else:
        system_prompt = numeric_prompt

    attribute_unique.columns = ["value", "index"]
    json_sample = attribute_unique.to_json(orient="records", indent=4)

    user_prompt = f"""Input:
    {json_sample}
"""
    directory = r"D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\attribute_output"

    prompt_gpt(
        system_prompt,
        user_prompt,
        col,
        response_format,
        directory,
        attribute_dict,
    )

Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\attribute_output\index
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\attribute_output\provider_number
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\attribute_output\name
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\attribute_output\address_1
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\attribute_output\address_2
Processing complete. Results and token/time data have been saved

In [13]:
# Initialize an empty dictionary to store data for each folder
data_dict = {}

# Loop through all subdirectories and files
for root, dirs, files in os.walk(directory):
    # Get the folder name (which will be the column name)
    folder_name = os.path.basename(root)

    # Skip the root directory itself (attribute_output)
    if root == directory:
        continue

    annotated_output = None
    dict_data = None

    for file in files:
        if file.endswith(".json"):
            # Construct the full file path
            file_path = os.path.join(root, file)

            # Open and load the JSON file
            with open(file_path, "r") as json_file:
                try:
                    json_data = json.load(json_file)
                    # Retrieve the annotation and index from the json output and create a dataframe
                    annotations = [item["annotation"] for item in json_data["output"]]
                    index = [item["index"] for item in json_data["output"]]

                    annotated_output = pd.DataFrame(
                        {"annotation": annotations, "index": index}
                    )

                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON from file {file_path}: {e}")
                except Exception as e:
                    print(f"Error reading file {file_path}: {e}")

        elif file == "dict.csv":
            dict_file_path = os.path.join(root, file)
            try:
                dict_data = pd.read_csv(dict_file_path)
                dict_data.columns = dict_data.columns.str.strip()
            except Exception as e:
                print(f"Error reading dict.csv from file {dict_file_path}: {e}")

    if annotated_output is not None and dict_data is not None:
        annotated_output = annotated_output.drop_duplicates(
            subset="index", keep="first"
        )

        dict_out_merged = pd.merge(
            dict_data,
            annotated_output,
            left_on="unique_index",
            right_on="index",
            how="left",
        )  # Join unique indices to the original values

        dict_out_merged.fillna(0, inplace=True)
        data_dict[folder_name] = dict_out_merged["annotation"]
    else:
        print(f"Missing files for folder {folder_name}")

# Convert the dictionary to a DataFrame
output = pd.DataFrame(data_dict)
output = output[dataset.columns.str.strip()]  # Strip whitespace from columns
output = output.astype(int)
output.to_csv("./output/attribute_output/output.csv", index=False)

Testing


In [45]:
output = pd.read_csv("./output/attribute_output/output.csv")

In [46]:
error_annotation = pd.read_csv(
    "./datasets/hospital/errors.csv", header=None, index_col=None, skiprows=1
)

In [16]:
def calculate_metrics(df1, df2):
    # Flatten the dataframes to 1D arrays
    y_true = df1.values.flatten()
    y_pred = df2.values.flatten()

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # Precision
    precision = precision_score(y_true, y_pred)

    # Recall
    recall = recall_score(y_true, y_pred)

    f_score = f1_score(y_true, y_pred)

    return accuracy, precision, recall, f_score

In [47]:
accuracy, precision, recall, f_score = calculate_metrics(output, error_annotation)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f_score}")

Accuracy: 0.9793
Precision: 0.8231827111984283
Recall: 0.5639300134589502
F1 score: 0.6693290734824281


In [18]:
def inspect_classification(error_annotation, output, input):
    error_annotation.columns = input.columns
    output.columns = input.columns

    calc = error_annotation.add(2)
    calc_out = output
    calc_out[calc_out == 0] = -1

    calc = calc.add(calc_out)

    # True positive calculation
    tp = calc == 4
    tp = input[tp]

    # False positive calculation
    fp = calc == 3
    fp = input[fp]

    # False negative calculation
    fn = calc == 2
    fn = input[fn]

    return tp, fp, fn

In [19]:
tp, fp, fn = inspect_classification(
    error_annotation=error_annotation,
    output=output,
    input=dataset,
)

In [20]:
tp.to_csv("./output/attribute_output/tp.csv")
fp.to_csv("./output/attribute_output/fp.csv")
fn.to_csv("./output/attribute_output/fn.csv")

In [19]:
def annotate_errors(
    df_fixed: pd.DataFrame, df_with_errors: pd.DataFrame
) -> pd.DataFrame:
    # Check if the dataframes have the same shape and columns after sorting
    if df_fixed.shape != df_with_errors.shape or not all(
        df_fixed.columns == df_with_errors.columns
    ):
        raise ValueError("Both dataframes must have the same structure.")

    # Convert both dataframes to strings for datatype-agnostic comparison
    df_fixed_str = df_fixed.astype(str)
    df_with_errors_str = df_with_errors.astype(str)

    # Create the annotation dataframe by comparing the two dataframes
    error_annotation = (df_fixed_str != df_with_errors_str).astype(int)

    return error_annotation

In [21]:
dirty = pd.read_csv("./datasets/hospital/dirty.csv")
clean = pd.read_csv("./datasets/hospital/clean.csv")


error_annotation = annotate_errors(clean, dirty)

Utill Here in Flask. Test to see if it works. Next, recreate the code to get the error annotation when you have a dirty dataset and a clean dataset


In [21]:
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "math_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "output": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "columns": {
                                "type": "array",
                                "items": {"type": "number"},
                                "description": "An array of columns that might share a dependency.",
                            },
                            "dependency": {
                                "type": "string",
                                "description": "A description of the dependency between the identified columns.",
                            },
                        },
                        "required": ["columns", "dependency"],
                        "additionalProperties": False,
                    },
                },
            },
            "required": ["output"],
            "additionalProperties": False,
        },
    },
}

In [22]:
system_prompt = """
You are given a sample of similar records from a relational database. You have to determine which columns might have dependencies between them. 
For the output, only provide pairs of columns that might have dependencies. If multipule dependencies exist with a single column, provide more than one outputs for that column.
Do not check for dependencies between a column and itself.

Dependencies can occur in two ways.

1. Semantic dependency
- The meaning of the values of one column determines the meaning of another. 
Example:
One column represents cities, and another countries. This may indicated that there is a dependency between the two columns, and that the citities column, contains cities that are present in that country.
Col 1                   , Col 2
Great Britain           , London
United States of America, Washington DC
South Africa            , Pretoria

Note! Other semantic dependecies may occur in different domains from the one mentioned in the example.

2. Pattern Dependency
- One column may have a pattern, which in part is based on the meaning of another column. 
Example:
One column represents emergency codes, the other emergency descriptions.
Col 1      , Col 2
FRE-003_a  , Forrest fire with damages above $2,000
FLD-001_z  , Floods that destroyed local buildings
ERQ-777_n  , Earthquakes that caused power outages 
HUR-008_t  , Hurricanes that was able to breach sea walls
"""

In [23]:
records = dataset.values.tolist()

# Create an LSH index with a threshold and number of permutations
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Insert each record into the LSH index
for i, record in enumerate(records):
    m = MinHash(num_perm=128)
    for feature in record:
        m.update(str(feature).encode("utf8"))  # Hashing the attributes of the record
    lsh.insert(i, m)

    buckets = []
visited = set()  # To track records that have already been assigned to a bucket

# Querying similar records for each record
for i, record in enumerate(records):
    if i not in visited:  # Only process records that haven't been visited
        # Create MinHash for the current record
        m = MinHash(num_perm=128)
        for feature in record:
            m.update(str(feature).encode("utf8"))

        # Query LSH to get similar records
        similar_records = lsh.query(m)

        # Add the current record and its similar ones as a new bucket
        buckets.append(similar_records)

        # Mark all similar records as visited
        visited.update(similar_records)

# Calculate the sizes of all buckets
bucket_sizes = [(i, len(bucket)) for i, bucket in enumerate(buckets)]

# Sort buckets by size in descending order
sorted_buckets = sorted(bucket_sizes, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 buckets
top_buckets_raw = sorted_buckets[:10]  # Get the top 10 buckets

# Define a minimum size threshold (e.g., 2)
min_size_threshold = 2

# Filter the top buckets based on the minimum size threshold
filtered_top_buckets = [
    bucket for bucket in top_buckets_raw if bucket[1] >= min_size_threshold
]

In [24]:
for bucket_index, _ in filtered_top_buckets:
    dynamic_directory = os.path.join(
        r"D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency",
        f"bucket_{bucket_index}",
    )

    os.makedirs(dynamic_directory, exist_ok=True)

    current_bucket_records = [records[i] for i in buckets[bucket_index]]

    dataset_sample = pd.DataFrame(current_bucket_records)
    dataset_sample.columns = range(dataset_sample.shape[1])

    json_sample = dataset_sample.to_json(orient="records", indent=4)

    user_prompt = f"""Input:
The following is a formatted table with the data to be checked.
{json_sample}
"""

    # Call your prompt_gpt function with the dynamic directory
    prompt_gpt(
        system_prompt,
        user_prompt,
        "output",
        response_format,
        dynamic_directory,
        dataset_sample,
    )

Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency\bucket_29\output
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency\bucket_2\output
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency\bucket_0\output
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency\bucket_173\output
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency\bucket_225\output
Processing complete. Results and token/time data have be

In [25]:
# Initialize a dictionary to hold unique dependencies
dependencies_dict = defaultdict(set)

# Assuming you have a list of dynamic directories for each bucket
for bucket_index, _ in filtered_top_buckets:
    # Construct the directory for the current bucket
    dynamic_directory = os.path.join(
        r"D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency",
        f"bucket_{bucket_index}\output\output.json",
    )

    # Check if the JSON file exists before trying to read it
    if os.path.exists(dynamic_directory):
        with open(dynamic_directory, "r") as json_file:
            try:
                json_data = json.load(json_file)

                # Retrieve the annotations and create a list of tuples for dependencies
                for item in json_data["output"]:
                    columns = item["columns"]
                    # Sort the columns numerically to ensure the order is consistent
                    sorted_columns_tuple = tuple(
                        sorted(columns, key=int)
                    )  # Sort numerically
                    dependency_description = item["dependency"]

                    # Store the dependency description in the set for this tuple of columns
                    dependencies_dict[sorted_columns_tuple].add(dependency_description)

            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from file {dynamic_directory}: {e}")
            except Exception as e:
                print(f"Error reading file {dynamic_directory}: {e}")

# Prepare the final list for DataFrame
dependencies_list = []
for columns, descriptions in dependencies_dict.items():

    first_description = next(iter(descriptions))  # Extract the first item from the set
    dependencies_list.append(
        {
            "columns": list(columns),  # Convert back to list for the DataFrame
            "dependency": first_description,  # Store only the first description
            # "dependency": list(descriptions),  # Store all descriptions as a list
        }
    )

# Sort dependencies list first by the first column, then by the second column
dependencies_list.sort(key=lambda x: (x["columns"][0], x["columns"][1]))

# Create a DataFrame from the sorted unique dependencies
dependencies_df = pd.DataFrame(dependencies_list)

# Optionally, you can display the DataFrame or save it to a file
print(dependencies_df)

# Save the sorted dependencies DataFrame to a CSV file
dependencies_df.to_csv("dependencies.csv", index=False)

     columns                                         dependency
0     [0, 1]  Unique record identifiers may have an associat...
1     [1, 2]  The zip code (column 1) indicates the specific...
2     [1, 6]  The zip code (column 1) suggests that the loca...
3    [1, 11]  The facility ID (Col 1) is related to the cate...
4    [1, 12]  The facility ID (Col 1) is related to the type...
5    [1, 14]  The facility ID (Col 1) determines the type of...
6    [1, 19]  The identifier (column 19) corresponds context...
7     [2, 3]  The street address (Col 3) likely relies on th...
8     [2, 9]  The hospital name (Col 2) might relate to the ...
9    [2, 12]  The hospital name (Col 2) is dependent on the ...
10   [2, 14]  The medical center name in column 2 indicates ...
11    [3, 4]  The address (Col 4) likely depends on the stre...
12    [4, 9]  The county name (Col 9) may semantically depen...
13   [5, 14]  The disease category (Col 14) might depend on ...
14    [6, 7]  The city (Col 6) relates t

In [31]:
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "math_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "output": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "explanation": {
                                "type": "string",
                                "description": "The explanation for why a value is considered an error",
                            },
                            "index": {
                                "type": "number",
                                "description": "The index in the list where the value occurs",
                            },
                            "column": {
                                "type": "number",
                                "description": "The column where the violation occured",
                            },
                            "annotation": {
                                "type": "number",
                                "description": "The annotation denoting whether a value is an error or not",
                                "enum": [1, 0],
                            },
                            "possible_repair": {"type": "string"},
                        },
                        "required": [
                            "explanation",
                            "index",
                            "annotation",
                            "possible_repair",
                            "column",
                        ],
                        "additionalProperties": False,
                    },
                },
            },
            "required": ["output"],
            "additionalProperties": False,
        },
    },
}

In [32]:
system_prompt = """You are given data from columns in a dataset that is said to have a dependency between each other. You have to detect violations in this dependency.

You are also given a description of the dependency that you can use to help you identify violations.

For the possible repair provide only the repair value. The error value and column is the value that is abnormal and that causes the violation.

Dependecy violations could be either semantic or syntactic violations. Use the dependency description to determine the violation. Do not check for language usage errors.

"""

In [33]:
def generate_dependency_prompt_string(columns: pd.DataFrame) -> str:
    # Create a true copy of the DataFrame to avoid warnings about modifying slices
    data_columns = columns.copy()

    # Remove duplicate rows based on all columns
    unique_combinations = data_columns.drop_duplicates(
        subset=data_columns.columns.tolist()
    ).copy()

    # Add index as a new column, and format it with brackets using .loc[]
    unique_combinations["index"] = (
        unique_combinations.index
    )  # Ensure you're adding a new column
    unique_combinations["index"] = unique_combinations["index"].apply(
        lambda x: f"[{x}]"
    )

    # Apply delimiters to other columns
    for col in unique_combinations.columns:
        if col != "index":  # Skip the 'index' column
            unique_combinations.loc[:, col] = unique_combinations[col].apply(
                lambda x: f"|{x}|"
            )

    # Reorder columns to move 'index' to the front
    cols = ["index"] + [col for col in unique_combinations.columns if col != "index"]
    unique_combinations = unique_combinations[cols]

    # Convert the DataFrame to CSV format with the required delimiters
    unique_combinations_string = unique_combinations.to_csv(
        index=False, header=False, lineterminator="\n", sep=","
    )

    return unique_combinations_string

In [34]:
# Function to create unique row dictionary
def create_row_dict(selected_columns: pd.DataFrame) -> pd.DataFrame:
    # Make a copy to avoid SettingWithCopyWarning
    selected_columns = selected_columns.copy()

    # Store original index
    selected_columns["original_index"] = selected_columns.index
    # Retrieve unique rows by dropping duplicates, ignoring 'original_index'
    unique_rows = (
        selected_columns.drop("original_index", axis=1)
        .drop_duplicates(subset=selected_columns.columns[:-1])
        .reset_index(drop=True)
    )
    unique_rows["unique_index"] = (
        unique_rows.index
    )  # Create unique index for unique rows

    # print(selected_columns.columns)
    # print(unique_rows.columns)

    # Merge unique row index back to original data to track the mapping
    row_dict = pd.merge(
        selected_columns,
        unique_rows,
        how="left",
        on=list(
            selected_columns.columns[:-1]
        ),  # Merge on actual data columns, excluding 'original_index' and 'unique_row_index'
    )
    # attribute_dict = pd.merge(
    #     attribute, attribute_unique, on=column_name, how="left", suffixes=("", "_df2")
    # )
    # (print(selected_columns.columns[:-1]))
    # (print(row_dict))

    return row_dict, unique_rows

In [35]:
# Group the dependencies by the first index
grouped_dependencies = defaultdict(list)

for _, row in dependencies_df.iterrows():
    first_index = row["columns"][0]  # Get the first index of the pair
    grouped_dependencies[first_index].append(row)  # Group by first index

# Now process each group of dependencies based on the first index
for first_index, dependencies in grouped_dependencies.items():

    selected_columns = [
        dataset.columns[first_index]
    ]  # Select columns based on the first index
    dependencies_list = []

    for dep in dependencies:
        columns = dep["columns"]
        dependency_description = dep["dependency"]
        selected_columns = dataset[
            dataset.columns[columns]
        ]  # Get the relevant columns from the dataset

        # Check if there are duplicates
        if selected_columns.drop_duplicates().shape[0] < dataset.shape[0]:
            # Create the unique row dictionary using the previously defined create_row_dict function
            row_dict, unique_rows = create_row_dict(selected_columns)

            # Only use the unique rows for prompting
            directory = r"D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency_violations"

            # Prepare the data to prompt for each unique row combination
            # data_columns = unique_rows.drop("original_index", axis=1)

            unique_rows.columns = [
                str(columns[0]),
                str(columns[1]),
                "index",
            ]

            json_sample = unique_rows.to_json(orient="records", indent=4)

            # Prepare the user prompt with dependency and unique rows
            user_prompt = f"""Input:
The dependency identified in this table is defined as follows:
{dependency_description}

The following is a formatted table with the unique data to be checked.
{json_sample}
"""

            # Uncomment this to send the prompt to the GPT system
            prompt_gpt(
                system_prompt,
                user_prompt,
                str(columns),
                response_format,
                directory,
                row_dict,
            )

Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency_violations\[1, 2]
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency_violations\[1, 6]
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency_violations\[1, 11]
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency_violations\[1, 12]
Processing complete. Results and token/time data have been saved.
Output and prompt saved in directory: D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency_violations\[1, 14]
Processing complete. Results and token/time data

In [36]:
import os
import json
import pandas as pd

# Path to the directory containing the JSON files
directory = r"D:\Documents\UU\Thesis\Artifact\CAED\dataset_analyzer\notebook\output\dependency_violations"

# Assuming the original DataFrame structure is known
original_dataframe = dataset  # Replace with your actual DataFrame

# Initialize the DataFrame with zeros, same shape as original dataframe
annotated_output = pd.DataFrame(
    0, index=original_dataframe.index, columns=original_dataframe.columns
)

# Loop through all subdirectories and files
for root, dirs, files in os.walk(directory):
    # Get the folder name (which could be the column name or some identifier)
    folder_name = os.path.basename(root)

    # Skip the root directory itself (attribute_output)
    if root == directory:
        continue

    # Variables to store JSON and dictionary data
    annotations_data = None
    dict_data = None

    for file in files:
        if file.endswith(".json"):
            # Construct the full file path for JSON
            file_path = os.path.join(root, file)

            # Open and load the JSON file
            with open(file_path, "r") as json_file:
                try:
                    json_data = json.load(json_file)

                    # Retrieve the annotation, index, and column from the JSON output
                    annotations = [item["annotation"] for item in json_data["output"]]
                    indices = [item["index"] for item in json_data["output"]]
                    columns = [item["column"] for item in json_data["output"]]

                    # Create a DataFrame for the JSON annotations
                    annotations_data = pd.DataFrame(
                        {"annotation": annotations, "index": indices, "column": columns}
                    )

                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON from file {file_path}: {e}")
                except Exception as e:
                    print(f"Error reading file {file_path}: {e}")

        elif file == "dict.csv":
            # Construct the full file path for dict.csv
            dict_file_path = os.path.join(root, file)
            try:
                # Read the dict.csv file into a DataFrame
                dict_data = pd.read_csv(dict_file_path)
                dict_data.columns = dict_data.columns.str.strip()

            except Exception as e:
                print(f"Error reading dict.csv from file {dict_file_path}: {e}")

    # Ensure both annotation data and dictionary data are available
    if annotations_data is not None and dict_data is not None:
        # Merge the annotation data (based on the index) with the dictionary data (unique_index mapping)
        dict_out_merged = pd.merge(
            dict_data,
            annotations_data,
            left_on="unique_index",  # Use the unique index from dict.csv
            right_on="index",  # Match with index in the annotation data
            how="left",
        )

        # Replace NaN annotations with 0
        dict_out_merged.fillna(0, inplace=True)

        # Process the detailed annotation matching to the original dataframe
        for _, row in dict_out_merged.iterrows():
            original_index = int(
                row["original_index"]
            )  # Ensure original index is an integer
            annotation = row["annotation"]
            column = int(row["column"])  # Ensure the column is integer

            if annotation == 1:  # Only proceed if there's an error annotation
                # Update the annotated_output DataFrame at the specific cell
                col_name = original_dataframe.columns[
                    column
                ]  # Get column name based on column index
                annotated_output.at[original_index, col_name] = annotation

    else:
        print(f"Missing files for folder {folder_name}")

# Save the annotated DataFrame to CSV
annotated_output.to_csv("./output/dependency_violations/output.csv", index=False)

In [48]:
annotated_output = pd.read_csv("./output/dependency_violations/output.csv")

In [49]:
accuracy, precision, recall, f_score = calculate_metrics(
    annotated_output, error_annotation
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f_score}")

Accuracy: 0.86035
Precision: 0.6208251473477406
Recall: 0.1083676268861454
F1 score: 0.18452554744525546


In [37]:
tp, fp, fn = inspect_classification(
    error_annotation=error_annotation,
    output=annotated_output,
    input=dataset,
)

In [38]:
tp.to_csv("./output/dependency_violations/tp.csv")
fp.to_csv("./output/dependency_violations/fp.csv")
fn.to_csv("./output/dependency_violations/fn.csv")

In [39]:
import pandas as pd

# Load the two datasets
dataset_1 = pd.read_csv(
    "./output/dependency_violations/output.csv"
)  # Path to first dataset
dataset_2 = pd.read_csv(
    "./output/attribute_output/output.csv"
)  # Path to second dataset

# Ensure both datasets have the same structure (columns and index)
# Initialize a consolidated DataFrame with the same shape as the original datasets
consolidated_data = pd.DataFrame(0, index=dataset_1.index, columns=dataset_1.columns)

# Loop through the indices and columns to consolidate annotations in place
for col in dataset_1.columns:
    for index in dataset_1.index:
        # Check if the current index has an error in dataset 1
        error_1 = dataset_1.at[index, col]
        error_2 = (
            dataset_2.at[index, col] if index < len(dataset_2) else 0
        )  # Handle cases where index exceeds

        # Logic to consolidate annotations
        if error_1 == 1 or error_2 == 1:  # If either dataset has an error
            consolidated_data.at[index, col] = 1  # Mark as error
        else:
            consolidated_data.at[index, col] = 0  # No error


# Save to CSV
consolidated_data.to_csv("./output/consolidated_error_annotations.csv", index=False)

In [40]:
accuracy, precision, recall, f_score = calculate_metrics(
    consolidated_data, error_annotation
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f_score}")

Accuracy: 0.8515
Precision: 0.899803536345776
Recall: 0.13562333432040272
F1 score: 0.2357179619145651


In [41]:
tp, fp, fn = inspect_classification(
    error_annotation=error_annotation,
    output=output,
    input=dataset,
)

In [42]:
tp.to_csv("./output/tp.csv")
fp.to_csv("./output/fp.csv")
fn.to_csv("./output/fn.csv")

balanced accuracy
comparing with only errors (accuracy)

entity resolution for dependency preprocessing, clustering, blocking\*


Raha, synodc = experminets, experminental setup. Datasets, performance, baseline


Check for new papers on LLM data cleaning
