<a href="https://colab.research.google.com/github/RyanKelvinFord/Policy/blob/main/Conference_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# prompt: Lets load the json files into memeory and put them into a dataframe with columns file name and then file content

import json
import pandas as pd
import os
import glob

def json_to_dataframe(directory):
    """
    Loads JSON files from a directory into a pandas DataFrame.

    Args:
        directory: The path to the directory containing the JSON files.

    Returns:
        A pandas DataFrame with 'file_name' and 'file_content' columns.
        Returns an empty DataFrame if no JSON files are found or if an error occurs.
    """

    json_files = glob.glob(os.path.join(directory, '*.json'))
    data = []

    for file in json_files:
        try:
            with open(file, 'r') as f:
                content = json.load(f)
                data.append({'file_name': os.path.basename(file), 'file_content': content})
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {file}: {e}")
        except Exception as e:
            print(f"An error occurred while processing {file}: {e}")


    if data:
      return pd.DataFrame(data)
    else:
      return pd.DataFrame(columns=['file_name', 'file_content'])


df = json_to_dataframe('/content/')
df.head()

Unnamed: 0,file_name,file_content
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...


In [30]:
import pandas as pd
import json

def process_json_column(df, column_name):
    """
    Convert a specified column in a dataframe from string to valid JSON objects.

    :param df: Pandas DataFrame containing the column with JSON-like strings.
    :param column_name: Name of the column to be processed.
    :return: DataFrame with the processed column.
    """
    def try_parse_json(value):
        try:
            return json.loads(value.replace("'", '"')) if isinstance(value, str) else value
        except json.JSONDecodeError:
            return None  # Return None for invalid JSON values

    df[column_name] = df[column_name].apply(try_parse_json)
    return df

df = process_json_column(df, 'file_content')
df.head()

Unnamed: 0,file_name,file_content
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...


In [31]:
import pandas as pd
import json

def validate_json_column(df, column_name):
    """
    Validate if the JSON data in the specified column is correctly formatted.

    :param df: Pandas DataFrame containing the column with JSON data.
    :param column_name: Name of the column to be validated.
    :return: DataFrame with an additional column 'is_valid_json' indicating validity.
    """
    def is_valid_json(value):
        if isinstance(value, dict):
            return True  # Already a valid JSON object
        elif isinstance(value, str):
            try:
                json.loads(value)  # Attempt to parse
                return True
            except json.JSONDecodeError:
                return False
        return False  # Not a string or dictionary

    df["is_valid_json"] = df[column_name].apply(is_valid_json)
    return df

df = validate_json_column(df, 'file_content')
df.head()

Unnamed: 0,file_name,file_content,is_valid_json
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True


In [32]:
import json
import torch
from sentence_transformers import SentenceTransformer, util

# Load SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_match_descriptions(df, file_name_col, content_col):
    """
    Checks the semantic similarity between the 'file_name' and the '@value' text in 'ids:description'.

    :param df: Pandas DataFrame containing the JSON data.
    :param file_name_col: Name of the column that contains file names.
    :param content_col: Name of the column that contains JSON data.
    :return: DataFrame with an additional 'semantic_similarity_description' column.
    """
    similarities = []

    for index, row in df.iterrows():
        file_name = row[file_name_col]
        json_data = row[content_col]
        best_score = 0  # Default similarity score

        # Ensure file name and JSON content are valid
        if isinstance(file_name, str) and isinstance(json_data, dict):
            try:
                # Extract 'ids:description' if it exists
                if "ids:description" in json_data and isinstance(json_data["ids:description"], list):
                    for desc in json_data["ids:description"]:
                        if "@value" in desc:
                            description_text = desc["@value"]

                            # Encode sentences into embeddings
                            embeddings = model.encode([file_name, description_text], convert_to_tensor=True)

                            # Compute cosine similarity
                            similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

                            best_score = max(best_score, similarity)  # Keep the highest similarity score
            except Exception as e:
                print(f"Error processing row {index}: {e}")

        similarities.append(best_score)

    # Add similarity scores to DataFrame with the new column name
    df["semantic_similarity_description"] = similarities
    return df

# Run semantic matching
df = semantic_match_descriptions(df, "file_name", "file_content")

# Display results in Google Colab
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698


In [33]:
import json
import torch
from sentence_transformers import SentenceTransformer, util

# Load SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_match_titles(df, file_name_col, content_col):
    """
    Checks the semantic similarity between the 'file_name' and the '@value' text in 'ids:title'.

    :param df: Pandas DataFrame containing the JSON data.
    :param file_name_col: Name of the column that contains file names.
    :param content_col: Name of the column that contains JSON data.
    :return: DataFrame with an additional 'semantic_similarity_title' column.
    """
    similarities = []

    for index, row in df.iterrows():
        file_name = row[file_name_col]
        json_data = row[content_col]
        best_score = 0  # Default similarity score

        # Ensure file name and JSON content are valid
        if isinstance(file_name, str) and isinstance(json_data, dict):
            try:
                # Extract 'ids:title' if it exists
                if "ids:title" in json_data and isinstance(json_data["ids:title"], list):
                    for title in json_data["ids:title"]:
                        if "@value" in title:
                            title_text = title["@value"]

                            # Encode sentences into embeddings
                            embeddings = model.encode([file_name, title_text], convert_to_tensor=True)

                            # Compute cosine similarity
                            similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

                            best_score = max(best_score, similarity)  # Keep the highest similarity score
            except Exception as e:
                print(f"Error processing row {index}: {e}")

        similarities.append(best_score)

    # Add similarity scores to DataFrame with the new column name
    df["semantic_similarity_title"] = similarities
    return df

# Run semantic matching for titles
df = semantic_match_titles(df, "file_name", "file_content")

# Display results in Google Colab
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907


In [34]:
!pip install rdflib



In [35]:
import re
import json
import pandas as pd
from rdflib import Graph, URIRef

# Function to load ontology with auto-detected format
def load_ontology(ontology_path):
    """
    Load the ontology and extract valid individuals (URIs).

    :param ontology_path: Path to the ontology RDF file.
    :return: Set of ontology individuals (URIs).
    """
    g = Graph()
    try:
        g.parse(ontology_path)  # Auto-detect format
        ontology_individuals = set(g.subjects())
        print(f"Ontology loaded successfully! Found {len(ontology_individuals)} valid URIs.")
        return ontology_individuals
    except Exception as e:
        print(f"Error loading ontology: {e}")
        return set()  # Return an empty set if parsing fails

# Define ignored patterns (like auto-generated URIs)
ignored_value_patterns = [
    r"https://w3id.org/idsa/autogen/.*"  # Regex to match autogenerated values
]

# Function to validate JSON policies against the ontology
def validate_json_against_ontology(df, ontology_individuals, column_name="file_content"):
    """
    Validate if ids:action and ids:constraint fields in JSON match the ontology.

    :param df: Pandas DataFrame containing JSON data.
    :param ontology_individuals: Set of valid URIs from the ontology.
    :param column_name: Column in the DataFrame with JSON content.
    :return: DataFrame with an additional column 'ontology_validation_errors'.
    """
    validation_errors_list = []

    def is_ignored(value):
        """Check if a value matches ignored patterns."""
        return any(re.match(pattern, value) for pattern in ignored_value_patterns)

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        errors = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Validate ids:action
            if "ids:action" in json_data:
                for action in json_data["ids:action"]:
                    if "@id" in action:
                        uri = URIRef(action["@id"])
                        if not is_ignored(action["@id"]) and uri not in ontology_individuals:
                            errors.append(f"ERROR: Action '{action['@id']}' not found in ontology.")

            # Validate ids:constraint
            if "ids:constraint" in json_data:
                for i, constraint in enumerate(json_data["ids:constraint"]):
                    constraint_path = f"ids:constraint[{i}]"

                    # Validate ids:operator
                    if "ids:operator" in constraint and "@id" in constraint["ids:operator"]:
                        operator_uri = URIRef(constraint["ids:operator"]["@id"])
                        if not is_ignored(constraint["ids:operator"]["@id"]) and operator_uri not in ontology_individuals:
                            errors.append(f"ERROR: Operator '{constraint['ids:operator']['@id']}' not found in ontology.")

                    # Validate ids:leftOperand
                    if "ids:leftOperand" in constraint and "@id" in constraint["ids:leftOperand"]:
                        left_operand_uri = URIRef(constraint["ids:leftOperand"]["@id"])
                        if not is_ignored(constraint["ids:leftOperand"]["@id"]) and left_operand_uri not in ontology_individuals:
                            errors.append(f"ERROR: LeftOperand '{constraint['ids:leftOperand']['@id']}' not found in ontology.")

                    # Validate ids:rightOperand
                    if "ids:rightOperand" in constraint and "@id" in constraint["ids:rightOperand"]:
                        right_operand_uri = URIRef(constraint["ids:rightOperand"]["@id"])
                        if not is_ignored(constraint["ids:rightOperand"]["@id"]) and right_operand_uri not in ontology_individuals:
                            errors.append(f"ERROR: RightOperand '{constraint['ids:rightOperand']['@id']}' not found in ontology.")

        # Store results
        validation_errors_list.append(errors if errors else ["No errors - All values found in ontology."])

    # Add validation results as a new column in the existing DataFrame
    df["ontology_validation_errors"] = validation_errors_list
    return df

# ---- STEP 1: Load the Ontology ----
ontology_file_path = "/content/ontology.rdf"  # Adjust with the actual path
ontology_individuals = load_ontology(ontology_file_path)

# ---- STEP 2: Validate JSON Policies Against Ontology ----
df = validate_json_against_ontology(df, ontology_individuals, "file_content")

# ---- STEP 3: Display Updated DataFrame with Validation Column ----
from IPython.display import display
display(df)

Ontology loaded successfully! Found 1170 valid URIs.


Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.]


In [36]:
import re
import json
import pandas as pd
from rdflib import Graph, URIRef
from datetime import datetime

# Function to evaluate policy logic
def evaluate_policy_logic(df, column_name="file_content"):
    """
    Evaluates the logical correctness of JSON policies in the DataFrame.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_logic_issues' column.
    """
    logic_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Validate constraints exist if required
            if "ids:constraint" in json_data:
                constraints = json_data["ids:constraint"]
                time_constraints = []
                numeric_constraints = {}

                for constraint in constraints:
                    # Validate `ids:operator`
                    if "ids:operator" in constraint and "@id" in constraint["ids:operator"]:
                        operator = constraint["ids:operator"]["@id"]

                        # Ensure a leftOperand exists
                        if "ids:leftOperand" not in constraint or "@id" not in constraint["ids:leftOperand"]:
                            issues.append(f"ERROR: Constraint '{operator}' is missing leftOperand.")

                        # Ensure a rightOperand exists
                        if "ids:rightOperand" not in constraint or "@value" not in constraint["ids:rightOperand"]:
                            issues.append(f"ERROR: Constraint '{operator}' is missing rightOperand.")

                        left_operand = constraint.get("ids:leftOperand", {}).get("@id", None)
                        right_operand = constraint.get("ids:rightOperand", {}).get("@value", None)

                        # Detect conflicting time constraints
                        if operator in ["https://w3id.org/idsa/code/DURING", "https://w3id.org/idsa/code/BEFORE", "https://w3id.org/idsa/code/AFTER"]:
                            try:
                                time_constraints.append(datetime.fromisoformat(right_operand.replace("Z", "")))
                            except ValueError:
                                issues.append(f"ERROR: Invalid datetime format in constraint '{operator}'.")

                        # Detect invalid numeric constraints
                        if operator in ["https://w3id.org/idsa/code/LTEQ", "https://w3id.org/idsa/code/GTEQ"]:
                            try:
                                numeric_constraints[left_operand] = float(right_operand)
                            except ValueError:
                                issues.append(f"ERROR: Invalid numeric value in constraint '{operator}'.")

                # Check for conflicting constraints
                if len(time_constraints) >= 2:
                    if any(t1 > t2 for t1, t2 in zip(time_constraints, time_constraints[1:])):
                        issues.append("ERROR: Conflicting time constraints detected.")

                # Check for redundant constraints
                if len(numeric_constraints) != len(set(numeric_constraints.values())):
                    issues.append("ERROR: Redundant numeric constraints detected.")

            # Validate that actions make sense
            if "ids:action" in json_data:
                for action in json_data["ids:action"]:
                    if "@id" in action:
                        action_type = action["@id"]
                        if action_type not in [
                            "https://w3id.org/idsa/code/USE",
                            "https://w3id.org/idsa/code/READ",
                            "https://w3id.org/idsa/code/WRITE"
                        ]:
                            issues.append(f"ERROR: Unrecognized action type '{action_type}'.")

        # Store results
        logic_issues_list.append(issues if issues else ["No issues - Policy logic is valid."])

    # Add logic validation results to the DataFrame
    df["policy_logic_issues"] = logic_issues_list
    return df

# ---- STEP 1: Evaluate Policy Logic ----
df = evaluate_policy_logic(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Logic Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[ERROR: Constraint 'https://w3id.org/idsa/code...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]


In [38]:
import re
import json
import pandas as pd
from rdflib import URIRef

# Function to evaluate policy logic and detect contradictions
def evaluate_policy_logic(df, column_name="file_content"):
    """
    Evaluates the logical correctness of JSON policies in the DataFrame.
    Detects conflicting actions and contradictory constraints.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_logic_issues' column.
    """
    logic_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            actions = set()
            constraints = {}

            # Extract actions
            if "ids:action" in json_data:
                for action in json_data["ids:action"]:
                    if isinstance(action, dict) and "@id" in action:
                        actions.add(action["@id"])

            # Detect conflicting actions
            if "https://w3id.org/idsa/code/USE" in actions and "https://w3id.org/idsa/code/MODIFY" in actions:
                issues.append("ERROR: Conflicting actions - 'USE' and 'MODIFY' cannot both be allowed.")

            # Extract constraints
            if "ids:constraint" in json_data:
                for constraint in json_data["ids:constraint"]:
                    if not isinstance(constraint, dict):  # Skip invalid constraints
                        continue

                    operator = constraint.get("ids:operator", {}).get("@id")
                    left_operand = constraint.get("ids:leftOperand", {}).get("@id")
                    right_operand = constraint.get("ids:rightOperand", {}).get("@value")

                    # Ensure all required values exist
                    if operator and left_operand and right_operand:
                        # Store constraints based on leftOperand
                        if left_operand not in constraints:
                            constraints[left_operand] = []
                        constraints[left_operand].append((operator, right_operand))

            # Detect contradictory constraints
            for left_operand, operand_constraints in constraints.items():
                lteq_values = [float(value) for op, value in operand_constraints if op == "https://w3id.org/idsa/code/LTEQ"]
                gteq_values = [float(value) for op, value in operand_constraints if op == "https://w3id.org/idsa/code/GTEQ"]

                # If max(GTEQ) > min(LTEQ), there is a contradiction
                if lteq_values and gteq_values and max(gteq_values) > min(lteq_values):
                    issues.append(f"ERROR: Conflicting constraints for '{left_operand}' - GTEQ({max(gteq_values)}) > LTEQ({min(lteq_values)})")

        # Store results
        logic_issues_list.append(issues if issues else ["No issues - Policy logic is valid."])

    # Add logic validation results to the DataFrame
    df["policy_logic_issues"] = logic_issues_list
    return df

# ---- STEP 1: Evaluate Policy Logic ----
df = evaluate_policy_logic(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Logic Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]


In [39]:
import json
import pandas as pd
from jsonschema import validate, ValidationError

# Define the expected JSON schema
policy_schema = {
    "type": "object",
    "required": ["@context", "@type", "ids:action"],
    "properties": {
        "@context": {"type": "object"},
        "@type": {"type": "string"},
        "ids:action": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "@id": {"type": "string"}
                },
                "required": ["@id"]
            }
        },
        "ids:constraint": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "ids:operator": {
                        "type": "object",
                        "properties": {
                            "@id": {"type": "string"}
                        },
                        "required": ["@id"]
                    },
                    "ids:leftOperand": {
                        "type": "object",
                        "properties": {
                            "@id": {"type": "string"}
                        },
                        "required": ["@id"]
                    },
                    "ids:rightOperand": {
                        "type": "object",
                        "properties": {
                            "@value": {
                                "oneOf": [
                                    {"type": "string"},
                                    {"type": "number"},
                                    {"type": "boolean"}
                                ]
                            }
                        },
                        "required": ["@value"]
                    }
                },
                "required": ["ids:operator", "ids:leftOperand", "ids:rightOperand"]
            }
        }
    }
}

# Function to validate JSON schema conformance
def validate_json_schema(df, column_name="file_content"):
    """
    Validates JSON schema conformance for policies.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'json_schema_issues' column.
    """
    schema_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            try:
                validate(instance=json_data, schema=policy_schema)
            except ValidationError as e:
                issues.append(f"Schema Error: {e.message}")

        # Store results
        schema_issues_list.append(issues if issues else ["No schema issues - JSON structure is valid."])

    # Add schema validation results to the DataFrame
    df["json_schema_issues"] = schema_issues_list
    return df

# ---- STEP 1: Validate JSON Schema ----
df = validate_json_schema(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Schema Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.]


In [40]:
import json
import pandas as pd

# Function to check dependencies between constraints
def check_constraint_dependencies(df, column_name="file_content"):
    """
    Validates dependencies between constraints in JSON policies.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'constraint_dependency_issues' column.
    """
    dependency_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            constraints = json_data.get("ids:constraint", [])
            has_duration = False
            has_start_time = False
            has_gteq = False
            has_lteq = False
            has_after = False
            has_before = False

            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                right_operand = constraint.get("ids:rightOperand", {}).get("@value")

                # Check for DURATION without a start time
                if operator == "https://w3id.org/idsa/code/DURATION":
                    has_duration = True
                    if right_operand is None or not isinstance(right_operand, (int, float)):
                        issues.append("ERROR: DURATION constraint requires a valid numeric duration value.")

                # Check for AFTER without a timestamp
                if operator == "https://w3id.org/idsa/code/AFTER":
                    has_after = True
                    if not isinstance(right_operand, str):  # Ensure a valid timestamp
                        issues.append("ERROR: AFTER constraint requires a valid timestamp.")

                # Check for BEFORE without AFTER
                if operator == "https://w3id.org/idsa/code/BEFORE":
                    has_before = True

                # Track GTEQ and LTEQ constraints
                if operator == "https://w3id.org/idsa/code/GTEQ":
                    has_gteq = True
                if operator == "https://w3id.org/idsa/code/LTEQ":
                    has_lteq = True

            # Dependency checks
            if has_duration and not has_start_time:
                issues.append("ERROR: DURATION constraint exists but no start time is provided.")

            if has_after and not has_before:
                issues.append("WARNING: AFTER is used without a BEFORE constraint. Verify time logic.")

            if has_gteq and not has_lteq:
                issues.append("WARNING: GTEQ exists without LTEQ, making the constraint unbounded.")

        # Store results
        dependency_issues_list.append(issues if issues else ["No dependency issues - Constraints are logically valid."])

    # Add dependency validation results to the DataFrame
    df["constraint_dependency_issues"] = dependency_issues_list
    return df

# ---- STEP 1: Validate Constraint Dependencies ----
df = check_constraint_dependencies(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Dependency Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...


In [41]:
import json
import pandas as pd

# Function to validate that leftOperand and rightOperand match expected types
def check_constraint_data_types(df, column_name="file_content"):
    """
    Validates that the leftOperand and rightOperand in constraints match expected data types.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'constraint_data_type_issues' column.
    """
    data_type_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            constraints = json_data.get("ids:constraint", [])

            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                left_operand = constraint.get("ids:leftOperand", {}).get("@id")
                right_operand = constraint.get("ids:rightOperand", {}).get("@value")

                # Ensure rightOperand exists
                if right_operand is None:
                    issues.append(f"ERROR: RightOperand is missing for constraint '{operator}'.")
                    continue

                # COUNT should always compare numbers
                if left_operand == "https://w3id.org/idsa/code/COUNT":
                    if not isinstance(right_operand, (int, float)):
                        issues.append(f"ERROR: COUNT operand should compare numbers, but got '{type(right_operand).__name__}'.")

                # DURATION should always compare numeric values (representing seconds/minutes)
                if left_operand == "https://w3id.org/idsa/code/DURATION":
                    if not isinstance(right_operand, (int, float)):
                        issues.append(f"ERROR: DURATION operand should compare a numeric value, but got '{type(right_operand).__name__}'.")

                # STRING values should not be used with LTEQ or GTEQ
                if operator in ["https://w3id.org/idsa/code/LTEQ", "https://w3id.org/idsa/code/GTEQ"]:
                    if isinstance(right_operand, str):
                        issues.append(f"ERROR: {operator.split('/')[-1]} should not be used with a STRING value ('{right_operand}').")

                # BOOLEAN values should not be used with mathematical comparisons
                if operator in ["https://w3id.org/idsa/code/LTEQ", "https://w3id.org/idsa/code/GTEQ"] and isinstance(right_operand, bool):
                    issues.append(f"ERROR: {operator.split('/')[-1]} should not be used with a BOOLEAN value ('{right_operand}').")

        # Store results
        data_type_issues_list.append(issues if issues else ["No data type issues - Operands and values match correctly."])

    # Add data type validation results to the DataFrame
    df["constraint_data_type_issues"] = data_type_issues_list
    return df

# ---- STEP 1: Validate Constraint Data Types ----
df = check_constraint_data_types(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Data Type Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ..."
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ..."
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ..."
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...


In [42]:
import json
import pandas as pd

# Define the set of allowed operators
STANDARD_OPERATORS = {
    "https://w3id.org/idsa/code/LTEQ",
    "https://w3id.org/idsa/code/GTEQ",
    "https://w3id.org/idsa/code/EQUALS",
    "https://w3id.org/idsa/code/DURING",
    "https://w3id.org/idsa/code/AFTER",
    "https://w3id.org/idsa/code/BEFORE",
    "https://w3id.org/idsa/code/IN",
    "https://w3id.org/idsa/code/NEQ",
}

# Function to detect non-standard operators
def check_non_standard_operators(df, column_name="file_content"):
    """
    Checks if all ids:operator values match the expected standard operators.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'non_standard_operator_issues' column.
    """
    operator_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            constraints = json_data.get("ids:constraint", [])

            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")

                # Flag any operator that is not in the standard set
                if operator and operator not in STANDARD_OPERATORS:
                    issues.append(f"ERROR: '{operator}' is not a recognized standard operator.")

        # Store results
        operator_issues_list.append(issues if issues else ["No issues - All operators are standard."])

    # Add operator validation results to the DataFrame
    df["non_standard_operator_issues"] = operator_issues_list
    return df

# ---- STEP 1: Validate Operators ----
df = check_non_standard_operators(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Operator Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.]


In [43]:
import json
import pandas as pd
from datetime import datetime

# Function to validate time-based constraints
def check_time_constraint_validity(df, column_name="file_content"):
    """
    Validates logical correctness of time-based constraints (DURING, BEFORE, AFTER).

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'time_constraint_issues' column.
    """
    time_issues_list = []
    current_time = datetime.utcnow()  # Get the current UTC time

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        during_periods = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            constraints = json_data.get("ids:constraint", [])

            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                right_operand = constraint.get("ids:rightOperand", {}).get("@value")

                # Validate BEFORE and AFTER timestamps
                if operator in ["https://w3id.org/idsa/code/BEFORE", "https://w3id.org/idsa/code/AFTER"]:
                    try:
                        constraint_time = datetime.fromisoformat(right_operand.replace("Z", ""))
                        if operator == "https://w3id.org/idsa/code/BEFORE" and constraint_time < current_time:
                            issues.append(f"WARNING: BEFORE constraint refers to past time ('{right_operand}').")
                        if operator == "https://w3id.org/idsa/code/AFTER" and constraint_time > current_time:
                            issues.append(f"WARNING: AFTER constraint is set in the future ('{right_operand}').")
                    except ValueError:
                        issues.append(f"ERROR: Invalid timestamp format in '{operator}' constraint ('{right_operand}').")

                # Track DURING periods
                if operator == "https://w3id.org/idsa/code/DURING":
                    try:
                        start_time, end_time = right_operand.split("/")
                        start_time = datetime.fromisoformat(start_time.replace("Z", ""))
                        end_time = datetime.fromisoformat(end_time.replace("Z", ""))
                        during_periods.append((start_time, end_time))

                        # Check if DURING period has already expired
                        if end_time < current_time:
                            issues.append(f"WARNING: Policy DURING period has expired (ended at '{end_time}').")
                    except ValueError:
                        issues.append(f"ERROR: Invalid DURING period format ('{right_operand}').")

            # Detect overlapping DURING constraints
            during_periods.sort()
            for i in range(1, len(during_periods)):
                if during_periods[i][0] < during_periods[i - 1][1]:  # Overlapping condition
                    issues.append(f"ERROR: Overlapping DURING constraints detected ({during_periods[i - 1]} overlaps {during_periods[i]}).")

        # Store results
        time_issues_list.append(issues if issues else ["No time constraint issues - All constraints are valid."])

    # Add time constraint validation results to the DataFrame
    df["time_constraint_issues"] = time_issues_list
    return df

# ---- STEP 1: Validate Time Constraints ----
df = check_time_constraint_validity(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Time Constraint Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...


In [44]:
import json
import pandas as pd

# Function to check policy scope and coverage
def check_policy_scope_coverage(df, column_name="file_content"):
    """
    Validates policy scope by checking for missing targets, undefined actions, and conflicting scope.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_scope_issues' column.
    """
    scope_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        allowed_actions = set()
        restricted_actions = set()

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            target = json_data.get("ids:target", None)
            actions = json_data.get("ids:action", [])

            # Check for missing target
            if target is None:
                issues.append("ERROR: Missing 'ids:target' field. The policy does not specify what data it applies to.")

            # Check for undefined or invalid actions
            for action in actions:
                action_id = action.get("@id", None)
                if not action_id:
                    issues.append("ERROR: Policy contains an action without an '@id' field.")
                else:
                    # Categorize actions into allowed and restricted
                    if "https://w3id.org/idsa/code/ALLOW" in action_id:
                        allowed_actions.add(action_id.replace("https://w3id.org/idsa/code/ALLOW_", ""))
                    elif "https://w3id.org/idsa/code/DENY" in action_id:
                        restricted_actions.add(action_id.replace("https://w3id.org/idsa/code/DENY_", ""))

            # Detect conflicting scope
            conflicts = allowed_actions.intersection(restricted_actions)
            if conflicts:
                issues.append(f"ERROR: Conflicting scope - '{conflicts}' is both allowed and denied in the same policy.")

        # Store results
        scope_issues_list.append(issues if issues else ["No scope issues - Policy defines a valid scope."])

    # Add scope validation results to the DataFrame
    df["policy_scope_issues"] = scope_issues_list
    return df

# ---- STEP 1: Validate Policy Scope ----
df = check_policy_scope_coverage(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Scope Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...


In [45]:
import json
import pandas as pd

# Function to detect redundant or conflicting policies
def check_redundant_policies(df, column_name="file_content"):
    """
    Detects duplicate, conflicting, or redundant policies applying to the same ids:target.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'redundant_policy_issues' column.
    """
    policy_issues_list = []
    policy_map = {}  # Dictionary to track actions and constraints per target

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        target = json_data.get("ids:target", None)
        actions = set()
        constraints = set()

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Collect actions
            if "ids:action" in json_data:
                for action in json_data["ids:action"]:
                    action_id = action.get("@id", None)
                    if action_id:
                        actions.add(action_id)

            # Collect constraints
            if "ids:constraint" in json_data:
                for constraint in json_data["ids:constraint"]:
                    operator = constraint.get("ids:operator", {}).get("@id")
                    left_operand = constraint.get("ids:leftOperand", {}).get("@id")
                    right_operand = constraint.get("ids:rightOperand", {}).get("@value")

                    if operator and left_operand and right_operand:
                        constraints.add((left_operand, operator, right_operand))

            # Detect identical policies
            policy_signature = (target, frozenset(actions), frozenset(constraints))
            if policy_signature in policy_map:
                issues.append("ERROR: This policy is a duplicate of another existing policy.")
            else:
                policy_map[policy_signature] = policy_id

            # Detect conflicting policies on the same target
            if target in policy_map:
                existing_actions, existing_constraints = policy_map[target]

                # Check for action conflicts
                conflicting_actions = actions.intersection(existing_actions)
                if conflicting_actions:
                    issues.append(f"ERROR: Conflicting actions detected on target '{target}' - {conflicting_actions}")

                # Check for constraint conflicts
                for left_operand, operator, right_operand in constraints:
                    for existing_left, existing_op, existing_right in existing_constraints:
                        if left_operand == existing_left and operator != existing_op:
                            issues.append(f"ERROR: Conflicting constraints on target '{target}' - {left_operand} has both '{operator}' and '{existing_op}'.")

        # Store results
        policy_issues_list.append(issues if issues else ["No redundancy or conflicts detected."])

    # Add redundancy validation results to the DataFrame
    df["redundant_policy_issues"] = policy_issues_list
    return df

# ---- STEP 1: Validate Redundant Policies ----
df = check_redundant_policies(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Policy Redundancies ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues,redundant_policy_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.]


In [46]:
import json
import pandas as pd

# Function to check required fields for specific actions
def check_required_fields(df, column_name="file_content"):
    """
    Validates that specific actions include the required fields.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'required_field_issues' column.
    """
    required_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        has_transfer = False
        has_write = False
        has_duration = False
        has_after = False
        has_before = False

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Check for actions
            actions = json_data.get("ids:action", [])
            for action in actions:
                action_id = action.get("@id", None)
                if action_id == "https://w3id.org/idsa/code/TRANSFER":
                    has_transfer = True
                if action_id == "https://w3id.org/idsa/code/WRITE":
                    has_write = True

            # Check for constraints
            constraints = json_data.get("ids:constraint", [])
            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                if operator == "https://w3id.org/idsa/code/DURATION":
                    has_duration = True
                if operator == "https://w3id.org/idsa/code/AFTER":
                    has_after = True
                if operator == "https://w3id.org/idsa/code/BEFORE":
                    has_before = True

            # Validate required fields
            if has_transfer and "ids:recipient" not in json_data:
                issues.append("ERROR: TRANSFER action requires 'ids:recipient' field.")

            if has_write and "ids:provenance" not in json_data:
                issues.append("ERROR: WRITE action requires 'ids:provenance' field (authentication details).")

            if has_duration and not (has_after or has_before):
                issues.append("ERROR: DURATION constraint exists but has no AFTER or BEFORE constraint.")

        # Store results
        required_issues_list.append(issues if issues else ["No issues - All required fields are present."])

    # Add required field validation results to the DataFrame
    df["required_field_issues"] = required_issues_list
    return df

# ---- STEP 1: Validate Required Fields ----
df = check_required_fields(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Required Field Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues,redundant_policy_issues,required_field_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.]


In [47]:
import json
import pandas as pd
from collections import defaultdict

# Function to detect contradictions across multiple policies
def check_policy_contradictions(df, column_name="file_content"):
    """
    Detects conflicting policies that allow and deny actions on the same ids:target.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_contradiction_issues' column.
    """
    contradiction_issues_list = []
    policy_map = defaultdict(lambda: {"allow": set(), "deny": set(), "during": []})

    # Step 1: Collect all actions and constraints per target
    for index, row in df.iterrows():
        json_data = row[column_name]
        target = json_data.get("ids:target", None)
        if target is None:
            continue

        actions = json_data.get("ids:action", [])
        constraints = json_data.get("ids:constraint", [])

        for action in actions:
            action_id = action.get("@id", None)
            if action_id:
                if "ALLOW" in action_id:
                    policy_map[target]["allow"].add(action_id.replace("https://w3id.org/idsa/code/ALLOW_", ""))
                elif "DENY" in action_id:
                    policy_map[target]["deny"].add(action_id.replace("https://w3id.org/idsa/code/DENY_", ""))

        for constraint in constraints:
            operator = constraint.get("ids:operator", {}).get("@id")
            right_operand = constraint.get("ids:rightOperand", {}).get("@value")

            if operator == "https://w3id.org/idsa/code/DURING":
                try:
                    start_time, end_time = right_operand.split("/")
                    policy_map[target]["during"].append((start_time, end_time))
                except ValueError:
                    continue  # Skip malformed DURING constraints

    # Step 2: Detect contradictions
    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        target = json_data.get("ids:target", None)

        if target and target in policy_map:
            allowed_actions = policy_map[target]["allow"]
            denied_actions = policy_map[target]["deny"]
            during_periods = policy_map[target]["during"]

            # Check for action contradictions
            conflicts = allowed_actions.intersection(denied_actions)
            if conflicts:
                issues.append(f"ERROR: Conflicting actions on target '{target}' - {conflicts} is both allowed and denied.")

            # Check for DURING contradictions (one policy allows, another denies all)
            if during_periods:
                if not allowed_actions and denied_actions:
                    issues.append(f"ERROR: DURING access exists but all actions are denied for '{target}'.")

        # Store results
        contradiction_issues_list.append(issues if issues else ["No contradictions detected across policies."])

    # Add contradiction validation results to the DataFrame
    df["policy_contradiction_issues"] = contradiction_issues_list
    return df

# ---- STEP 1: Validate Policy Contradictions ----
df = check_policy_contradictions(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Contradictions ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues,redundant_policy_issues,required_field_issues,policy_contradiction_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.]


In [48]:
import json
import pandas as pd

# Function to check if policies can be properly enforced
def check_policy_enforcement_feasibility(df, column_name="file_content"):
    """
    Validates whether a policy is technically enforceable.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_enforcement_issues' column.
    """
    enforcement_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        has_transfer = False
        has_modify = False
        has_duration = False
        missing_time_unit = False

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Check for actions
            actions = json_data.get("ids:action", [])
            for action in actions:
                action_id = action.get("@id", None)
                if action_id == "https://w3id.org/idsa/code/TRANSFER":
                    has_transfer = True
                if action_id == "https://w3id.org/idsa/code/MODIFY":
                    has_modify = True

            # Check for constraints
            constraints = json_data.get("ids:constraint", [])
            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                right_operand = constraint.get("ids:rightOperand", {}).get("@value")

                if operator == "https://w3id.org/idsa/code/DURATION":
                    has_duration = True
                    # Check if the duration value has a valid time unit
                    if isinstance(right_operand, str) and not any(unit in right_operand.lower() for unit in ["s", "sec", "min", "hour", "day"]):
                        missing_time_unit = True

            # Validate enforcement feasibility
            if has_transfer and "ids:recipient" not in json_data:
                issues.append("ERROR: TRANSFER action requires 'ids:recipient' field for enforcement.")

            if has_modify and "ids:compliance" not in json_data:
                issues.append("WARNING: MODIFY action exists but no 'ids:compliance' enforcement mechanism is defined.")

            if has_duration and missing_time_unit:
                issues.append("ERROR: DURATION constraint exists but does not specify a valid time unit (e.g., sec, min, day).")

        # Store results
        enforcement_issues_list.append(issues if issues else ["No enforcement issues - Policy is feasible."])

    # Add enforcement feasibility validation results to the DataFrame
    df["policy_enforcement_issues"] = enforcement_issues_list
    return df

# ---- STEP 1: Validate Policy Enforcement Feasibility ----
df = check_policy_enforcement_feasibility(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Enforcement Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues,redundant_policy_issues,required_field_issues,policy_contradiction_issues,policy_enforcement_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.]


In [49]:
import json
import pandas as pd

# Function to check Role-Based Access Control (RBAC)
def check_rbac_validity(df, column_name="file_content"):
    """
    Validates RBAC by ensuring policies define roles and do not assign conflicting roles.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'rbac_issues' column.
    """
    rbac_issues_list = []
    role_conflicts = {
        "Admin": {"Guest", "Anonymous"},
        "User": {"Guest"},
        "Manager": {"Intern"},
        # Define more conflicting roles if needed
    }

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        assigned_roles = set()

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            roles = json_data.get("ids:role", [])

            # Check if roles are defined
            if not roles:
                issues.append("ERROR: Missing 'ids:role' field. The policy does not specify who it applies to.")
            else:
                for role in roles:
                    role_value = role.get("@value", None)
                    if role_value:
                        assigned_roles.add(role_value)

                # Check for conflicting roles in the same policy
                for role in assigned_roles:
                    if role in role_conflicts:
                        conflict_set = role_conflicts[role]
                        conflicting_roles = assigned_roles.intersection(conflict_set)
                        if conflicting_roles:
                            issues.append(f"ERROR: Conflicting roles detected in policy - '{role}' and '{conflicting_roles}'.")

        # Store results
        rbac_issues_list.append(issues if issues else ["No RBAC issues - Policy roles are valid."])

    # Add RBAC validation results to the DataFrame
    df["rbac_issues"] = rbac_issues_list
    return df

# ---- STEP 1: Validate Role-Based Access Control (RBAC) ----
df = check_rbac_validity(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with RBAC Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues,redundant_policy_issues,required_field_issues,policy_contradiction_issues,policy_enforcement_issues,rbac_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...


In [50]:
import json
import pandas as pd

# Function to check if policies are complete
def check_policy_completeness(df, column_name="file_content"):
    """
    Ensures policies have all required fields for completeness.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_completeness_issues' column.
    """
    completeness_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        has_action = False
        has_enforcement_date = False

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Check for provenance (who created the policy)
            if "ids:provenance" not in json_data:
                issues.append("ERROR: Missing 'ids:provenance' field (policy creator not defined).")

            # Check if actions are defined
            actions = json_data.get("ids:action", [])
            if not actions:
                issues.append("ERROR: Missing 'ids:action' field (no defined actions).")
            else:
                has_action = True

            # Check for enforcement dates (BEFORE or DURING)
            constraints = json_data.get("ids:constraint", [])
            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                if operator in ["https://w3id.org/idsa/code/BEFORE", "https://w3id.org/idsa/code/DURING"]:
                    has_enforcement_date = True
                    break

            if not has_enforcement_date:
                issues.append("ERROR: No enforcement date (BEFORE or DURING) is defined.")

        # Store results
        completeness_issues_list.append(issues if issues else ["No completeness issues - Policy is well-defined."])

    # Add completeness validation results to the DataFrame
    df["policy_completeness_issues"] = completeness_issues_list
    return df

# ---- STEP 1: Validate Policy Completeness ----
df = check_policy_completeness(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Completeness Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues,redundant_policy_issues,required_field_issues,policy_contradiction_issues,policy_enforcement_issues,rbac_issues,policy_completeness_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...


In [51]:
import json
import pandas as pd

# Function to check policy compliance with regulations
def check_policy_compliance(df, column_name="file_content"):
    """
    Validates whether policies comply with GDPR, CCPA, or internal data governance rules.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_compliance_issues' column.
    """
    compliance_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        has_transfer = False
        has_modify = False
        has_duration = False
        requires_consent = False

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Check for actions
            actions = json_data.get("ids:action", [])
            for action in actions:
                action_id = action.get("@id", None)
                if action_id == "https://w3id.org/idsa/code/TRANSFER":
                    has_transfer = True
                if action_id == "https://w3id.org/idsa/code/MODIFY":
                    has_modify = True

            # Check for constraints
            constraints = json_data.get("ids:constraint", [])
            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                left_operand = constraint.get("ids:leftOperand", {}).get("@id")

                if operator == "https://w3id.org/idsa/code/DURATION":
                    has_duration = True

                if left_operand == "https://w3id.org/idsa/code/CONSENT":
                    requires_consent = True

            # Validate GDPR/CCPA compliance
            if has_transfer and not requires_consent:
                issues.append("ERROR: TRANSFER action detected without required CONSENT.")

            if has_modify and "ids:sensitiveData" in json_data:
                issues.append("WARNING: MODIFY action applies to sensitive data, review compliance.")

            if has_transfer and has_duration:
                issues.append("ERROR: Data retention (DURATION) should not be enforced for TRANSFER policies.")

        # Store results
        compliance_issues_list.append(issues if issues else ["No compliance issues - Policy aligns with regulations."])

    # Add compliance validation results to the DataFrame
    df["policy_compliance_issues"] = compliance_issues_list
    return df

# ---- STEP 1: Validate Policy Compliance with Regulations ----
df = check_policy_compliance(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Compliance Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,non_standard_operator_issues,time_constraint_issues,policy_scope_issues,redundant_policy_issues,required_field_issues,policy_contradiction_issues,policy_enforcement_issues,rbac_issues,policy_completeness_issues,policy_compliance_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,[No issues - All operators are standard.],[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...


In [52]:
import json
import pandas as pd

# Function to detect vague or unclear policies
def check_policy_vagueness(df, column_name="file_content"):
    """
    Detects policies that are too vague to be enforced properly.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_vagueness_issues' column.
    """
    vagueness_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []
        has_description_only = True
        has_action = False
        has_duration = False
        missing_time_value = False

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Check if the policy only contains a description with no enforceable rules
            if "ids:description" in json_data and len(json_data.keys()) == 2:  # Only @context and ids:description exist
                issues.append("ERROR: Policy contains only a description with no enforceable constraints or actions.")

            # Check for actions
            actions = json_data.get("ids:action", [])
            if actions:
                has_action = True
                for action in actions:
                    action_id = action.get("@id", None)
                    if action_id and "ALLOW_EVERYTHING" in action_id:
                        issues.append("WARNING: Policy allows all actions without restriction ('ALLOW EVERYTHING').")

            # Check for constraints
            constraints = json_data.get("ids:constraint", [])
            for constraint in constraints:
                operator = constraint.get("ids:operator", {}).get("@id")
                right_operand = constraint.get("ids:rightOperand", {}).get("@value")

                if operator == "https://w3id.org/idsa/code/DURATION":
                    has_duration = True
                    # Check if DURATION has a valid numeric time value
                    if not isinstance(right_operand, (int, float)):
                        missing_time_value = True

            if has_duration and missing_time_value:
                issues.append("ERROR: Policy mentions DURATION but does not specify a time limit.")

            # Ensure at least one enforceable rule exists
            if has_description_only and not has_action and not has_duration:
                issues.append("ERROR: Policy is vague and does not define any clear enforceable rules.")

        # Store results
        vagueness_issues_list.append(issues if issues else ["No vagueness issues - Policy is well-defined."])

    # Add vagueness validation results to the DataFrame
    df["policy_vagueness_issues"] = vagueness_issues_list
    return df

# ---- STEP 1: Validate Policy Vagueness ----
df = check_policy_vagueness(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Vagueness Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues,json_schema_issues,constraint_dependency_issues,constraint_data_type_issues,...,time_constraint_issues,policy_scope_issues,redundant_policy_issues,required_field_issues,policy_contradiction_issues,policy_enforcement_issues,rbac_issues,policy_completeness_issues,policy_compliance_issues,policy_vagueness_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",...,[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...,[No vagueness issues - Policy is well-defined.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",...,[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...,[No vagueness issues - Policy is well-defined.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[Schema Error: '@id' is a required property],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,...,[ERROR: Invalid DURING period format ('2024-06...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...,[No vagueness issues - Policy is well-defined.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,"[ERROR: COUNT operand should compare numbers, ...",...,[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[ERROR: This policy is a duplicate of another ...,[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...,[No vagueness issues - Policy is well-defined.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.],[No schema issues - JSON structure is valid.],[No dependency issues - Constraints are logica...,[No data type issues - Operands and values mat...,...,[No time constraint issues - All constraints a...,[ERROR: Missing 'ids:target' field. The policy...,[No redundancy or conflicts detected.],[No issues - All required fields are present.],[No contradictions detected across policies.],[No enforcement issues - Policy is feasible.],[ERROR: Missing 'ids:role' field. The policy d...,[ERROR: Missing 'ids:provenance' field (policy...,[No compliance issues - Policy aligns with reg...,[No vagueness issues - Policy is well-defined.]
