<a href="https://colab.research.google.com/github/RyanKelvinFord/Policy/blob/main/Conference_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: Lets load the json files into memeory and put them into a dataframe with columns file name and then file content

import json
import pandas as pd
import os
import glob

def json_to_dataframe(directory):
    """
    Loads JSON files from a directory into a pandas DataFrame.

    Args:
        directory: The path to the directory containing the JSON files.

    Returns:
        A pandas DataFrame with 'file_name' and 'file_content' columns.
        Returns an empty DataFrame if no JSON files are found or if an error occurs.
    """

    json_files = glob.glob(os.path.join(directory, '*.json'))
    data = []

    for file in json_files:
        try:
            with open(file, 'r') as f:
                content = json.load(f)
                data.append({'file_name': os.path.basename(file), 'file_content': content})
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {file}: {e}")
        except Exception as e:
            print(f"An error occurred while processing {file}: {e}")


    if data:
      return pd.DataFrame(data)
    else:
      return pd.DataFrame(columns=['file_name', 'file_content'])


df = json_to_dataframe('/content/')
df.head()

Unnamed: 0,file_name,file_content
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...


In [None]:
import pandas as pd
import json

def process_json_column(df, column_name):
    """
    Convert a specified column in a dataframe from string to valid JSON objects.

    :param df: Pandas DataFrame containing the column with JSON-like strings.
    :param column_name: Name of the column to be processed.
    :return: DataFrame with the processed column.
    """
    def try_parse_json(value):
        try:
            return json.loads(value.replace("'", '"')) if isinstance(value, str) else value
        except json.JSONDecodeError:
            return None  # Return None for invalid JSON values

    df[column_name] = df[column_name].apply(try_parse_json)
    return df

df = process_json_column(df, 'file_content')
df.head()

Unnamed: 0,file_name,file_content
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...


In [None]:
import pandas as pd
import json

def validate_json_column(df, column_name):
    """
    Validate if the JSON data in the specified column is correctly formatted.

    :param df: Pandas DataFrame containing the column with JSON data.
    :param column_name: Name of the column to be validated.
    :return: DataFrame with an additional column 'is_valid_json' indicating validity.
    """
    def is_valid_json(value):
        if isinstance(value, dict):
            return True  # Already a valid JSON object
        elif isinstance(value, str):
            try:
                json.loads(value)  # Attempt to parse
                return True
            except json.JSONDecodeError:
                return False
        return False  # Not a string or dictionary

    df["is_valid_json"] = df[column_name].apply(is_valid_json)
    return df

df = validate_json_column(df, 'file_content')
df.head()

Unnamed: 0,file_name,file_content,is_valid_json
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True


In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer, util

# Load SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_match_descriptions(df, file_name_col, content_col):
    """
    Checks the semantic similarity between the 'file_name' and the '@value' text in 'ids:description'.

    :param df: Pandas DataFrame containing the JSON data.
    :param file_name_col: Name of the column that contains file names.
    :param content_col: Name of the column that contains JSON data.
    :return: DataFrame with an additional 'semantic_similarity_description' column.
    """
    similarities = []

    for index, row in df.iterrows():
        file_name = row[file_name_col]
        json_data = row[content_col]
        best_score = 0  # Default similarity score

        # Ensure file name and JSON content are valid
        if isinstance(file_name, str) and isinstance(json_data, dict):
            try:
                # Extract 'ids:description' if it exists
                if "ids:description" in json_data and isinstance(json_data["ids:description"], list):
                    for desc in json_data["ids:description"]:
                        if "@value" in desc:
                            description_text = desc["@value"]

                            # Encode sentences into embeddings
                            embeddings = model.encode([file_name, description_text], convert_to_tensor=True)

                            # Compute cosine similarity
                            similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

                            best_score = max(best_score, similarity)  # Keep the highest similarity score
            except Exception as e:
                print(f"Error processing row {index}: {e}")

        similarities.append(best_score)

    # Add similarity scores to DataFrame with the new column name
    df["semantic_similarity_description"] = similarities
    return df

# Run semantic matching
df = semantic_match_descriptions(df, "file_name", "file_content")

# Display results in Google Colab
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698


In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer, util

# Load SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_match_titles(df, file_name_col, content_col):
    """
    Checks the semantic similarity between the 'file_name' and the '@value' text in 'ids:title'.

    :param df: Pandas DataFrame containing the JSON data.
    :param file_name_col: Name of the column that contains file names.
    :param content_col: Name of the column that contains JSON data.
    :return: DataFrame with an additional 'semantic_similarity_title' column.
    """
    similarities = []

    for index, row in df.iterrows():
        file_name = row[file_name_col]
        json_data = row[content_col]
        best_score = 0  # Default similarity score

        # Ensure file name and JSON content are valid
        if isinstance(file_name, str) and isinstance(json_data, dict):
            try:
                # Extract 'ids:title' if it exists
                if "ids:title" in json_data and isinstance(json_data["ids:title"], list):
                    for title in json_data["ids:title"]:
                        if "@value" in title:
                            title_text = title["@value"]

                            # Encode sentences into embeddings
                            embeddings = model.encode([file_name, title_text], convert_to_tensor=True)

                            # Compute cosine similarity
                            similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

                            best_score = max(best_score, similarity)  # Keep the highest similarity score
            except Exception as e:
                print(f"Error processing row {index}: {e}")

        similarities.append(best_score)

    # Add similarity scores to DataFrame with the new column name
    df["semantic_similarity_title"] = similarities
    return df

# Run semantic matching for titles
df = semantic_match_titles(df, "file_name", "file_content")

# Display results in Google Colab
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907


In [None]:
!pip install rdflib



In [None]:
import re
import json
import pandas as pd
from rdflib import Graph, URIRef

# Function to load ontology with auto-detected format
def load_ontology(ontology_path):
    """
    Load the ontology and extract valid individuals (URIs).

    :param ontology_path: Path to the ontology RDF file.
    :return: Set of ontology individuals (URIs).
    """
    g = Graph()
    try:
        g.parse(ontology_path)  # Auto-detect format
        ontology_individuals = set(g.subjects())
        print(f"Ontology loaded successfully! Found {len(ontology_individuals)} valid URIs.")
        return ontology_individuals
    except Exception as e:
        print(f"Error loading ontology: {e}")
        return set()  # Return an empty set if parsing fails

# Define ignored patterns (like auto-generated URIs)
ignored_value_patterns = [
    r"https://w3id.org/idsa/autogen/.*"  # Regex to match autogenerated values
]

# Function to validate JSON policies against the ontology
def validate_json_against_ontology(df, ontology_individuals, column_name="file_content"):
    """
    Validate if ids:action and ids:constraint fields in JSON match the ontology.

    :param df: Pandas DataFrame containing JSON data.
    :param ontology_individuals: Set of valid URIs from the ontology.
    :param column_name: Column in the DataFrame with JSON content.
    :return: DataFrame with an additional column 'ontology_validation_errors'.
    """
    validation_errors_list = []

    def is_ignored(value):
        """Check if a value matches ignored patterns."""
        return any(re.match(pattern, value) for pattern in ignored_value_patterns)

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        errors = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Validate ids:action
            if "ids:action" in json_data:
                for action in json_data["ids:action"]:
                    if "@id" in action:
                        uri = URIRef(action["@id"])
                        if not is_ignored(action["@id"]) and uri not in ontology_individuals:
                            errors.append(f"ERROR: Action '{action['@id']}' not found in ontology.")

            # Validate ids:constraint
            if "ids:constraint" in json_data:
                for i, constraint in enumerate(json_data["ids:constraint"]):
                    constraint_path = f"ids:constraint[{i}]"

                    # Validate ids:operator
                    if "ids:operator" in constraint and "@id" in constraint["ids:operator"]:
                        operator_uri = URIRef(constraint["ids:operator"]["@id"])
                        if not is_ignored(constraint["ids:operator"]["@id"]) and operator_uri not in ontology_individuals:
                            errors.append(f"ERROR: Operator '{constraint['ids:operator']['@id']}' not found in ontology.")

                    # Validate ids:leftOperand
                    if "ids:leftOperand" in constraint and "@id" in constraint["ids:leftOperand"]:
                        left_operand_uri = URIRef(constraint["ids:leftOperand"]["@id"])
                        if not is_ignored(constraint["ids:leftOperand"]["@id"]) and left_operand_uri not in ontology_individuals:
                            errors.append(f"ERROR: LeftOperand '{constraint['ids:leftOperand']['@id']}' not found in ontology.")

                    # Validate ids:rightOperand
                    if "ids:rightOperand" in constraint and "@id" in constraint["ids:rightOperand"]:
                        right_operand_uri = URIRef(constraint["ids:rightOperand"]["@id"])
                        if not is_ignored(constraint["ids:rightOperand"]["@id"]) and right_operand_uri not in ontology_individuals:
                            errors.append(f"ERROR: RightOperand '{constraint['ids:rightOperand']['@id']}' not found in ontology.")

        # Store results
        validation_errors_list.append(errors if errors else ["No errors - All values found in ontology."])

    # Add validation results as a new column in the existing DataFrame
    df["ontology_validation_errors"] = validation_errors_list
    return df

# ---- STEP 1: Load the Ontology ----
ontology_file_path = "/content/ontology.rdf"  # Adjust with the actual path
ontology_individuals = load_ontology(ontology_file_path)

# ---- STEP 2: Validate JSON Policies Against Ontology ----
df = validate_json_against_ontology(df, ontology_individuals, "file_content")

# ---- STEP 3: Display Updated DataFrame with Validation Column ----
from IPython.display import display
display(df)

Ontology loaded successfully! Found 1170 valid URIs.


Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.]
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.]


In [27]:
import re
import json
import pandas as pd
from rdflib import Graph, URIRef
from datetime import datetime

# Function to evaluate policy logic
def evaluate_policy_logic(df, column_name="file_content"):
    """
    Evaluates the logical correctness of JSON policies in the DataFrame.

    :param df: Pandas DataFrame containing JSON policies.
    :param column_name: Column name where JSON policies are stored.
    :return: DataFrame with an additional 'policy_logic_issues' column.
    """
    logic_issues_list = []

    for index, row in df.iterrows():
        policy_id = row["file_name"]
        json_data = row[column_name]
        issues = []

        if isinstance(json_data, dict):  # Ensure JSON is parsed
            # Validate constraints exist if required
            if "ids:constraint" in json_data:
                constraints = json_data["ids:constraint"]
                time_constraints = []
                numeric_constraints = {}

                for constraint in constraints:
                    # Validate `ids:operator`
                    if "ids:operator" in constraint and "@id" in constraint["ids:operator"]:
                        operator = constraint["ids:operator"]["@id"]

                        # Ensure a leftOperand exists
                        if "ids:leftOperand" not in constraint or "@id" not in constraint["ids:leftOperand"]:
                            issues.append(f"ERROR: Constraint '{operator}' is missing leftOperand.")

                        # Ensure a rightOperand exists
                        if "ids:rightOperand" not in constraint or "@value" not in constraint["ids:rightOperand"]:
                            issues.append(f"ERROR: Constraint '{operator}' is missing rightOperand.")

                        left_operand = constraint.get("ids:leftOperand", {}).get("@id", None)
                        right_operand = constraint.get("ids:rightOperand", {}).get("@value", None)

                        # Detect conflicting time constraints
                        if operator in ["https://w3id.org/idsa/code/DURING", "https://w3id.org/idsa/code/BEFORE", "https://w3id.org/idsa/code/AFTER"]:
                            try:
                                time_constraints.append(datetime.fromisoformat(right_operand.replace("Z", "")))
                            except ValueError:
                                issues.append(f"ERROR: Invalid datetime format in constraint '{operator}'.")

                        # Detect invalid numeric constraints
                        if operator in ["https://w3id.org/idsa/code/LTEQ", "https://w3id.org/idsa/code/GTEQ"]:
                            try:
                                numeric_constraints[left_operand] = float(right_operand)
                            except ValueError:
                                issues.append(f"ERROR: Invalid numeric value in constraint '{operator}'.")

                # Check for conflicting constraints
                if len(time_constraints) >= 2:
                    if any(t1 > t2 for t1, t2 in zip(time_constraints, time_constraints[1:])):
                        issues.append("ERROR: Conflicting time constraints detected.")

                # Check for redundant constraints
                if len(numeric_constraints) != len(set(numeric_constraints.values())):
                    issues.append("ERROR: Redundant numeric constraints detected.")

            # Validate that actions make sense
            if "ids:action" in json_data:
                for action in json_data["ids:action"]:
                    if "@id" in action:
                        action_type = action["@id"]
                        if action_type not in [
                            "https://w3id.org/idsa/code/USE",
                            "https://w3id.org/idsa/code/READ",
                            "https://w3id.org/idsa/code/WRITE"
                        ]:
                            issues.append(f"ERROR: Unrecognized action type '{action_type}'.")

        # Store results
        logic_issues_list.append(issues if issues else ["No issues - Policy logic is valid."])

    # Add logic validation results to the DataFrame
    df["policy_logic_issues"] = logic_issues_list
    return df

# ---- STEP 1: Evaluate Policy Logic ----
df = evaluate_policy_logic(df, "file_content")

# ---- STEP 2: Display Updated DataFrame with Logic Issues ----
from IPython.display import display
display(df)

Unnamed: 0,file_name,file_content,is_valid_json,semantic_similarity_description,semantic_similarity_title,ontology_validation_errors,policy_logic_issues
0,Data access is limited to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.725082,0.543967,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
1,Permission to access data a limited number of ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.770015,0.491442,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
2,Data can be accessed from 1st January 2024 to ...,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.745148,0.429334,[No errors - All values found in ontology.],[ERROR: Constraint 'https://w3id.org/idsa/code...
3,Data can be accessed up to 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.510487,0.357892,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
4,Data can be accessed 5 times.json,{'@context': {'xsd': 'http://www.w3.org/2001/X...,True,0.560698,0.331907,[No errors - All values found in ontology.],[No issues - Policy logic is valid.]
