In [None]:
import os
import pandas as pd
import zipfile
import re

# === Step 1: Unzip ontology files ===
zip_path = "/content/family_1hop_no_tbox.zip"
extract_dir = "/content/family_1hop_no_tbox/family_1hop_no_tbox"

# The archive is only unpacked when the target directory is not there yet,
# so re-running the cell does not extract it a second time.
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=extract_dir)

# === Step 2: Load the CSV ===
csv_path = "/content/SPARQL_questions_sampling (3).csv"
df = pd.read_csv(csv_path)

# Output directory for the abstracted ontology files; created if absent.
context_dir = "/content/Context"
os.makedirs(context_dir, exist_ok=True)

# === Helper Functions ===

def extract_individual_names(ontology_text, properties):
    """
    Collect the individuals mentioned in a Turtle ontology.

    Two passes are made over the text, in this order:
      1) triples of the form ``ns1:name1 ns1:<prop> ns1:name2 .`` whose
         predicate is one of ``properties`` (subject and object are recorded);
      2) ``ns1:name a owl:NamedIndividual ; ... .`` blocks: the block subject
         is recorded, then every ``ns1:`` identifier inside the block body
         that is NOT one of ``properties``.

    Predicate names are excluded in pass 2 so that a property such as
    ``hasChild`` is never mistaken for an individual. Any other ``ns1:``
    identifier in a block body is still treated as an individual.

    Args:
        ontology_text: Turtle source as a single string.
        properties: iterable of property local names (no ``ns1:`` prefix).

    Returns:
        A list of unique names in order of first appearance (all names from
        pass 1 come before names first seen in pass 2).
    """
    property_names = list(properties)
    known_properties = set(property_names)
    ident = r'[a-zA-Z0-9_]+'

    ordered_names = []
    seen = set()

    def _remember(name):
        # Only the first sighting is kept, which preserves discovery order.
        if name not in seen:
            seen.add(name)
            ordered_names.append(name)

    # 1) Triples. Without any property there is no predicate to match on.
    if property_names:
        # Escape each name so it is matched literally, not as a regex fragment.
        predicate_alternatives = '|'.join(re.escape(p) for p in property_names)
        triple_pattern = re.compile(
            r'ns1:(' + ident + r')\s+ns1:(?:' + predicate_alternatives
            + r')\s+ns1:(' + ident + r')\s*\.'
        )
        for triple in triple_pattern.finditer(ontology_text):
            _remember(triple.group(1))
            _remember(triple.group(2))

    # 2) NamedIndividual blocks. The body runs (lazily) up to the first "."
    #    that is followed by whitespace or by the end of the text, so a final
    #    block without a trailing newline is still recognised.
    block_pattern = re.compile(
        r'ns1:(' + ident + r')\s+a\s+owl:NamedIndividual\s*;\s*(.*?)(?=\.(?:\s|\Z))',
        re.DOTALL,
    )
    inner_name_pattern = re.compile(r'ns1:(' + ident + r')')

    for block in block_pattern.finditer(ontology_text):
        _remember(block.group(1))

        # Body looks like: ns1:prop ns1:name ; ns1:prop ns1:name ...
        # Skip the predicates; keep everything else.
        for candidate in inner_name_pattern.findall(block.group(2)):
            if candidate not in known_properties:
                _remember(candidate)

    return ordered_names

def replace_in_sparql(sparql_text, mapping):
    """
    Rewrite URI fragments in a SPARQL query according to ``mapping``.

    Every ``#old_name`` that appears as a whole word (``\\b`` on both sides of
    the name) becomes ``#new_id``. All names are substituted in a single pass,
    so text produced by one replacement is never rewritten by another entry,
    and ``new_id`` is inserted literally (backslashes are not interpreted).

    Args:
        sparql_text: the query string, with URIs such as ``<...#name>``.
        mapping: dict of ``old_name -> new_id``.

    Returns:
        The query with mapped fragments replaced; unchanged if ``mapping`` is
        empty.
    """
    if not mapping:
        return sparql_text

    # Longest names first so a name is never shadowed by one of its prefixes.
    alternatives = "|".join(
        re.escape(old_name) for old_name in sorted(mapping, key=len, reverse=True)
    )
    fragment_pattern = re.compile(rf"#\b({alternatives})\b")

    # A callback (rather than a template string) keeps the new id verbatim.
    return fragment_pattern.sub(lambda m: "#" + mapping[m.group(1)], sparql_text)

def replace_all_ns1_names(text, mapping):
    """
    Replace ns1:<old> → ns1:<new> throughout the ontology text.

    Only whole identifiers are matched (``\\b`` before ``ns1:`` and after the
    name). All names are substituted in a single pass, so an identifier that
    was just written as a replacement is never remapped by another entry, and
    the new name is inserted literally.

    Args:
        text: ontology source text.
        mapping: dict of ``old_name -> new_name`` (no ``ns1:`` prefix).

    Returns:
        The text with mapped identifiers replaced; unchanged if ``mapping``
        is empty.
    """
    if not mapping:
        return text

    # Longest names first so a name is never shadowed by one of its prefixes.
    alternatives = "|".join(
        re.escape(old) for old in sorted(mapping, key=len, reverse=True)
    )
    name_pattern = re.compile(rf"\bns1:({alternatives})\b")

    # A callback (rather than a template string) keeps the new name verbatim.
    return name_pattern.sub(lambda m: "ns1:" + mapping[m.group(1)], text)

# === Step 3: Process each Root Entity separately ===

# Property local names (without the "ns1:" prefix) that are passed to
# extract_individual_names as the set of known predicates.
properties = [
    "hasBrother", "hasChild", "hasDaughter", "hasSon", "hasPartner", "hasFemalePartner",
    "hasMalePartner", "hasRelation", "hasAncestor", "hasParent", "hasFather", "hasMother",
    "isBloodrelationOf", "isSiblingOf", "isBrotherOf", "isSisterOf", "hasSex", "hasSister",
    "hasSpouse", "hasHusband", "hasWife", "isAncestorOf", "isChildOf", "isDaughterOf",
    "isSonOf", "isFatherOf", "isFemalePartnerln", "isHusbandOf", "isMalePartnerln",
    "isMotherOf", "isParentOf", "isPartnerln", "isSpouseOf", "isUncleOf", "isWifeOf",
    # NOTE(review): the three "...Partnerln" entries above end in a lowercase
    # "l" followed by "n". The capital-"I" spellings below are presumably the
    # intended property names; the originals are kept so nothing that matched
    # before stops matching. Confirm against the ontology and drop the wrong set.
    "isFemalePartnerIn", "isMalePartnerIn", "isPartnerIn",
]

# Result columns are pre-sized with "" so each value can be written at its
# row's own position in `df`. Appending in entity-grouped order would misalign
# the columns whenever rows of one entity are not contiguous in the CSV.
new_DA_answers = [""] * len(df)
new_DA_sparqls = [""] * len(df)

# Raw cell values; missing-value checks are done on these BEFORE str(), since
# str(NaN) is the non-empty string "nan".
answer_cells = df["Answer"].tolist()
if "SPARQL Query" in df.columns:
    sparql_cells = df["SPARQL Query"].tolist()
else:
    sparql_cells = [""] * len(df)

unique_entities = df['Root Entity'].unique()

for entity in unique_entities:
    # A missing Root Entity has no ontology file; its rows keep "".
    if pd.isna(entity):
        continue

    # Positions (not index labels) of the rows that belong to this entity.
    is_entity_row = (df['Root Entity'] == entity).tolist()
    entity_positions = [pos for pos, hit in enumerate(is_entity_row) if hit]

    ttl_path = os.path.join(extract_dir, f"{entity}.ttl")
    if not os.path.exists(ttl_path):
        print(f"Missing TTL: {ttl_path}")
        continue  # rows of this entity keep their "" placeholders

    # Turtle documents are UTF-8; do not depend on the platform default.
    with open(ttl_path, "r", encoding="utf-8") as f:
        ontology_text = f.read()

    # 3a) Extract individuals and build mapping: name → IndividualX
    name_list = extract_individual_names(ontology_text, properties)
    name_to_instance = {name: f"Individual{idx+1}" for idx, name in enumerate(name_list)}

    # 3b) Save an abstracted version of the ontology (optional)
    abstracted_ontology_text = replace_all_ns1_names(ontology_text, name_to_instance)
    with open(os.path.join(context_dir, f"{entity}.ttl"), "w", encoding="utf-8") as out_f:
        out_f.write(abstracted_ontology_text)

    # 3c) Now abstract each row's SPARQL Query and Answer
    for position in entity_positions:
        #  - Answer: split on commas, map each via name_to_instance (if not found, keep original)
        answer = answer_cells[position]
        if pd.isna(answer) or str(answer).strip() == "":
            DA_answer = ""
        else:
            parts = [item.strip() for item in str(answer).split(",")]
            DA_answer = ", ".join(name_to_instance.get(p, p) for p in parts)

        #  - SPARQL Query: replace every "#name" occurrence via name_to_instance
        sparql = sparql_cells[position]
        if pd.isna(sparql) or str(sparql).strip() == "":
            DA_sparql = ""
        else:
            DA_sparql = replace_in_sparql(str(sparql), name_to_instance)

        new_DA_answers[position] = DA_answer
        new_DA_sparqls[position] = DA_sparql

# === Step 4: Attach new columns and save ===
for column_name, column_values in (
    ("Abstracted Answer", new_DA_answers),
    ("Abstracted SPARQL Query", new_DA_sparqls),
):
    df[column_name] = column_values

# Written without the index so the CSV holds only the data columns.
output_csv_path = "/content/trial.csv"
df.to_csv(output_csv_path, index=False)

# Cell output: where the CSV and the abstracted ontologies were written.
(output_csv_path, context_dir)


('/content/trial.csv', '/content/Context')

In [None]:
import shutil
from google.colab import files

# === Step 5: Zip and download the Context folder ===
context_zip_path = "/content/Context.zip"

# An archive left over from an earlier run is deleted before a new one is built.
if os.path.exists(context_zip_path):
    os.remove(context_zip_path)

# make_archive appends the ".zip" suffix to the base name, so the result lands
# at context_zip_path; root_dir makes the folder's contents the archive root.
shutil.make_archive("/content/Context", "zip", root_dir="/content/Context")

# Trigger the browser download of the archive (Colab helper).
files.download(context_zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>