In [1]:
!pip install --upgrade pip
!pip install transformers accelerate bitsandbytes sentencepiece --break-system-packages -q

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    model_max_length=32768,     # Mistral supports 32k context in v0.3
    padding_side="left"
)

print("Loading mistralai/Mistral-7B-v0.3 in 8-bit… This takes ~1 minute.")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,           # THIS is what makes it fit in Kaggle GPU
    device_map="auto"
)

# Disable gradients globally
torch.set_grad_enabled(False)


def generate(prompt, max_new_tokens=500, temperature=0.001, top_p=1.0):
    """
    Clean, reliable generation. Deterministic by default 
    because you are doing extraction tasks.
    """
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=False,
        padding=False
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,            # <-- important for consistent output
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.[0m[31m
[0mLoading

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading mistralai/Mistral-7B-v0.3 in 8-bit… This takes ~1 minute.


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

2025-11-25 09:59:09.753917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764064750.120812      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764064750.213978      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [2]:
!pip install datasets==2.19.1

from datasets import load_dataset

dataset = load_dataset("CLUTRR/v1", "gen_train234_test2to10")

Collecting datasets==2.19.1
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets==2.19.1)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.19.1)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.1)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.19.1)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.6 MB/s[0m  [33m0:00:00[0m


Downloading data:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/938k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/598k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12064 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3019 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1048 [00:00<?, ? examples/s]

In [57]:
def get_predicates(story):
    
    print("actual story : ", story)
    p1 = """You are a family-relationship extraction engine.

    Your job is to read the story and rewrite it into relationship bullet points that contains:
    - only and all family relationships
    - gender every new person you come across using context - dont base it off the name
    - NO events, objects, activities, locations, or irrelevant descriptions
    - NO timeline or sequencing details
    - NO pronouns; always replace pronouns with the correct resolved person name
    - NO hallucinations — include ONLY what is explicitly stated or directly implied
    - If a relationship is ambiguous, OMIT it.
    
    
    extract relation phrases from the story after excluding the unnecesary details using all previous context :
    
    {story}
    FINISH
    """
    
    print("Running model1......................................................................")
    result1 = generate(p1.format(story=story), max_new_tokens = 200, temperature = 0.001)
    result1 =  result1.split("FINISH", 1)[1]
    print(result1)



    p2 = """You are a family-relationship mapper.
    Your task is to read the cleaned family summary and output ONLY three sections:
    Genders, Triplets, Facts.
    
    You MUST think step-by-step internally, but DO NOT reveal your reasoning.
    Output ONLY the three sections exactly as described.
    
    =========================================================
    RULES
    =========================================================
    1. Names refer to unique people.
    2. Use only explicit relationships from the summary.
    3. Infer gender from roles (father, mother, son, daughter, husband, wife, brother, sister) DO NOT INFER FROM THEIR NAMES!.
    4. DO NOT guess information that is not stated.
    5. DO NOT create placeholder names or "unknown".
    6. Use the most specific relation available.
    7. In the Facts section, include gender as:
       male(name).  female(name).  unknown_gender(name).
       (Use lowercase names as atoms.)
    
    =========================================================
    OUTPUT SPECIFICATION
    =========================================================
    
    ### SECTION 1 — Genders
    Format: name=male/female/unknown
    
    ### SECTION 2 — Triplets
    Format: (subject, relation, object), relations are only family relations, ignore others:  
    ONLY Allowed relation labels:
    ('granddaughter', 'father', 'brother', 'mother', 'grandson', 'sister', 'uncle', 'daughter', 'husband', 'grandmother', 'wife', 'aunt', 'son', 'grandfather')
    
    ### SECTION 3 — Facts
    Convert both gender and triplets into Prolog facts.
    
    For gender:
    - male(name).
    - female(name).
    - unknown_gender(name).
    
    For relations:
    father(subject, object),
    brother(subject, object), etc
    
    Use lowercase atoms:
    Example: father(don, steve).
    
    =========================================================
    NOW PROCESS THESE RELATIONS into sections:
    {story}
    RELATIONS FINISH
    sections:
    """
    
    print("Running model2.................................................................")
    result2 = generate(p2.format(story=result1), max_new_tokens = 500, temperature = 0.001)
    result2 =  result2.split("FINISH", 1)[1]
    print(result2)
    print("\n\n\n\n\n")

    res = (result2.split("Facts:")[-1]).split(".\n")
    return res

In [58]:
import random

#call the function like this, you get a list of predicates

numbers = [random.randint(1, 250) for _ in range(2)]
allpreds = {}
for i in numbers:
    facts = get_predicates(dataset["test"][i]["clean_story"])
    allpreds[i] = facts

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


actual story :  [Harold] took his son [Pedro] to the park to feed the squirrels. [Dorothy] and [Pedro] are married. [Dorothy] had picked her daughter [Heather] out the cutest new dress to wear on her birthday. [Heather] went to the baseball game with her uncle [Antonio].
Running model1......................................................................


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



    1. Harold is Pedro's father
    2. Dorothy is Pedro's wife
    3. Dorothy has a daughter named Heather
    4. Heather went to a baseball game with her uncle Antonio
Running model2.................................................................


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



    sections:
    1. Genders:
    Harold=male, Pedro=male, Dorothy=female, Heather=female, Antonio=male
    2. Triplets:
    (Pedro, father, Harold), (Pedro, husband, Dorothy), (Dorothy, wife, Pedro), (Heather, daughter, Dorothy), (Antonio, brother, Pedro)
    3. Facts:
    male(harold).
    male(pedro).
    female(dorothy).
    female(heather).
    male(antonio).
    father(pedro, harold).
    husband(pedro, dorothy).
    wife(dorothy, pedro).
    daughter(dorothy, heather).
    brother(pedro, antonio).






actual story :  [Dorothy] is so happy her husband got a promotion at work. His name is [Pedro]. [Cheryl]'s grandfather, [Harold], taught her how to make a paper airplane while her mother, [Dorothy], prepared dinner. [Harold] went to see his daughter [Marie] for parents day at college.
Running model1......................................................................


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



    1. Dorothy is married to Pedro
    2. Cheryl is the granddaughter of Harold
    3. Harold has a daughter named Marie
    4. Marie is a college student
    5. Dorothy is the mother of Cheryl
    6. Harold taught Cheryl how to make a paper airplane
    7. Harold visited Marie at college
Running model2.................................................................

    sections:
    ------------------------------------------------------------
    Genders:
    Cheryl=female, Dorothy=female, Harold=male, Marie=female
    ------------------------------------------------------------
    Triplets:
    ('dorothy', 'mother', 'cheryl'),
    ('harold', 'grandfather', 'cheryl'),
    ('harold', 'father', 'marie'),
    ('marie', 'daughter', 'harold')
    ------------------------------------------------------------
    Facts:
    male(harold).
    female(cheryl).
    female(dorothy).
    female(marie).
    father(harold, marie).
    mother(dorothy, cheryl).
    granddaughter(cheryl, harold).
  

In [60]:
allpreds

{194: ['\n    male(harold)',
  '    male(pedro)',
  '    female(dorothy)',
  '    female(heather)',
  '    male(antonio)',
  '    father(pedro, harold)',
  '    husband(pedro, dorothy)',
  '    wife(dorothy, pedro)',
  '    daughter(dorothy, heather)',
  '    brother(pedro, antonio).'],
 191: ['\n    male(harold)',
  '    female(cheryl)',
  '    female(dorothy)',
  '    female(marie)',
  '    father(harold, marie)',
  '    mother(dorothy, cheryl)',
  '    granddaughter(cheryl, harold)',
  '    daughter(marie, harold)',
  '    college_student(marie)',
  '    taught_by(harold, cheryl, paper_airplane)',
  '    visited(harold, marie).']}

In [None]:
def to_prolog_query(question: str):
    prompt = f"""
Convert the following natural-language question into a Prolog query.
Use ONLY standard Prolog syntax.

RULES:
- Use lowercase atoms for predicates and constants.
- Replace spaces in names with underscores.
- For yes/no questions, generate a predicate that logically matches the question.
- Output ONLY the Prolog query, nothing else.

EXAMPLES:
EXAMPLES:

Q: "Is Alice Bob’s mother?"
A: mother(alice, bob).

Q: "Is Bob Alice’s father?"
A: father(bob, alice).

Q: "Is John Mary’s brother?"
A: brother(john, mary).

Q: "Is Sarah Emily’s sister?"
A: sister(sarah, emily).

Q: "Is Claire the grandmother of David?"
A: grandmother(claire, david).

Q: "Is Helen David’s granddaughter?"
A: granddaughter(helen, david).

Q: "Is Michael Jane’s son?"
A: son(michael, jane).

Q: "Is Jane Michael’s daughter?"
A: daughter(jane, michael).

Q: "Is Mark Susan’s uncle?"
A: uncle(mark, susan).

Q: "Is Susan Mark’s niece?"
A: niece(susan, mark).

Q: "Is Robert Kevin’s cousin?"
A: cousin(robert, kevin).

Q: "Is Daniel Olivia’s husband?"
A: spouse(daniel, olivia).

Q: "Is Olivia Daniel’s wife?"
A: spouse(olivia, daniel).

Q: "Is Henry Laura’s father-in-law?"
A: father_in_law(henry, laura).

Q: "Is Laura Henry’s daughter-in-law?"
A: daughter_in_law(laura, henry).

Q: "Is Priya Anil’s aunt?"
A: aunt(priya, anil).

Q: "Is Anil Priya’s nephew?"
A: nephew(anil, priya).

Q: "Is Thomas Jacob’s grandfather?"
A: grandfather(thomas, jacob).

Q: "Is Jacob Thomas’s grandchild?"
A: grandchild(jacob, thomas).


NOW CONVERT:
Q: "{question}"
A:
"""
    return generate(prompt, max_new_tokens=50, temperature=0.001)

In [26]:
query = to_prolog_query("Is Alice Claire’s grandmother?")
print(query.split("A:")[-1])

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



grandmother(alice, claire).


In [63]:
def summarize_prolog_trace_cot(trace: str):
    prompt = f"""
Summarize the following exit-only Prolog trace. First, think step-by-step
about what each exit line implies logically. Then, after reasoning, produce
a clean final paragraph that explains the reasoning chain in natural language.

RULES:
- Use lowercase atoms as they appear.
- DO NOT output the reasoning steps in the final answer.
- Output ONLY the final summarized paragraph.
- Your reasoning should interpret each exit line in order.

EXAMPLES:

TRACE:
Exit: mother(alice, beth)
Exit: parent(beth, claire)
Exit: grandmother(alice, claire)

THINK:
- mother(alice, beth) means Alice is Beth's mother.
- parent(beth, claire) means Beth is Claire’s parent.
- Combining them shows Alice is the grandmother of Claire through Beth.

SUMMARY:
Alice is Claire’s grandmother because she is the mother of Beth, and Beth is Claire’s parent, forming a direct maternal lineage.

---

TRACE:
Exit: father(john, mark)
Exit: parent(mark, olivia)
Exit: grandfather(john, olivia)

THINK:
- father(john, mark) → John is Mark’s father.
- parent(mark, olivia) → Mark is Olivia’s parent.
- Combine → John is Olivia’s grandfather through Mark.

SUMMARY:
John is Olivia’s grandfather since he is Mark’s father, and Mark is Olivia’s parent, creating a paternal line connecting John and Olivia.

---

TRACE:
Exit: parent(susan, kevin)
Exit: parent(susan, lily)
Exit: sibling(kevin, lily)

THINK:
- Susan is parent of Kevin.
- Susan is parent of Lily.
- Sharing a parent makes Kevin and Lily siblings.

SUMMARY:
Kevin and Lily are siblings because both share the same parent, Susan.

---

TRACE:
Exit: sibling(sarah, michael)
Exit: parent(michael, emma)
Exit: aunt(sarah, emma)

THINK:
- Sarah and Michael are siblings.
- Michael is Emma’s parent.
- A sibling of a parent is an aunt.

SUMMARY:
Sarah is Emma’s aunt because she is Michael’s sibling, and Michael is Emma’s parent.

---

TRACE:
Exit: parent(linda, jack)
Exit: sibling(linda, robert)
Exit: parent(robert, claire)
Exit: cousin(jack, claire)

THINK:
- Linda is Jack’s parent.
- Linda and Robert are siblings.
- Robert is Claire’s parent.
- Children of siblings are cousins.

SUMMARY:
Jack and Claire are cousins because their parents, Linda and Robert, are siblings.

---

NOW SOLVE:
TRACE:
{trace}

THINK:
"""
    return generate(prompt, max_new_tokens=200, temperature=0.001)


In [None]:
COT = summarize_prolog_trace_cot("""Exit: sibling(alex, rita)
Exit: parent(rita, nathan)
Exit: nephew(nathan, alex)
""")

In [65]:
print(COT.split("THINK:")[-1])


- Alex and Rita are siblings.
- Rita is Nathan’s parent.
- A sibling of a parent is a nephew/niece.

SUMMARY:
Nathan is Alex’s nephew because Alex is Rita’s sibling, and Rita is Nathan’s parent.


In [1]:
print("hi")

hi


In [3]:
import re

def generate_knowledge_base(input_text):
    # 1. Define the Allowed Inputs (Strict filtering list)
    ALLOWED_INPUTS = {
        'granddaughter', 'father', 'brother', 'mother', 'grandson', 
        'sister', 'uncle', 'daughter', 'husband', 'grandmother', 
        'wife', 'aunt', 'son', 'grandfather'
    }

    # 2. Define the Required Outputs (Order and types for the final KB)
    # Note: 'parent' is here but not in allowed inputs, so it will always trigger a dummy.
    REQUIRED_OUTPUTS = [
        'mother', 'father', 'daughter', 'son', 
        'grandmother', 'grandfather', 'granddaughter', 'grandson', 
        'brother', 'sister', 'husband', 'wife', 'aunt', 'uncle'
    ]

    # Initialize dictionary to hold found facts
    # We use the REQUIRED_OUTPUTS keys to ensure we track everything needed
    kb_data = {key: [] for key in REQUIRED_OUTPUTS}

    # 3. Parse the Input
    # Regex explanation:
    # (\w+)    -> Group 1: The predicate name (alphanumeric)
    # \s*\(    -> Matches open parenthesis (allowing optional space)
    # ([^,]+)  -> Group 2: First argument (anything not a comma)
    # ,        -> The comma separator
    # ([^)]+)  -> Group 3: Second argument (anything not a closing paren)
    # \)       -> Closing parenthesis
    pattern = re.compile(r'(\w+)\s*\(\s*([^,]+)\s*,\s*([^)]+)\s*\)')

    lines = input_text.strip().split('\n')

    for line in lines:
        line = line.strip()
        match = pattern.search(line)
        
        if match:
            pred_name = match.group(1)
            arg1 = match.group(2).strip()
            arg2 = match.group(3).strip()

            # Check if the predicate is in the ALLOWED input list
            if pred_name in ALLOWED_INPUTS:
                # Ensure it is also a recognized output type before adding
                if pred_name in kb_data:
                    # Format as proper Prolog fact with a period at the end
                    fact_str = f"{pred_name}({arg1}, {arg2})."
                    kb_data[pred_name].append(fact_str)

    # 4. Generate Output
    print("--- Generated Knowledge Base ---")
    
    for pred in REQUIRED_OUTPUTS:
        facts = kb_data[pred]
        
        if not facts:
            # If list is empty, print the dummy fact
            print(f"{pred}(dummy1, dummy2).")
        else:
            # Otherwise print all found facts of this type
            for fact in facts:
                print(fact)

# --- Example Usage ---

# You can paste your raw text here
user_input = """
mother(helena,victor).
wife(chloe,victor).
"""

user_input=user_input.lower()

# Run the function
if __name__ == "__main__":
    generate_knowledge_base(user_input)

--- Generated Knowledge Base ---
mother(helena, victor).
father(dummy1, dummy2).
daughter(dummy1, dummy2).
son(dummy1, dummy2).
grandmother(dummy1, dummy2).
grandfather(dummy1, dummy2).
granddaughter(dummy1, dummy2).
grandson(dummy1, dummy2).
brother(dummy1, dummy2).
sister(dummy1, dummy2).
husband(dummy1, dummy2).
wife(chloe, victor).
aunt(dummy1, dummy2).
uncle(dummy1, dummy2).
