In [1]:
import json

# path to your JSON file
json_path = 'train_v1.json'

# Load the JSON file
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract and flatten all 'premises-NL'
all_premises_nl = [
    premise
    for entry in data
    for premise in entry.get('premises-NL', [])
]

# Now `all_premises_nl` is a list of all premise-NL strings
for i, premise in enumerate(all_premises_nl, 1):
    print(f"{i}. {premise}")


1. If a Python code is well-tested, then the project is optimized.
2. If a Python code does not follow PEP 8 standards, then it is not well-tested.
3. All Python projects are easy to maintain.
4. All Python code is well-tested.
5. If a Python code follows PEP 8 standards, then it is easy to maintain.
6. If a Python code is well-tested, then it follows PEP 8 standards.
7. If a Python project is well-structured, then it is optimized.
8. If a Python project is easy to maintain, then it is well-tested.
9. If a Python project is optimized, then it has clean and readable code.
10. All Python projects are well-structured.
11. All Python projects have clean and readable code.
12. There exists at least one Python project that follows best practices.
13. There exists at least one Python project that is optimized.
14. If a Python project is not well-structured, then it does not follow PEP 8 standards.
15. Students who have completed the core curriculum and passed the science assessment are qualif

In [2]:
# prompt: save json

import json

# Assuming 'all_premises_nl' is already defined from the previous code

# Specify the path for the new JSON file
new_json_path = 'premises_nl.json'

# Save the data to a new JSON file
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(all_premises_nl, f, indent=4, ensure_ascii=False)  # Use ensure_ascii=False for proper encoding


In [8]:
# classify_premises.py

import re
from typing import List, Tuple
import spacy
import time

# load pretrained spaCy pipeline with NER once\ n_nlp = spacy.load("en_core_web_sm")

# static patterns and keyword sets
_GENERIC_PLURALS = {"Students", "Python projects", "Faculty members", "Employees", "Applicants"}
_QUANTIFIERS    = (
    "All ", "Every ", "Any ", "Each ", "No ",
    "Either ", "Anyone", "At least one", "There exists"
)
_KEYWORDS = (
    "GPA", "credits", "score", "scholarship", "enrolled",
    "published", "inspection", "exam", "assignment", "project",
    "teaches", "holds"
)
_DATE_REGEX = re.compile(
    r"\b(January|February|March|April|May|June|July|August|"
    r"September|October|November|December)\b|\d{1,2}(st|nd|rd|th)?\b"
)
_COURSE_CODE_REGEX = re.compile(r"^[A-Z]{2,4}\d{3,4}")

def classify_premises(premises: List[str]) -> Tuple[List[str], List[str]]:
    """
    Split a list of premise sentences into two lists:
      - logic_flow: generic rules, implications, universals
      - facts: concrete statements about persons, courses, dates, etc.

    Returns: (logic_flow, facts)
    """
    logic_flow = []
    facts      = []

    for s in premises:
        doc = _nlp(s)

        # 1) Conditional patterns ⇒ logic
        if re.match(r"^If .+ then .+", s) or ", then " in s:
            logic_flow.append(s); continue

        # 2) Universal quantifiers ⇒ logic
        if s.startswith(_QUANTIFIERS):
            logic_flow.append(s); continue

        # 3) Generic-plural subjects ⇒ logic
        first_two = " ".join(s.split()[:2])
        if s.split()[0] in _GENERIC_PLURALS or first_two in _GENERIC_PLURALS:
            logic_flow.append(s); continue

        # 4) Proper-name PERSON ⇒ fact
        if any(ent.label_ == "PERSON" for ent in doc.ents):
            facts.append(s); continue

        # 5) Course-code subjects ⇒ fact
        if _COURSE_CODE_REGEX.match(s):
            facts.append(s); continue

        # 6) Date-mention ⇒ fact
        if _DATE_REGEX.search(s):
            facts.append(s); continue

        # 7) Domain-keywords ⇒ fact
        if any(kw in s for kw in _KEYWORDS):
            facts.append(s); continue

        # 8) Verb-based fallback ⇒ fact
        if any(v in s for v in (" has ", " holds ", " completed ", " received ", " passed ")):
            facts.append(s); continue

        # — additional heuristics A–E —

        # A) "<Name> has <past-participle>" ⇒ fact
        if re.match(r"^[A-Z][a-z]+ has \b\w+ed\b", s):
            facts.append(s); continue

        # C) "<Name> has a valid ..." ⇒ fact
        if re.match(r"^[A-Z][a-z]+ has a valid", s):
            facts.append(s); continue

        # D) Mentions exam/assignment/project (non-conditional)
        if re.search(r"\b(exam|assignment|project)\b", s, re.I) and "If " not in s:
            if any(ent.label_ == "PERSON" for ent in doc.ents):
                facts.append(s)
            else:
                logic_flow.append(s)
            continue

        # B) Generic "provides/offers/allows" ⇒ logic
        if re.search(r"\b(provides|offers|allows)\b", s, re.I) \
           and not any(ent.label_ == "PERSON" for ent in doc.ents):
            logic_flow.append(s); continue

        # E) "system/environment allows/provides" ⇒ logic
        if re.search(r"\b(system|environment) (allows|provides)\b", s, re.I):
            logic_flow.append(s); continue

        # 9) ORG or GPE NER ⇒ fact
        if any(ent.label_ in ("ORG","GPE") for ent in doc.ents):
            facts.append(s); continue

        # 10) Modal verbs ⇒ logic
        if any(modal in s for modal in (" must "," shall "," should "," can "," cannot ")):
            logic_flow.append(s); continue

        # 11) Exclusive/exception conditions ⇒ logic
        if re.match(r"^(Only if|Unless|Provided that|Given that|Assuming)\b", s, re.I):
            logic_flow.append(s); continue

        # 12) Definitional/copular ("A X is a Y") ⇒ logic
        if re.match(r"^A [a-z][^,]+ is (an?|the) [a-z]", s):
            logic_flow.append(s); continue

        # 13) Numeric-threshold universals ⇒ logic
        if re.match(r"^(At least|More than|Less than) \d+%?", s) \
           and not any(ent.label_=="PERSON" for ent in doc.ents):
            logic_flow.append(s); continue

        # 14) Comparative adjectives ⇒ logic
        if re.search(r"\b(better than|worse than|highest|lowest|most|least)\b", s, re.I):
            logic_flow.append(s); continue

        # Fallback: everything else ⇒ logic
        logic_flow.append(s)

    return logic_flow, facts


if __name__ == "__main__":
    import json
    premises = json.load(open("premises_nl.json", encoding="utf-8"))

    start = time.perf_counter()
    logic, facts = classify_premises(premises)
    end = time.perf_counter()

    print(f"Logic-flow rules: {len(logic)}")
    print(f"Facts:            {len(facts)}")
    print(f"Total processing time: {end - start:.4f} seconds")


Logic-flow rules: 3924
Facts:            840
Total processing time: 40.0280 seconds


In [9]:
facts

['Sophia has completed the core curriculum.',
 'Sophia has passed the science assessment.',
 'Sophia has completed the research methodology course.',
 'Sophia has completed her capstone project.',
 'Sophia has completed the required community service hours.',
 'Sophia has completed the core curriculum.',
 'Sophia has passed the science assessment.',
 'Sophia has completed the research methodology course.',
 'Sophia has completed her capstone project.',
 'Sophia has completed the required community service hours.',
 'If a student is eligible for graduation and maintains a GPA above 3.5, they graduate with honors.',
 'John has completed all required courses.',
 'John maintains a GPA of 3.8.',
 'John has completed a thesis.',
 'If a faculty member has completed training, they can teach undergraduate courses.',
 'If a faculty member can teach undergraduate courses and holds a PhD, they can supervise graduate students.',
 'If a faculty member can supervise graduate students and has at least

In [10]:
logic

['If a Python code is well-tested, then the project is optimized.',
 'If a Python code does not follow PEP 8 standards, then it is not well-tested.',
 'All Python projects are easy to maintain.',
 'All Python code is well-tested.',
 'If a Python code follows PEP 8 standards, then it is easy to maintain.',
 'If a Python code is well-tested, then it follows PEP 8 standards.',
 'If a Python project is well-structured, then it is optimized.',
 'If a Python project is easy to maintain, then it is well-tested.',
 'If a Python project is optimized, then it has clean and readable code.',
 'All Python projects are well-structured.',
 'All Python projects have clean and readable code.',
 'There exists at least one Python project that follows best practices.',
 'There exists at least one Python project that is optimized.',
 'If a Python project is not well-structured, then it does not follow PEP 8 standards.',
 'Students who have completed the core curriculum and passed the science assessment are