In [5]:
text = [{
        "url": "https://nexcel.info/employee-handbook-2/",
        "headers": [
            "2. TERMS AND CONDITIONS",
            "2.6. Limitation of late-in and early-out request."
        ],
        "content": "\nThe total hours of \u2018Late in \u2013 Early out\u2019 will be counted in system and affect employee\u2019s Annual Leave balance.\nIn a month, if the employee is late for work and/or leaves work early 4 times, which will be counted as a violation, an email reminder from HR will be issued. The third violation within 12 months will incur a warning letter from the Management. For example:  \nFirst violation: August 2018\nSecond violation: December 2018\nThird violation: July 2019  \nFirst violation: August 2018\nSecond violation: December 2018\nThird violation: July 2019\n\u2192 Warning letter will be issued from the Management.  \nWhen the total \u2018Late-in\u2019 and \u2018Early-out\u2019 (will be accumulated over months and across the years) is up to 4 hours, a half-day Annual Leave of employee will be deducted, for example:  \nDecember 2018: total \u2018Late-in\u2019 and \u2018Early-out\u2019 is 3 hours.\nJanuary 2019: total \u2018Late-in\u2019 and \u2018Early-out\u2019 is 1 hour.  \nDecember 2018: total \u2018Late-in\u2019 and \u2018Early-out\u2019 is 3 hours.\nJanuary 2019: total \u2018Late-in\u2019 and \u2018Early-out\u2019 is 1 hour.\n\u2192 HR will issue a confirmation paper to inform that half-day Annual Leave of the employee is going to be deducted.  \nThis limitation is not applied to manager level."
    },
    {
        "url": "https://nexcel.info/employee-handbook-2/",
        "headers": [
            "2. TERMS AND CONDITIONS",
            "2.7. Abuse of working time"
        ],
        "content": "\nEmployees are not allowed to abuse working hours. Scenarios that would be considered abusing working hours may include but are not limited to:\n* Alter the work schedule (e.g. extending the 1-hour lunch break, going out or going to pantry to have breakfast after Check in, or staying longer in pantry/smoking area for chatting, etc.). Please be reminded that drinks and fast food are offered to help you get more energy for working, not for exploiting duty time.\n* Check out after playing sport. You must check out first and then enjoy sports, clubs, etc."
    }]

In [36]:
from typing import List
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_ollama import ChatOllama

# Data model
class GeneratePropositions(BaseModel):
    """List of all the propositions in a given document"""

    propositions: List[str] = Field(
        description="List of propositions (factual, self-contained, and concise information)"
    )


# LLM with function call
llm = ChatOllama(model="llama3.2:latest", temperature=0)
structured_llm= llm.with_structured_output(GeneratePropositions)

# Few shot prompting --- We can add more examples to make it good
proposition_examples = [
    {"document": 
        "In 1969, Neil Armstrong became the first person to walk on the Moon during the Apollo 11 mission.", 
     "propositions": 
        ['Neil Armstrong was an astronaut.', 'Neil Armstrong walked on the Moon in 1969.', 'Neil Armstrong was the first person to walk on the Moon.', 'Neil Armstrong walked on the Moon during the Apollo 11 mission.', 'The Apollo 11 mission occurred in 1969.']
    },
]

example_proposition_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{document}"),
        ("ai", "{propositions}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt = example_proposition_prompt,
    examples = proposition_examples,
)

# Prompt
system = """Please break down the following text into simple, self-contained propositions. Ensure that each proposition meets the following criteria:

    1. Express a Single Fact: Each proposition should state one specific fact or claim.
    2. Be Understandable Without Context: The proposition should be self-contained, meaning it can be understood without needing additional context.
    3. Use Full Names, Not Pronouns: Avoid pronouns or ambiguous references; use full entity names.
    4. Include Relevant Dates/Qualifiers: If applicable, include necessary dates, times, and qualifiers to make the fact precise.
    5. Contain One Subject-Predicate Relationship: Focus on a single subject and its corresponding action or attribute, without conjunctions or multiple clauses."""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        few_shot_prompt,
        ("human", "{document}"),
    ]
)

proposition_generator = prompt | structured_llm

In [37]:
proposition_generator

ChatPromptTemplate(input_variables=['document'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='Please break down the following text into simple, self-contained propositions. Ensure that each proposition meets the following criteria:\n\n    1. Express a Single Fact: Each proposition should state one specific fact or claim.\n    2. Be Understandable Without Context: The proposition should be self-contained, meaning it can be understood without needing additional context.\n    3. Use Full Names, Not Pronouns: Avoid pronouns or ambiguous references; use full entity names.\n    4. Include Relevant Dates/Qualifiers: If applicable, include necessary dates, times, and qualifiers to make the fact precise.\n    5. Contain One Subject-Predicate Relationship: Focus on a single subject and its corresponding action or attribute, without conjunctions or multiple clauses.'), addition

In [2]:
# Define the input text
test = "Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a function from labeled training data consisting of a set of training examples. In supervised learning, each example is a pair consisting of an input object (typically a vector) and a desired output value (also called the supervisory signal)."
# proposition_generator.invoke({"document": test})

In [7]:
print(text[0]['content'].replace('\n', ''))



In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import json

model_name = "chentong00/propositionizer-wiki-flan-t5-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

title = "Supervised learning"
section = ""
content = text[0]['content'].replace('\n', '')

input_text = f"Title: {title}. Section: {section}. Content: {content}"

input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids.to(device), max_new_tokens=512).cpu()

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
try:
    prop_list = json.loads(output_text)
except:
    prop_list = []
    print("[ERROR] Failed to parse output text as JSON.")
print(json.dumps(prop_list, indent=2))


[
  "The total hours of 'Late in \u2013 Early out' will be counted in the system.",
  "The total hours affect employee's Annual Leave balance.",
  "In a month, if the employee is late for work and/or leaves work early 4 times, an email reminder from HR will be issued.",
  "The first violation occurred in August 2018.",
  "The second violation occurred in December 2018.",
  "The third violation occurred in July 2019.",
  "When the total 'Late-in' and 'Early-out' is up to 4 hours, a half-day Annual Leave of employee will be deducted.",
  "The total 'Late-in' and 'Early-out' will be accumulated over months and across the years.",
  "An example of a total 'Late-in' and 'Early-out' is 3 hours in December 2018.",
  "In January 2019, the total 'Late-in' and 'Early-out' is 1 hour.",
  "HR will issue a confirmation paper to inform that half-day Annual Leave of the employee is going to be deducted.",
  "This limitation is not applied to manager level."
]


In [None]:
output_text

In [None]:
from langchain_core.documents import Document

propositions = [] # Store all the propositions from the document
doc_splits = [t['content'] for t in text]
source = [t['url'] for t in text]

for i in range(len(doc_splits)):
    print(doc_splits[i])
    response = proposition_generator.invoke({"document": doc_splits[i]}) # Creating proposition
    for proposition in response.propositions:
        propositions.append(Document(page_content=proposition, metadata={"Title": "Employee Handbook", "Source": source[i], "chunk_id": i+1}))

In [None]:
propositions