In [43]:
import warnings
warnings.filterwarnings("ignore", message="'pin_memory' argument is set as true")

In [44]:
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from colorama import Fore 

In [53]:
import json
from typing import List 
from pydantic import BaseModel
from litellm import completion
from short_prompt import prompt_template

In [58]:
# Parse the source PDF into a Docling document object.
converter = DocumentConverter()
doc = converter.convert("Federer.pdf").document

# HybridChunker.chunk() produces its chunks lazily. Materialise them into a
# list so `chunks` can be iterated more than once (e.g. when the generation
# cell below is re-run) instead of being silently empty on the second pass.
chunker = HybridChunker()
chunks = list(chunker.chunk(doc))

In [47]:
# A single generated record: one question paired with its answer.
# Both fields are required strings (no defaults are declared).
class Record(BaseModel):
    question: str  # the question text
    answer : str  # the answer text for `question`

# Top-level shape requested from the model: llm_call passes
# Response.model_json_schema() as the `format` argument of `completion`.
# NOTE(review): documented with comments rather than a class docstring on
# purpose; pydantic appears to copy docstrings into the generated JSON schema
# as a description, which would change what is sent — confirm before adding one.
class Response(BaseModel):
    generated: List[Record]  # the list of question/answer records

In [None]:
# def llm_call(data: str, num_records: int = 5) -> dict:
#     stream = completion(
#         model = "ollama_chat/llama3:latest",
#         messages = [
#             {
#                 "role":"user",
#                 "content": prompt_template(data, num_records),
#             }
#         ],
#         stream=True,
#         options={"num_predict": 2000 },
#         format = Response.model_json_schema()
#     )
#     for x in stream:
#         delta = x['choices'][0]["delta"]["content"]
#         if delta is not None:
#             print(Fore.LIGHTBLUE_EX + delta + Fore.RESET, end="")
#             data+=delta
    
#     return json.loads(data)

In [54]:
def llm_call(
    data: str,
    num_records: int = 3,
    model: str = "ollama_chat/phi3-mini:latest",
    max_tokens: int = 2000,
) -> dict:
    """Ask the model for question/answer records about ``data`` and parse the reply.

    The prompt is built with ``prompt_template(data, num_records)`` and sent as
    a single user message. The reply is streamed: each piece is echoed to
    stdout in light blue as it arrives and collected, then the full text is
    parsed as JSON.

    Args:
        data: Source text the records should be generated from.
        num_records: Number of records passed to ``prompt_template``.
        model: Model identifier handed to ``completion``.
        max_tokens: Value sent as the ``num_predict`` option.

    Returns:
        The decoded JSON reply. The request passes
        ``Response.model_json_schema()`` as ``format``, so a dict with a
        ``"generated"`` list is expected, but the shape is not validated here.

    Raises:
        json.JSONDecodeError: If the collected reply is not valid JSON, for
            example when generation stopped before the document was complete.
    """
    stream = completion(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt_template(data, num_records),
            }
        ],
        stream=True,
        options={"num_predict": max_tokens},
        format=Response.model_json_schema(),
    )

    # Collect the streamed pieces in a list and join once at the end.
    pieces = []
    for event in stream:
        delta = event["choices"][0]["delta"]["content"]
        # Some stream events carry no text (content is None); skip those.
        if delta is not None:
            print(Fore.LIGHTBLUE_EX + delta + Fore.RESET, end="")
            pieces.append(delta)
    response_text = "".join(pieces)

    try:
        return json.loads(response_text)
    except json.JSONDecodeError as exc:
        # Re-raise the same exception type, but say what actually went wrong.
        raise json.JSONDecodeError(
            f"model reply is not valid JSON (possibly cut off at "
            f"num_predict={max_tokens}): {exc.msg}",
            exc.doc,
            exc.pos,
        ) from exc


In [63]:
# example_data_chunk = """
# Roger Federer (born 8 August 1981) is a Swiss former professional tennis player. He was ranked as the world No. 1 in men's singles by the Association of Tennis Professionals (ATP) for 310 weeks (second-most of all time), including a record 237 consecutive weeks, and finished as the year-end No. 1 five times. Federer won 103 singles titles on the ATP Tour, the second most since the start of the Open Era in 1968, including 20 major men's singles titles (among which a record eight men's singles Wimbledon titles, and an Open Era joint-record five men's singles US Open titles) and six year-end championships.

# """

In [64]:
# llm_call(example_data_chunk)

In [65]:



# Maps chunk index -> {"generated": <records from the model>, "context": <text sent>}.
dataset = {}

for i, chunk in enumerate(chunks):
    # print(Fore.YELLOW + f"Raw Text: \n{chunk.text[:300]}..." + Fore.RESET)
    enriched_text = chunker.contextualize(chunk=chunk)
    # print(Fore.LIGHTMAGENTA_EX + f"Contextualized Text: \n{enriched_text[:300]}..." + Fore.RESET)
    try:
        data = llm_call(enriched_text)
    except json.JSONDecodeError as exc:
        # One unparseable reply should not abort the whole run: report it
        # and move on to the next chunk.
        print(Fore.RED + f"\nSkipping chunk {i}: {exc}" + Fore.RESET)
        continue
    # Store the result; the export cell reads dataset[...]["generated"].
    dataset[i] = {"generated": data["generated"], "context": enriched_text}


In [62]:
# Flatten every entry's 'generated' list into one list of records,
# keeping the insertion order of `dataset`.
instructions = [
    record
    for entry in dataset.values()
    for record in entry['generated']
]

# Write the flattened records to disk as a single JSON array.
with open('instruction_short_2.json', 'w') as out_file:
    json.dump(instructions, out_file)
