In [1]:
import asyncio
from langfuse import Langfuse, get_client
from langfuse.langchain import CallbackHandler
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from pathlib import Path
from dotenv import dotenv_values

In [2]:
# Load the key/value pairs from the local .env file into a mapping; the
# credential constants below are read from it with config.get(...).
config = dotenv_values(".env")

In [1]:
# Credentials and endpoints looked up in the .env mapping.
# config.get() yields None for any key that is absent from the file.
LANGFUSE_PUBLIC_KEY = config.get("LANGFUSE_PUBLIC_KEY")
LANGFUSE_SECRET_KEY = config.get("LANGFUSE_SECRET_KEY")
LANGFUSE_HOST = config.get("LANGFUSE_HOST")
CEREBRAS_API_KEY = config.get("CEREBRAS_API_KEY")

NameError: name 'config' is not defined

In [4]:
# Create the Langfuse client from the credentials and host read out of the
# .env mapping.
langfuse = Langfuse(
    public_key=LANGFUSE_PUBLIC_KEY,
    secret_key=LANGFUSE_SECRET_KEY,
    host=LANGFUSE_HOST
)

In [5]:
# Langfuse callback handler; process_text attaches it to every llm.ainvoke call.
# NOTE(review): constructed without arguments — presumably it picks up the
# client created above; confirm against the Langfuse SDK version in use.
langfuse_handler = CallbackHandler()

In [6]:
# System prompt sent with every request: restricts the model to formatting and
# transcription fixes and forbids changing the cybersecurity content itself.
system_message = """
You are a cybersecurity text processor. Clean up formatting issues in scraped blog posts and OCR text from malware analysis materials.
For scraped content: restructure single-line text into readable paragraphs.
For OCR: fix character misrecognition in code, commands, and URLs while maintaining technical accuracy.
Do not interpret, summarize, or modify the actual cybersecurity content - only correct formatting and obvious transcription errors.
"""

In [7]:
# No visible cell assigns OPENAI_API_KEY, so on a fresh kernel the reference
# below raised NameError. Read it from the .env mapping the same way the other
# credentials are read; config.get() yields None if the key is absent.
OPENAI_API_KEY = config.get("OPENAI_API_KEY")

# Chat model instance passed to the processing helpers in this notebook.
llm = ChatOpenAI(
    model="gpt-5-nano",
    temperature=0.1,
    api_key=OPENAI_API_KEY,
)

In [8]:
async def process_text(
    text: str,
    system_message: str,
    llm,
) -> str:
    """Send one text to the chat model and return the content of its reply.

    Args:
        text: Sent as the human message.
        system_message: Sent as the system message, ahead of ``text``.
        llm: Object exposing an awaitable ``ainvoke(messages, config=...)``.

    Returns:
        The ``content`` attribute of the object returned by ``llm.ainvoke``.
    """
    conversation = [
        SystemMessage(content=system_message),
        HumanMessage(content=text),
    ]
    # The module-level Langfuse handler is attached to every call.
    run_config = {"callbacks": [langfuse_handler]}

    reply = await llm.ainvoke(conversation, config=run_config)
    return reply.content

In [9]:
async def process_multiple_texts(
    texts: list[str],
    system_message: str,
    llm,
    max_concurrency: int = 0,
) -> list[str]:
    """Run ``process_text`` over many texts concurrently.

    Args:
        texts: Inputs to process; each one becomes a separate model call.
        system_message: System prompt shared by every call.
        llm: Chat model object forwarded unchanged to ``process_text``.
        max_concurrency: Upper bound on the number of calls in flight at the
            same time. ``0`` (the default) or a negative value means no bound,
            i.e. every call is started at once as before.

    Returns:
        The results in the same order as ``texts``.

    Raises:
        Whatever ``process_text`` raises; the first exception propagates out
        of ``asyncio.gather``.
    """
    if max_concurrency <= 0:
        # Unbounded fan-out: one in-flight request per input text.
        return await asyncio.gather(
            *(process_text(text, system_message, llm) for text in texts)
        )

    # Bounded fan-out: the semaphore lets at most `max_concurrency` calls run
    # at once, so a large batch does not hit the API all at the same moment.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def _bounded(text: str) -> str:
        async with semaphore:
            return await process_text(text, system_message, llm)

    # gather() preserves input order regardless of completion order.
    return await asyncio.gather(*(_bounded(text) for text in texts))

In [10]:
# Root directory that is searched recursively for the OCR-enriched text files.
# NOTE(review): absolute, machine-specific path — the notebook will not find
# any files on another machine unless this is adjusted.
texts_path = Path("/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts")

In [11]:
# Collect every clean_text_w_ocr.txt found anywhere below the root directory.
texts_paths = list(texts_path.rglob("clean_text_w_ocr.txt"))

In [12]:
# Show the collected paths (texts_paths is already a list).
texts_paths

[PosixPath('/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Beware_Android_trojan_posing_as_Clubhouse_app/clean_text_w_ocr.txt'),
 PosixPath('/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Joker/clean_text_w_ocr.txt'),
 PosixPath('/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Mustang_Panda’s_PlugX_new_variant_targetting_Taiwanese_government_and_diplomats/clean_text_w_ocr.txt'),
 PosixPath('/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Group_description:_Leviathan/clean_text_w_ocr.txt'),
 PosixPath('/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Snakes_&_Ladders:_the_offensive_use_of_Python_on_Windows/clean_text_w_ocr.txt'),
 PosixPath('/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Linux_Variant_of_REvil_Ransomware_Targets_VMware’s_ESXi,_NAS_Devices/clean_text_w_ocr.txt'),
 PosixPath('/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Following_ESET’s_discov

In [12]:
# Number of text files found.
len(texts_paths)

1791

In [13]:
# Build one user prompt per file: the same instructions followed by the raw
# file content. The prompt template is kept exactly as-is, indentation included.
texts = []
for text_path in texts_paths:
    INPUT_TEXT = text_path.read_text(encoding="utf-8")
    user_message = f"""
    Correct the formatting of this cybersecurity text for RAG embedding.
    For scraped blog posts: convert single-line text to properly structured paragraphs with line breaks.
    For OCR text (marked with XML tags): fix character recognition errors in code, URLs, and technical terms, then format appropriately.
    Preserve all technical content and terminology exactly. Do not add explanations or summaries.

    Text to correct:
    {INPUT_TEXT}
    """
    texts.append(user_message)

In [14]:
# Preview the first two generated prompts.
texts[:2]

["\n    Correct the formatting of this cybersecurity text for RAG embedding.\n    For scraped blog posts: convert single-line text to properly structured paragraphs with line breaks.\n    For OCR text (marked with XML tags): fix character recognition errors in code, URLs, and technical terms, then format appropriately.\n    Preserve all technical content and terminology exactly. Do not add explanations or summaries.\n\n    Text to correct:\n    are attempting to take advantage of the popularity of Clubhouse to deliver malware that aims to steal users’ login information for a variety of online services, ESET malware researcher Lukas Stefanko has found. Disguised as the (as yet non-existent) Android version of the invitation-only audio chat app, the malicious package is served from a website that has the look and feel of the genuine Clubhouse website. The trojan – nicknamed “BlackRock” by ThreatFabric and detected by ESET products as Android/TrojanDropper.Agent.HLR – can steal victims’ l

In [15]:
# Process only the first 10 prompts (texts[:10]) rather than the full list.
results = await process_multiple_texts(texts=texts[:10], system_message=system_message, llm=llm)

In [29]:
# Print the model output for the first processed prompt.
print(results[0])

are attempting to take advantage of the popularity of Clubhouse to deliver malware that aims to steal users’ login information for a variety of online services, ESET malware researcher Lukas Stefanko has found. Disguised as the (as yet non-existent) Android version of the invitation-only audio chat app, the malicious package is served from a website that has the look and feel of the genuine Clubhouse website. The trojan – nicknamed “BlackRock” by ThreatFabric and detected by ESET products as Android/TrojanDropper.Agent.HLR – can steal victims’ login data for no fewer than 458 online services. The target list includes well-known financial and shopping apps, cryptocurrency exchanges, as well as social media and messaging platforms. For starters, Twitter, WhatsApp, Facebook, Amazon, Netflix, Outlook, eBay, Coinbase, Plus500, Cash App, BBVA and Lloyds Bank are all on the list. “The website looks like the real deal. To be frank, it is a well-executed copy of the legitimate Clubhouse website

In [1]:
# Scratch arithmetic: 8000 divided by 60.
# NOTE(review): presumably a per-minute figure converted to per-second —
# confirm the intent or delete this cell.
8000 / 60

133.33333333333334

In [None]:
# NOTE(review): bare literal in an unexecuted cell; nothing above depends on
# it — likely leftover scratch that can be removed.
13