In [1]:
import asyncio
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from pathlib import Path

In [2]:
async def process_text(
    text: str,
    system_message: str,
    model_name: str = "gpt-oss:20b",
    mirostat: int = 2,
    mirostat_tau: float = 3.0,
    mirostat_eta: float = 0.1,
    temperature: float = 0.1,
    **kwargs
) -> str:
    """
    Process text using local Ollama model asynchronously.
    
    Args:
        text: Input text to process
        model_name: Ollama model name
        temperature: Randomness control (0.0-1.0)
        max_tokens: Token limit
        **kwargs: Additional model parameters
    """
    
    llm = ChatOllama(
        model=model_name,
        mirostat = mirostat,
        mirostat_tau = mirostat_tau,
        mirostat_eta = mirostat_eta,
        temperature=temperature,
        **kwargs
    )
    
    messages = [
        SystemMessage(content=system_message),
        HumanMessage(content=text)
    ]

    response = await llm.ainvoke(messages)
    return response.content

In [3]:
async def process_multiple_texts(
    texts: list[str],
    system_message: str,
    model_name: str = "gpt-oss:20b",
    **model_params
) -> list[str]:
    """Process multiple texts concurrently."""
    
    tasks = [
        process_text(text, system_message, model_name, **model_params) 
        for text in texts
    ]
    
    return await asyncio.gather(*tasks)

In [4]:
system_message = """
You are a cybersecurity text processor. Clean up formatting issues in scraped blog posts and OCR text from malware analysis materials.
For scraped content: restructure single-line text into readable paragraphs.
For OCR: fix character misrecognition in code, commands, and URLs while maintaining technical accuracy.
Do not interpret, summarize, or modify the actual cybersecurity content - only correct formatting and obvious transcription errors.
"""

In [5]:
texts_path = Path("/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts")

In [20]:
texts_paths = list(texts_path.glob("**/clean_text_w_ocr.txt"))

In [16]:
list(texts_paths)

[]

In [15]:
len(list(texts_paths))

1791

In [21]:
for text_path in texts_paths:
    print(text_path)

/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Beware_Android_trojan_posing_as_Clubhouse_app/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Joker/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Mustang_Panda’s_PlugX_new_variant_targetting_Taiwanese_government_and_diplomats/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Group_description:_Leviathan/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Snakes_&_Ladders:_the_offensive_use_of_Python_on_Windows/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Linux_Variant_of_REvil_Ransomware_Targets_VMware’s_ESXi,_NAS_Devices/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Following_ESET’s_discovery,_a_Monero_mining_botnet_is_disrupted/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane

In [22]:
texts = []
for text_path in texts_paths:
    print(text_path)
    with text_path.open("r", encoding="utf-8") as file:
        INPUT_TEXT = file.read()
    user_message = f"""
    Correct the formatting of this cybersecurity text for RAG embedding.
    For scraped blog posts: convert single-line text to properly structured paragraphs with line breaks.
    For OCR text (marked with XML tags): fix character recognition errors in code, URLs, and technical terms, then format appropriately.
    Preserve all technical content and terminology exactly. Do not add explanations or summaries.

    Text to correct:
    {INPUT_TEXT}
    """
    texts.append(user_message)

/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Beware_Android_trojan_posing_as_Clubhouse_app/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Joker/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Mustang_Panda’s_PlugX_new_variant_targetting_Taiwanese_government_and_diplomats/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Group_description:_Leviathan/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Snakes_&_Ladders:_the_offensive_use_of_Python_on_Windows/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Linux_Variant_of_REvil_Ransomware_Targets_VMware’s_ESXi,_NAS_Devices/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts/Following_ESET’s_discovery,_a_Monero_mining_botnet_is_disrupted/clean_text_w_ocr.txt
/home/bartek/Kod/PD/praca_dyplomowa/dane

In [23]:
texts[:2]

["\n    Correct the formatting of this cybersecurity text for RAG embedding.\n    For scraped blog posts: convert single-line text to properly structured paragraphs with line breaks.\n    For OCR text (marked with XML tags): fix character recognition errors in code, URLs, and technical terms, then format appropriately.\n    Preserve all technical content and terminology exactly. Do not add explanations or summaries.\n\n    Text to correct:\n    are attempting to take advantage of the popularity of Clubhouse to deliver malware that aims to steal users’ login information for a variety of online services, ESET malware researcher Lukas Stefanko has found. Disguised as the (as yet non-existent) Android version of the invitation-only audio chat app, the malicious package is served from a website that has the look and feel of the genuine Clubhouse website. The trojan – nicknamed “BlackRock” by ThreatFabric and detected by ESET products as Android/TrojanDropper.Agent.HLR – can steal victims’ l

In [24]:
params = {    "mirostat": 2,
    "mirostat_tau":  3.0,
    "mirostat_eta":  0.1,
    "temperature":  0.1,}

In [27]:
results = await process_multiple_texts(texts=texts[:2], system_message=system_message, model_params=params)

In [29]:
print(results[0])

are attempting to take advantage of the popularity of Clubhouse to deliver malware that aims to steal users’ login information for a variety of online services, ESET malware researcher Lukas Stefanko has found.  
Disguised as the (as yet non‑existent) Android version of the invitation‑only audio chat app, the malicious package is served from a website that has the look and feel of the genuine Clubhouse website.  
The trojan – nicknamed “BlackRock” by ThreatFabric and detected by ESET products as Android/TrojanDropper.Agent.HLR – can steal victims’ login data for no fewer than 458 online services.  
The target list includes well‑known financial and shopping apps, cryptocurrency exchanges, as well as social media and messaging platforms.  
For starters, Twitter, WhatsApp, Facebook, Amazon, Netflix, Outlook, eBay, Coinbase, Plus500, Cash App, BBVA and Lloyds Bank are all on the list.  

“The website looks like the real deal. To be frank, it is a well‑executed copy of the legitimate Clubho