In [19]:
from pathlib import Path
import os
from langfuse.openai import AsyncOpenAI
from dotenv import dotenv_values
import asyncio

In [20]:
config = dotenv_values(".env") 

In [21]:
LANGFUSE_PUBLIC_KEY=config.get("LANGFUSE_PUBLIC_KEY")
LANGFUSE_SECRET_KEY=config.get("LANGFUSE_SECRET_KEY")
LANGFUSE_HOST=config.get("LANGFUSE_HOST")
CEREBRAS_API_KEY=config.get("CEREBRAS_API_KEY")

In [22]:
system_message = """
You are a cybersecurity text processor. Clean up formatting issues in scraped blog posts and OCR text from malware analysis materials.
For scraped content: restructure single-line text into readable paragraphs.
For OCR: fix character misrecognition in code, commands, and URLs while maintaining technical accuracy.
Do not interpret, summarize, or modify the actual cybersecurity content - only correct formatting and obvious transcription errors.
"""

In [23]:
texts_path = Path("/home/bartek/Kod/PD/praca_dyplomowa/dane/texts/ocr_enriched_texts")

In [24]:
texts_paths = list(texts_path.glob("**/clean_text_w_ocr.txt"))

In [25]:
texts = []
for text_path in texts_paths:
    messages = []
    with text_path.open("r", encoding="utf-8") as file:
        INPUT_TEXT = file.read()
    user_message = f"""
    Correct the formatting of this cybersecurity text for RAG embedding.
    For scraped blog posts: convert single-line text to properly structured paragraphs with line breaks.
    For OCR text (marked with XML tags): fix character recognition errors in code, URLs, and technical terms, then format appropriately.
    Preserve all technical content and terminology exactly. Do not add explanations or summaries.

    Text to correct:
    {INPUT_TEXT}
    """
    messages.append({
                "role": "user",
                "content": user_message
                })
    messages.append({"role": "system",
                     "content": system_message})
    texts.append(messages)

In [26]:
texts[0]

[{'role': 'user',
  'content': "\n    Correct the formatting of this cybersecurity text for RAG embedding.\n    For scraped blog posts: convert single-line text to properly structured paragraphs with line breaks.\n    For OCR text (marked with XML tags): fix character recognition errors in code, URLs, and technical terms, then format appropriately.\n    Preserve all technical content and terminology exactly. Do not add explanations or summaries.\n\n    Text to correct:\n    are attempting to take advantage of the popularity of Clubhouse to deliver malware that aims to steal users’ login information for a variety of online services, ESET malware researcher Lukas Stefanko has found. Disguised as the (as yet non-existent) Android version of the invitation-only audio chat app, the malicious package is served from a website that has the look and feel of the genuine Clubhouse website. The trojan – nicknamed “BlackRock” by ThreatFabric and detected by ESET products as Android/TrojanDropper.Ag

In [31]:
async def process_text(
    messages: str,
    client,
    model="gpt-oss-120b",
    name="cerebras-test",
    provider="cerebras"
) -> str:    

    response = await client.chat.completions.create(
        messages=messages,
        model=model,
        # Optional Langfuse metadata
        name=name,
        metadata={"provider": provider},
    )
    return response

In [28]:
async def process_multiple_texts(
    texts: list[str],
    client,
) -> list[str]:
    
    tasks = [
        process_text(messages, client) 
        for messages in texts
    ]
    
    return await asyncio.gather(*tasks)    

In [29]:
client = AsyncOpenAI(
    base_url="https://api.cerebras.ai/v1",
    api_key=CEREBRAS_API_KEY,
)

In [32]:
result = await process_multiple_texts(texts[:10], client)

Error code: 429 - {'message': 'Requests per second limit exceeded - too many requests sent.', 'type': 'too_many_requests_error', 'param': 'quota', 'code': 'request_quota_exceeded'}


RateLimitError: Error code: 429 - {'message': 'Requests per second limit exceeded - too many requests sent.', 'type': 'too_many_requests_error', 'param': 'quota', 'code': 'request_quota_exceeded'}