In [19]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [20]:
# https://huggingface.co/TechxGenus/Meta-Llama-3-70B-Instruct-GPTQ
LLAMA3_BASE_URL = 'https://corellm.wb.ru/llama3/v1'
LLAMA3_MODEL_NAME = 'Meta-Llama-3-70B-Instruct-GPTQ'


DEEPSEEK_BASE_URL = 'https://corellm.wb.ru/deepseek/v1'
DEEPSEEK_MODEL_NAME = 'DeepSeek-R1'


QWEN25_32B_BASE_URL = 'https://corellm.wb.ru/qwen25-coder-14b-instruct/v1/'
QWEN25_32B_MODEL_NAME = 'Qwen25-Coder-14B-Instruct'

In [21]:
import os

RWB_CORELLM_TOKEN = os.getenv("RWB_CORELLM_TOKEN")
SOURCE_ARTICLE_NAME = 'Exploring ChatGPT and its impact on society MA Haque, S Li'

In [23]:
import pandas as pd

df = pd.read_csv('pdf_extracted_text.csv')

In [24]:
import time
from openai import OpenAI

def process_pdf_with_llm(pdf_content, source_article_name, client, model_name, max_retries=1):
    if not pdf_content or pd.isna(pdf_content) or pdf_content.strip() == "":
        return "No content to process"
    
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": f"You are a helpful assistant. User will give you article. Find article context which references to '{source_article_name}'. Write only text from article that mentions or references this source. If no references found, write 'No references found'."
                    },
                    {
                        "role": "user",
                        "content": pdf_content,
                    }
                ],
                model=model_name,
                stream=False,
                temperature=0.1
            )
            
            response = chat_completion.choices[0].message.content
            return response.strip()
            
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retry
            else:
                return f"Error after {max_retries} attempts: {str(e)}"


In [27]:
import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed


base_url = DEEPSEEK_BASE_URL
model_name = DEEPSEEK_MODEL_NAME
digest_col_name = f"{model_name}_digest"

client = OpenAI(
    api_key=RWB_CORELLM_TOKEN,
    base_url=base_url,
)

def process_row(args):
    idx, pdf_content = args

    digest = process_pdf_with_llm(
        pdf_content=pdf_content,
        source_article_name=SOURCE_ARTICLE_NAME,
        client=client,
        model_name=model_name
    )

    return idx, digest

df[digest_col_name] = None

non_null_mask = df['pdf_content'].notna() & (df['pdf_content'].str.strip() != '')
rows_to_process = df[non_null_mask]

print(f"\nProcessing {len(rows_to_process)} rows with non-null PDF content...")
print("="*60)

args_list = [
    (idx, row['pdf_content'])
    for idx, row in rows_to_process.iterrows()
]

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(process_row, args): args for args in args_list}


    with tqdm.tqdm(total=len(futures), desc="Processing PDFs' content") as pbar:
        for future in as_completed(futures):
            idx, digest = future.result()
            df.at[idx, digest_col_name] = digest
            pbar.update(1)

print(f"Processing complete")


Processing 30 rows with non-null PDF content...


Processing PDFs:  30%|███       | 9/30 [12:41<22:27, 64.16s/it]   

Attempt 1 failed: Error code: 400 - {'object': 'error', 'message': "The input (248740 tokens) is longer than the model's context length (163840 tokens).", 'type': 'BadRequestError', 'param': None, 'code': 400}
Attempt 2 failed: Error code: 400 - {'object': 'error', 'message': "The input (248740 tokens) is longer than the model's context length (163840 tokens).", 'type': 'BadRequestError', 'param': None, 'code': 400}


Processing PDFs:  33%|███▎      | 10/30 [12:48<15:27, 46.37s/it]

Attempt 3 failed: Error code: 400 - {'object': 'error', 'message': "The input (248740 tokens) is longer than the model's context length (163840 tokens).", 'type': 'BadRequestError', 'param': None, 'code': 400}


Processing PDFs: 100%|██████████| 30/30 [21:01<00:00, 42.04s/it]


Processing complete





In [28]:
output_filename = 'pdf_extracted_text_with_digests.csv'
df.to_csv(output_filename, index=False)


In [30]:
print("DIGEST ANALYSIS for", model_name)
print("="*60)

print(f"Total digests generated: {df[digest_col_name].notna().sum()}")

references_found = df[df[digest_col_name].str.contains('No references found', na=False)].shape[0]
references_not_found = df[df[digest_col_name].notna() & ~df[digest_col_name].str.contains('No references found', na=False)].shape[0]

print(f"\nReference Analysis:")
print(f"References found: {references_not_found}")
print(f"No references found: {references_found}")
print(f"Success rate: {references_not_found/(references_not_found + references_found)*100:.1f}%")

DIGEST ANALYSIS for DeepSeek-R1
Total digests generated: 30

Reference Analysis:
References found: 30
No references found: 0
Success rate: 100.0%
