In [None]:
import pandas as pd
import re
import numpy as np
from extrac_text import extract_text
from ollama_tools import ask_llm
import json

In [None]:
# Source workbook and the names of the sheets that are parsed and merged below.
file_path = "data-extration.xlsx"
xls = pd.ExcelFile(file_path)

rq_sheets = [
    "RQ2",
    "RQ3_Systems",
    "RQ4_Learning",
    "RQ5_Partition_Method",
    "RQ5_Metric",
]

def extract_citation_key(cite):
    r"""Return the bare citation key contained in a ``\cite{...}`` string.

    Parameters
    ----------
    cite : object
        A spreadsheet cell value. Only ``str`` values are inspected.

    Returns
    -------
    object
        * For a string containing ``\cite{key}``: ``key`` with surrounding
          whitespace removed.
        * For any other string: the string itself, stripped of surrounding
          whitespace.
        * For non-strings (e.g. ``NaN``, numbers, ``None``): the value unchanged.

    Notes
    -----
    The result is used as a join key across sheets, so stray spaces typed
    into a cell must not produce distinct keys for the same paper.
    """
    if not isinstance(cite, str):
        return cite

    # Braces are escaped so the pattern never depends on how the regex engine
    # treats a bare "{" that is not a valid repetition quantifier.
    match = re.search(r"\\cite\{(.+?)\}", cite)
    key = match.group(1) if match else cite
    return key.strip()

def _collect_unique(values):
    """Return the distinct non-null entries of a group as a list, or NaN if there are none."""
    present = values.dropna()
    if len(present) > 0:
        return list(present.unique())
    return np.nan


def _unwrap_singleton(cell):
    """Replace a one-element list by its only element; leave anything else untouched."""
    if isinstance(cell, list) and len(cell) == 1:
        return cell[0]
    return cell


# Clean every sheet and collapse it to one row per reference.
processed_sheets = {}
for sheet_name in rq_sheets:
    sheet_df = xls.parse(sheet_name)
    sheet_df.columns = [str(name).strip().lower() for name in sheet_df.columns]

    # Discard columns whose (lower-cased) name starts with one of these prefixes,
    # then remove rows that are empty in every remaining column.
    discard = sheet_df.columns.str.contains(r'^(?:unnamed|comentário|obs:)')
    sheet_df = sheet_df.loc[:, ~discard].dropna(how='all')

    if "reference" in sheet_df.columns:
        sheet_df["reference"] = sheet_df["reference"].apply(extract_citation_key)

        # Every non-key column is aggregated to the list of its distinct values.
        aggregations = {
            column: _collect_unique
            for column in sheet_df.columns
            if column != "reference"
        }
        sheet_df = sheet_df.groupby("reference", as_index=False).agg(aggregations)

        # A list holding a single value is stored as that value directly.
        for column in aggregations:
            sheet_df[column] = sheet_df[column].apply(_unwrap_singleton)

    processed_sheets[sheet_name] = sheet_df

# Outer-join all sheets on the citation key, starting from the first one.
first_sheet, *remaining_sheets = rq_sheets
merged_df = processed_sheets[first_sheet]
for sheet_name in remaining_sheets:
    merged_df = pd.merge(merged_df, processed_sheets[sheet_name], on="reference", how="outer")

# Keep only the part of each column name that follows its last dot.
merged_df.columns = [name.rsplit('.', 1)[-1] for name in merged_df.columns]

merged_df.drop(columns=["related", "ref", "cite"], inplace=True)
merged_df



In [None]:
# Show the merged row(s) for one specific citation key.
merged_df.loc[merged_df["reference"] == "yufei2024:jss"]

In [None]:
# List every distinct citation key present after the merge.
merged_df.loc[:, "reference"].unique()

In [None]:
import os

papers_folder = "papers"

# File-name stems (extension removed) of every ".pdf" file found below papers_folder.
pdf_files = []
for root, dirs, files in os.walk(papers_folder):
    for file in files:
        if file.endswith(".pdf"):
            pdf_files.append(os.path.splitext(file)[0])

unique_references = merged_df["reference"].unique()
size = len(unique_references)

# Set membership avoids an element-wise array comparison for every PDF.
known_references = set(unique_references)

# PDFs on disk whose file-name stem does not appear as a reference in merged_df.
missing_papers = [pdf for pdf in pdf_files if pdf not in known_references]

# The outcome must depend on missing_papers: comparing len(unique_references)
# with itself is always true and would report success unconditionally.
if not missing_papers:
    print(f"All {len(pdf_files)} papers are present in the merged_df.")
else:
    print(f"Number of unique papers in merged_df: {size}")
    print(f"Missing papers: {missing_papers}")

In [None]:
# Absolute path of every ".pdf" file below papers_folder, in os.walk order.
pdf_files = [
    os.path.abspath(os.path.join(folder, name))
    for folder, _subdirs, names in os.walk(papers_folder)
    for name in names
    if name.endswith(".pdf")
]

print(pdf_files)

In [7]:
# Instruction prefixes; each one is later combined with the full question list.
prompts_base = [
    "using only the metodology part answer the following questions in topics: ",
    "do not consider the related work section for answer this questions in topics: ",
]

# Questions appended to every instruction prefix.
questions = [
    "What is the sampling method used?",
    "What are the systems used?",
    "Which is the domain of the systems used?",
    "What is the learning method used?",
    "What is the partition method used?",
    "What is the Non-Functional Performance metric used?",
    "Is there any reference for the dataset?",
    "What are the machine learning algorithms used in this article?",
    "What are the performance metrics used in this article?",
    "What are the partition methods used?",
    "What are the evaluation methods used in this article?",
]

In [None]:
# Each final prompt is an instruction prefix, one space, then all questions
# joined by single spaces.
questions_str = " ".join(questions)
final_prompts = [f"{base} {questions_str}" for base in prompts_base]

final_prompts

In [9]:
# Model identifiers handed to ask_llm, one run per entry.
models = [
    "mistral-nemo",
    "qwen:1.8b",
]

In [None]:
# Extract the text of every PDF exactly once. The text does not depend on the
# model or the prompt, so extracting it inside the model/prompt loops would
# repeat the same work len(models) * len(final_prompts) times per file.
extracted_texts = {}
for pdf_path in pdf_files:
    text = extract_text(pdf_path)
    if text:
        extracted_texts[pdf_path] = text
    else:
        print(f"Error extracting text from {pdf_path}")

# result_dict[pdf_path][model][prompt] -> answer returned by ask_llm.
# Only truthy answers are stored; PDFs without extracted text are skipped.
result_dict = {}
for model in models:
    for prompt in final_prompts:
        for pdf_path, text in extracted_texts.items():
            result = ask_llm(text, prompt, model)
            if result:
                result_dict.setdefault(pdf_path, {}).setdefault(model, {})[prompt] = result

with open("result_dict.json", "w") as json_file:
    json.dump(result_dict, json_file, indent=4)