In [None]:
import pandas as pd
import re
import numpy as np
import json
import sqlite3
from ollama_tools import ask_llm
import fitz
import os
from ollama import chat
from ollama import Client
from ollama import ChatResponse
from ollama import generate

In [None]:
# Ollama client pointed at the server listening on localhost:11434.
# NOTE(review): `client` is not referenced by any other cell visible in this notebook.
client = Client(host='http://localhost:11434')

In [3]:
# Excel workbook that the research-question sheets below are parsed from.
file_path = "data-extration.xlsx"
xls = pd.ExcelFile(file_path)
# Names of the sheets that get parsed and then merged on their "reference" column.
rq_sheets = ["RQ2", "RQ3_Systems", "RQ4_Learning", "RQ5_Partition_Method", "RQ5_Metric"]

def extract_citation_key(cite):
    """Return the key inside a LaTeX ``\\cite{...}`` command.

    Non-string values (e.g. NaN cells) and strings that contain no
    ``\\cite{...}`` are handed back unchanged.
    """
    if not isinstance(cite, str):
        return cite

    found = re.search(r"\\cite{(.+?)}", cite)
    if found is None:
        return cite
    return found.group(1)

# Parse every research-question sheet, clean it, collapse it to one row per
# reference, and finally outer-merge all sheets into `merged_df`.
processed_sheets = {}
for sheet in rq_sheets:
    df = xls.parse(sheet)
    # Normalise header names: cast to str, trim whitespace, lower-case.
    df.columns = [str(col).strip().lower() for col in df.columns]
    
    # Drop columns whose (normalised) name starts with one of these prefixes.
    df = df.loc[:, ~df.columns.str.contains('^unnamed')]
    df = df.loc[:, ~df.columns.str.contains('^comentário')]
    df = df.loc[:, ~df.columns.str.contains('^obs:')]
    
    # Remove rows in which every cell is missing.
    df = df.dropna(how='all')
    
    if "reference" in df.columns:
        # Reduce "\cite{key}" cells to the bare key.
        df["reference"] = df["reference"].apply(extract_citation_key)
        
        # One aggregator per non-key column: the list of unique non-null values
        # of the group, or NaN when the group has no non-null value.
        agg_dict = {}
        for col in df.columns:
            if col != "reference":
                agg_dict[col] = lambda x: list(x.dropna().unique()) if len(x.dropna()) > 0 else np.nan
        
        df = df.groupby("reference", as_index=False).agg(agg_dict)
        
        # Unwrap single-element lists so that a lone value is stored as a scalar.
        for col in df.columns:
            if col != "reference":
                df[col] = df[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)

    # NOTE(review): a sheet without a "reference" column is stored unaggregated
    # here, yet the merge below joins on "reference" — confirm every sheet has it.
    processed_sheets[sheet] = df

# NOTE: after the loop `df` stays bound to the last processed sheet only.
# Outer-join the per-sheet frames on the reference key, starting from the first sheet.
merged_df = processed_sheets[rq_sheets[0]]
for sheet in rq_sheets[1:]:
    merged_df = merged_df.merge(processed_sheets[sheet], on="reference", how="outer")

# Keep only the text after the last "." of each column name.
# NOTE(review): this also truncates any genuine column name containing a dot — confirm.
merged_df.columns = [col.split('.')[-1] if '.' in col else col for col in merged_df.columns]

# Discard helper columns that are not needed downstream.
# NOTE(review): presumably all three columns always exist after the merge — verify.
merged_df.drop(columns=["related", "ref", "cite"], inplace=True)
merged_df



Unnamed: 0,reference,sampling method,systemname,domain,nfp,strategy,dataset,technique,partition method,evaluation metric
0,Alshehri2023,reamostragem SMOTE,Eclipse,system files,,,,"[AdaBoost com J48, J48]",Validação cruzada,"[Recall , Precision, Medida F ]"
1,Alves2020,"[Coverage-based , Solver-based, Randomized ...",x264,,"Tempo de codificação e codificação, tamanho",,https://github.com/jualvespereira/ICPE2020,regressão linear múltipla,NI,MRE
2,Arcaini2020,Não identifiquei,No identificado,,,,,,,
3,Ballesteros2021,NI,"[x264, Wget, Berkeley DB Memory, Sensor Networ...",,Population Size / Archive Size: 400; Number of...,SI,,regressão linear,NI,Coverage Metric (CM)
4,Chen2022,amostragem adaptativa com d-Simplexed,Spark,Database System,"Count,Executor Memory,Executor Threads, Memory...",,,Rede Neural Multicamadas (NN),Bootstrap,MAPE
...,...,...,...,...,...,...,...,...,...,...
59,tipu2022:cc,Random Sampling,"[MPI-I/O, SEG-Y I/O]",,"[Number of MPI node, MPI processes per node, S...",,,Artificial Neural Networks,,"[accuracy, MSE, MAE, MAPE]"
60,valov2020:icpe,amostragem pseudoaleatória,"[BZIP2, GZIP, XZ, FLAC, x264]",,,,,"[árvores de regressão, Regressão linear simple]",,"[MAPE, LOOCV]"
61,vitui2021:ese,Amostragem aleatória,"[Open-Src, Entprz. 1, Entprz. 2]",,,,,"[Random Forest, XGBoost trees, Multi-Layer Per...","[cross-validation, validação cruzada leave-one...","[Median Percentage Deviation, MAPE, MAE, (MSE,..."
62,yufei2024:jss,"[Random Sampling, Neighborhood Sampling, Input...","[SQLite, BDB-C, BDB-J, LLVM, Sac, Apache, x264...","[Database, Compiler, Web Server, Video Encoder...","[Execution Time, Response Time, Video Encoding...",Execution,https://github.com/RSFIN/RSFIN/tree/master/data,"[Artificial Neural Networks (ANN), Deep Learni...",,


In [14]:
# Spot-check: show the merged row(s) whose reference key is "lesoil2024".
merged_df[merged_df["reference"] == "lesoil2024"]

Unnamed: 0,reference,sampling method,systemname,domain,nfp,strategy,dataset,technique,partition method,evaluation metric
46,lesoil2024,"[K-means, HDBScan, Amostragem aleatória, Submo...","[gcc, ImageMagick, lingeling, nodeJS, poppler,...","[.c programs, images, SAT formulae, .js script...","[size, ctime, exec, size, time, #confl.,#reduc...",EX,,"[OLS Regression, Desicion Tree, Random forest,...",NI,Mean Absolute Percentage Error (MAPE)


In [7]:
# Replace every ":" in the reference keys with "_" (e.g. "tipu2022:cc" -> "tipu2022_cc").
# NOTE(review): presumably done so the keys match the PDF file names — confirm.
merged_df["reference"] = merged_df["reference"].str.replace(":", "_", regex=False)

In [None]:
# messages=[
#   {
#     'role': 'user',
#     'content': 'Why is the sky blue?',
#   },
# ])


def build_message(topic: str, context: str, question: str, some_answer_examples: str, answer_prefix: str, base_text: str, text_example: str, answer_text_example: str, model: str) -> list:
    """Build the chat message list used to query an LLM about one article.

    The result is a list of three ``{'role': ..., 'content': ...}`` dicts:

    1. a ``system`` message describing the task, the ``topic``/``context``,
       example answers and the required ``answer_prefix``;
    2. a ``user`` message holding ``question`` and the article ``base_text``;
    3. an ``assistant`` message holding one worked example built from
       ``question``, ``text_example`` and ``answer_text_example``.

    Args:
        topic: Name of the concept being asked about.
        context: Short explanation of the concept, inserted into the system prompt.
        question: The question to answer.
        some_answer_examples: Example answers listed in the system prompt.
        answer_prefix: Phrase the answer is required to start with.
        base_text: Text of the article to analyse.
        text_example: Text used in the worked example.
        answer_text_example: Expected answer for the worked example.
        model: Currently unused by this function.

    Returns:
        list: The three message dicts in the order system, user, assistant.
        (The annotation used to say ``str``, which did not match the value
        actually returned.)

    NOTE(review): the worked example is sent as a trailing assistant turn
    *after* the real user question; few-shot examples are usually placed
    before the final user message — confirm this ordering is intended.
    """
    messages = [{
        'role': "system",
        'content': f"""You are an expert scientific article analyzer. Your task is to extract specific information 
        from scientific texts based on provided questions and context. When asked about {topic}, understand that {context}
        Examples include: {some_answer_examples}
        Your answer should be concise and directly address the question based on the provided text, starting with the phrase:{answer_prefix}"""
    },
    {
        'role': "user",
        'content': f"""Please answer the question: {question}, based on the following text: {base_text}"""
    },{
        'role': "assistant",
        'content': f"""Question: {question}\n base_text: {text_example}\nAnswer: {answer_text_example}
        """
    }]

    return messages
    
    

In [None]:
# def get_prompt_by_metric_zero_shot(metric: str) -> str:
#     match(metric):
#         case "sampling method":
#             return """
#             Q: What sampling method is used in the paper?
#             A: The sampling method used in the paper is:
#             """
#         case "dataset":
#             return """
#             Q: What dataset is used in the paper?
#             A: The dataset used in the paper is:
#             """
#         case "technique": # check the method name
#             return """
#             Q: What learning method is used in the paper?
#             A: The learning method used in the paper is:
#             """
#         case "partition method":
#             return """
#             Q: What partition method is used in the paper?
#             A: The partition method used in the paper is:
#             """
#         case "evaluation metric":
#             return """
#             Q: What evaluation metric is used in the paper?
#             A: The evaluation metric used in the paper is:
#             """
#         case _:
#             raise ValueError(f"Unknown metric: {metric}")
        
# def get_prompt_by_metric_with_context(metric: str) -> str:
#     match(metric):
#         case "sampling method":
#             return """
#             Context: sampling methods are used to select a representative subset from a dataset. They are often used to reduce the size of the dataset while maintaining its diversity and characteristics.
#             Examples: Random sampling, Stratified sampling, Systematic sampling, Cluster sampling, Convenience sampling, Purposive sampling, Snowball sampling and etc... .
#             Q: What sampling method is used in the paper?
#             A: The sampling method used in the paper are/is:
#             """
#         case "dataset":
#             return """
#             Context: datasets are collections of data used for training and evaluating machine learning models. They can vary in size, type, and structure.
#             Types of datasets: Image datasets, Text datasets, Audio datasets, Video datasets, Tabular datasets and etc... .
#             Formats of datasets: CSV, JSON, XML, Parquet, HDF5 and etc... .
#             Q: What dataset is used in the paper?
#             A: The dataset used in the paper are:
#             """
#         case "technique":
#             return """
#             Context: techniques refer to the methods or approaches used in the Machine Learning field to solve specific problems or tasks. They are normally categorized in supervised learning (classification, regression), unsupervised learning (clustering, dimensionality reduction).
#             Another special category is Deep Learning (Convolutional Neural Networks, Recurrent Neural Networks, Transformers, Generative Adversarial Networks and etc... ).
#             Examples of techniques: Decision Trees, Support Vector Machines, Neural Networks, k-Nearest Neighbors, Random Forests, Gradient Boosting Machines and etc... .
#             Q: What technique is used in the paper?
#             A: The technique used in the paper is:
#             """
#         case "partition method":
#             return """
#             Context: partition methods are used to divide a dataset into subsets for training, validation, and testing. They help in evaluating the performance of machine learning models by ensuring that the model is tested on unseen data.
#             Examples of partition methods: Holdout method, k-Fold Cross-Validation, Stratified k-Fold Cross-Validation, Leave-One-Out Cross-Validation (LOOCV), Time Series Split and etc... .
#             Q: What partition method is used in the paper?
#             A: The partition method used in the paper is:
#             """
#         case "evaluation metric":
#             return """
#             Context: evaluation metrics are used to assess the performance of machine learning models. They provide a quantitative measure of how well a model performs on a given task.
#             Example: accuracy, precision, recall, F1-score, ROC-AUC, mean absolute error(MAE), and mean squared error(MSE).
#             Q: What evaluation metric is used in the paper?
#             A: The evaluation metric used in the paper is:
#             """
#         case _:
#             raise ValueError(f"Unknown metric: {metric}")

In [None]:
# Plain Python list of the reference keys in merged_df, in row order.
references_list = merged_df["reference"].tolist()

In [None]:
# Names of the attributes iterated over by the LLM evaluation loop further down;
# each one is also a column of merged_df.
evaluated_metrics = ["sampling method", "dataset", "technique", "partition method", "evaluation metric"]

In [None]:
def extract_citation_key(cite):
    """Return the key inside a LaTeX ``\\cite{...}`` command.

    This cell redefines the helper from the data-loading cell, so it must
    keep the same contract: values that are not strings (NaN cells, None,
    numbers) are returned untouched instead of being passed to ``re.search``,
    which would raise ``TypeError``. Strings without a ``\\cite{...}`` are
    returned unchanged as well.
    """
    if not isinstance(cite, str):
        return cite
    match = re.search(r"\\cite{(.+?)}", cite)
    return match.group(1) if match else cite

# Lookup table: reference key -> value recorded in the spreadsheet for each
# evaluated metric.
#
# Built from `merged_df` (all sheets combined). The previous version read `df`,
# which after the loading loop is only the last parsed sheet and therefore does
# not carry all five metric columns.
result_dict = {}

for _, row in merged_df.iterrows():
    # Idempotent on keys that were already stripped of their \cite{...} wrapper.
    ref = extract_citation_key(row["reference"])
    result_dict[ref] = {metric: row[metric] for metric in evaluated_metrics}

result_dict

In [None]:
# Model tags iterated over by the local evaluation loop below.
local_llm_models = ["deepseek-r1:32b", "magistral:24b", "mistral-nemo:12b"] # TODO: test Llama 4

In [None]:
def create_llm_tables(db_path: str, llm_models: list):
    """Create the evaluation tables (if missing) and register model names.

    Safe to call repeatedly: the tables are created with ``IF NOT EXISTS`` and
    a model name is inserted only when ``llm_models`` does not already contain
    it, so re-running the notebook (or calling this once for local and once
    for remote models) no longer produces duplicate model rows.

    Args:
        db_path: Path of the SQLite database file.
        llm_models: Model names to register in the ``llm_models`` table.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS llm_models (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                model_name TEXT NOT NULL
            )
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS llm_responses (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                model_id INTEGER,
                pdf_name varchar(255),
                prompt_technique varchar(255),
                text_segmentation_stategy varchar(255),
                metric varchar(255),
                response TEXT,
                FOREIGN KEY (model_id) REFERENCES llm_models(id)
            )
        ''')

        for model in llm_models:
            # Insert only when no row with this name exists yet. The table has
            # no UNIQUE constraint, so the check is done in the statement itself.
            cursor.execute(
                'INSERT INTO llm_models (model_name) '
                'SELECT ? WHERE NOT EXISTS '
                '(SELECT 1 FROM llm_models WHERE model_name = ?)',
                (model, model),
            )

        conn.commit()
    finally:
        # Release the connection even when a statement above fails.
        conn.close()

def insert_llm_response(db_path: str, model_name: str, pdf_name: str, prompt_technique: str, metric: str, response: str, text_segmentation_stategy: str = None):
    """Store one LLM answer in the ``llm_responses`` table.

    The model is looked up by name in ``llm_models``; when it is not
    registered yet it is inserted first. The model row and the response row
    are committed together.

    Args:
        db_path: Path of the SQLite database file.
        model_name: Name of the model that produced the answer.
        pdf_name: File name of the analysed PDF.
        prompt_technique: Label of the prompting technique (e.g. "zero_shot").
        metric: Name of the attribute that was asked about.
        response: The model's answer.
        text_segmentation_stategy: Label of the text segmentation strategy.
            Optional; stored as NULL when omitted, so callers that pass only
            the first six arguments keep working.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        cursor.execute('SELECT id FROM llm_models WHERE model_name = ?', (model_name,))
        row = cursor.fetchone()
        if row is None:
            cursor.execute('INSERT INTO llm_models (model_name) VALUES (?)', (model_name,))
            model_id = cursor.lastrowid
        else:
            # fetchone() yields a 1-tuple; bind the integer id, not the tuple.
            model_id = row[0]

        cursor.execute('''
            INSERT INTO llm_responses (model_id, pdf_name, prompt_technique, metric, response, text_segmentation_stategy)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (model_id, pdf_name, prompt_technique, metric, response, text_segmentation_stategy))

        conn.commit()
    finally:
        # Release the connection even when a statement above fails.
        conn.close()

In [None]:
def find_all_paths_and_names(folder_path):
    """Walk ``folder_path`` recursively and list every file found.

    Returns a pair ``(paths, names)`` of equally long lists: ``paths[i]`` is
    the joined directory + file name and ``names[i]`` the bare file name of
    the same file, in the order produced by ``os.walk``.
    """
    all_paths, all_names = [], []
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        all_paths.extend(os.path.join(current_dir, fname) for fname in filenames)
        all_names.extend(filenames)
    return all_paths, all_names

# Collect the path and file name of every file below the papers folder.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
all_file_paths, all_file_names = find_all_paths_and_names("/home/pramos/Documents/AutoSLR/papers_pdf")

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of all pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page joined in page order, or an empty string when
        ``pdf_path`` does not exist (a message is printed and the file is
        skipped instead of failing inside ``fitz.open``).
    """
    if not os.path.exists(pdf_path):
        print(f"PDF file {pdf_path} does not exist. Skipping.")
        return ""

    doc = fitz.open(pdf_path)
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Close the document even when text extraction of a page fails.
        doc.close()

In [None]:
def find_segment_text(text: str, segment_names: list) -> str:
    """Placeholder — not implemented yet.

    The body is only ``pass``, so the function currently returns ``None``
    despite the ``-> str`` annotation.

    NOTE(review): presumably meant to return the parts of ``text`` that belong
    to the sections named in ``segment_names`` — confirm the intended
    behaviour before relying on it.
    """
    pass

In [None]:
#1) local llm tests
# For every local model and every file found in the papers folder, ask the
# model about each evaluated metric with a zero-shot and a few-shot prompt and
# store both answers in the SQLite database.
db_path = "llm_evaluation.db"
create_llm_tables(db_path, local_llm_models)

for model in local_llm_models:
    print(f"Evaluating with model: {model}")
    for path, name in zip(all_file_paths, all_file_names):
        
        # NOTE(review): the text is re-extracted for every model; it does not depend on `model`.
        text = extract_text_from_pdf(path)
        # FIXME(review): `adfas` is not defined in any visible cell, and `text` is
        # never used — presumably a placeholder for the segmentation step; confirm.
        segmented_text = adfas()

        for metric in evaluated_metrics:
            # FIXME(review): `get_prompt_by_metric_zero_shot` exists only as
            # commented-out code in an earlier cell, and `get_prompt_by_metric_few_shot`
            # is not defined in any visible cell.
            prompt_zero_shot = get_prompt_by_metric_zero_shot(metric)
            prompt_few_shot = get_prompt_by_metric_few_shot(metric)
            
            answer_zero_shot = ask_llm(prompt_zero_shot, [segmented_text], model=model)
            answer_few_shot = ask_llm(prompt_few_shot, [segmented_text], model=model)
            
            # NOTE(review): no `text_segmentation_stategy` value is passed to these calls.
            insert_llm_response(db_path, model, name, "zero_shot", metric, answer_zero_shot)
            insert_llm_response(db_path, model, name, "few_shot", metric, answer_few_shot)

        


In [None]:
#2) remote llm tests
# Uses the same database file as the local run and registers the remote model name.
db_path = "llm_evaluation.db"
create_llm_tables(db_path, ["gemini-2.5-flash-preview-05-20", ]) # TODO: does it make sense to also use "gemini-2.0-flash", "gemini-2.0-flash-lite"?