In [2]:
import pandas as pd
import re
import numpy as np
import json

In [3]:
file_path = "data-extration.xlsx"
xls = pd.ExcelFile(file_path)
rq_sheets = ["RQ2", "RQ3_Systems", "RQ4_Learning", "RQ5_Partition_Method", "RQ5_Metric"]

def extract_citation_key(cite):
    if isinstance(cite, str):
        match = re.search(r"\\cite{(.+?)}", cite)
        return match.group(1) if match else cite
    return cite

processed_sheets = {}
for sheet in rq_sheets:
    df = xls.parse(sheet)
    df.columns = [str(col).strip().lower() for col in df.columns]
    
    df = df.loc[:, ~df.columns.str.contains('^unnamed')]
    df = df.loc[:, ~df.columns.str.contains('^comentário')]
    df = df.loc[:, ~df.columns.str.contains('^obs:')]
    
    df = df.dropna(how='all')
    
    if "reference" in df.columns:
        df["reference"] = df["reference"].apply(extract_citation_key)
        
        agg_dict = {}
        for col in df.columns:
            if col != "reference":
                agg_dict[col] = lambda x: list(x.dropna().unique()) if len(x.dropna()) > 0 else np.nan
        
        df = df.groupby("reference", as_index=False).agg(agg_dict)
        
        for col in df.columns:
            if col != "reference":
                df[col] = df[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)

    processed_sheets[sheet] = df

merged_df = processed_sheets[rq_sheets[0]]
for sheet in rq_sheets[1:]:
    merged_df = merged_df.merge(processed_sheets[sheet], on="reference", how="outer")

merged_df.columns = [col.split('.')[-1] if '.' in col else col for col in merged_df.columns]

merged_df.drop(columns=["related", "ref", "cite"], inplace=True)
merged_df



Unnamed: 0,reference,sampling method,systemname,domain,nfp,strategy,dataset,technique,partition method,evaluation metric
0,Alshehri2023,reamostragem SMOTE,Eclipse,system files,,,,"[AdaBoost com J48, J48]",Validação cruzada,"[Recall , Precision, Medida F ]"
1,Alves2020,"[Coverage-based , Solver-based, Randomized ...",x264,,"Tempo de codificação e codificação, tamanho",,https://github.com/jualvespereira/ICPE2020,regressão linear múltipla,NI,MRE
2,Arcaini2020,Não identifiquei,No identificado,,,,,,,
3,Ballesteros2021,NI,"[x264, Wget, Berkeley DB Memory, Sensor Networ...",,Population Size / Archive Size: 400; Number of...,SI,,regressão linear,NI,Coverage Metric (CM)
4,Chen2022,amostragem adaptativa com d-Simplexed,Spark,Database System,"Count,Executor Memory,Executor Threads, Memory...",,,Rede Neural Multicamadas (NN),Bootstrap,MAPE
...,...,...,...,...,...,...,...,...,...,...
59,tipu2022:cc,Random Sampling,"[MPI-I/O, SEG-Y I/O]",,"[Number of MPI node, MPI processes per node, S...",,,Artificial Neural Networks,,"[accuracy, MSE, MAE, MAPE]"
60,valov2020:icpe,amostragem pseudoaleatória,"[BZIP2, GZIP, XZ, FLAC, x264]",,,,,"[árvores de regressão, Regressão linear simple]",,"[MAPE, LOOCV]"
61,vitui2021:ese,Amostragem aleatória,"[Open-Src, Entprz. 1, Entprz. 2]",,,,,"[Random Forest, XGBoost trees, Multi-Layer Per...","[cross-validation, validação cruzada leave-one...","[Median Percentage Deviation, MAPE, MAE, (MSE,..."
62,yufei2024:jss,"[Random Sampling, Neighborhood Sampling, Input...","[SQLite, BDB-C, BDB-J, LLVM, Sac, Apache, x264...","[Database, Compiler, Web Server, Video Encoder...","[Execution Time, Response Time, Video Encoding...",Execution,https://github.com/RSFIN/RSFIN/tree/master/data,"[Artificial Neural Networks (ANN), Deep Learni...",,


In [14]:
merged_df[merged_df["reference"] == "lesoil2024"]

Unnamed: 0,reference,sampling method,systemname,domain,nfp,strategy,dataset,technique,partition method,evaluation metric
46,lesoil2024,"[K-means, HDBScan, Amostragem aleatória, Submo...","[gcc, ImageMagick, lingeling, nodeJS, poppler,...","[.c programs, images, SAT formulae, .js script...","[size, ctime, exec, size, time, #confl.,#reduc...",EX,,"[OLS Regression, Desicion Tree, Random forest,...",NI,Mean Absolute Percentage Error (MAPE)


In [7]:
merged_df["reference"] = merged_df["reference"].str.replace(":", "_", regex=False)

In [10]:
len(merged_df["reference"].unique())

64

In [17]:
import os

papers_folder = "../papers_pdf"

pdf_files = []
for root, dirs, files in os.walk(papers_folder):
    for file in files:
        if file.endswith(".pdf"):
            pdf_files.append(os.path.splitext(file)[0])

unique_references = merged_df["reference"].unique()
size = len(unique_references)


missing_papers = [pdf for pdf in pdf_files if pdf not in unique_references]

if len(unique_references) == size:
    print("All 64 papers are present in the merged_df.")
else:
    print(f"Number of unique papers in merged_df: {len(unique_references)}")
    print(f"Missing papers: {missing_papers}")

All 64 papers are present in the merged_df.


In [18]:
pdf_files = []
for root, dirs, files in os.walk(papers_folder):
    for file in files:
        if file.endswith(".pdf"):
            pdf_files.append(os.path.abspath(os.path.join(root, file)))

print(pdf_files)

['/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Krishna2021.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Xiang2022.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/tërnava2022.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Kaltenecker2019.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Muhlbauer2020.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Silva2023.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Salman2023.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Iqbal2023.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/alves2020-icpe.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/chen2023.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/mehlstäubl2022.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/schmid2022.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/damasceno2019.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/li2020-ase.pdf', '/home/PUC/Documentos/AutoSLR/papers_pdf/Scopus/Iorio2019.pdf', '/home/