In [131]:
import pandas as pd
import numpy as np
from dirty_cat import fuzzy_join
import math
import glob
import shutil

In [52]:
papers_info = pd.read_csv("../../Results/extraction/papers_infos_openalex.csv")
df_oa_reference_allInfo = pd.read_csv("../../Results/extraction/oa_papers_datasets_reference.csv").drop_duplicates(["name"])
df_oa_reference_allInfo["year"] = papers_info["year"]
df_oa_reference_allInfo["venue"] = papers_info["venue"]
df_oa_reference_allInfo = df_oa_reference_allInfo[((df_oa_reference_allInfo["year"]>=2013) & (df_oa_reference_allInfo["year"]<=2023))| (df_oa_reference_allInfo["year"].isna())]

df_oa_abstract_allInfo = pd.read_csv("../../Results/extraction/oa_papers_datasets_abstract.csv").drop_duplicates(["name"])
df_oa_abstract_allInfo["year"] = papers_info["year"]
df_oa_abstract_allInfo["venue"] = papers_info["venue"]
df_oa_abstract_allInfo = df_oa_abstract_allInfo[((df_oa_abstract_allInfo["year"]>=2013) & (df_oa_abstract_allInfo["year"]<=2023)) | (df_oa_abstract_allInfo["year"].isna())]

fulltext_abstract_info = pd.read_csv("../../Results/extraction/fulltext_datasets_abstract.csv").drop_duplicates(["name"])
fulltext_references_info = pd.read_csv("../../Results/extraction/fulltext_datasets_references.csv").drop_duplicates(["name"])
fulltext_method_info = pd.read_csv("../../Results/extraction/fulltext_datasets_method.csv").drop_duplicates(["name"])
#fulltext_results_info = pd.read_csv("../../Results/extraction/fulltext_datasets_results.csv").drop_duplicates(["name"])

common_papers = fuzzy_join(
    fulltext_abstract_info,  # our table to join
    df_oa_abstract_allInfo,  # the table to join with
    left_on="name",  # the first join key column
    right_on="name",  # the second join key column
    return_score=True,
)

#Names for fulltext df
common_papers_x = common_papers["name_x"]

#Name for OA df
common_papers_y = common_papers["name_y"]

fulltext_abstract_info["venue"] = common_papers["venue"]
fulltext_references_info["venue"] = common_papers["venue"]
fulltext_method_info["venue"] = common_papers["venue"]
#fulltext_results_info["venue"] = common_papers["venue"]



In [53]:
datasets_columns = ["ACDC","BRATS","LIDC-IDRI","DRIVE","PROMISE12","Chexpert","PadChest","PAD-UFES-20","CAMELYON","CADDementia","MRNet","PROSTATEx","MIMIC","CBIS-DDSM"]
venues = ["MICCAI","MIDL"]

In [70]:
fulltext_references_info["name"] = fulltext_references_info["name"].apply(lambda x:x.removesuffix("pdf"))
combination = df_oa_reference_allInfo[df_oa_reference_allInfo["name"].isin(common_papers_y)][datasets_columns].reset_index(drop=True)|fulltext_references_info[fulltext_references_info["name"].isin(common_papers_x)][datasets_columns].reset_index(drop=True)
combination.index = common_papers["name_x"]
df_merge_ref = df_oa_reference_allInfo.set_index(["name"])
df_merge_ref.update(combination)

In [101]:
fulltext_abstract_info["name"] = fulltext_abstract_info["name"].apply(lambda x:x.removesuffix("pdf"))
combination = df_oa_abstract_allInfo[df_oa_abstract_allInfo["name"].isin(common_papers_y)][datasets_columns].reset_index(drop=True)|fulltext_abstract_info[fulltext_abstract_info["name"].isin(common_papers_x)][datasets_columns].reset_index(drop=True)
combination.index = common_papers["name_x"]
df_merge_abs = df_oa_abstract_allInfo.set_index(["name"])
df_merge_abs.update(combination)

In [118]:
have_ft  = df_merge_ref[df_merge_ref.index.isin(common_papers_y)][datasets_columns] | df_merge_abs[df_merge_abs.index.isin(common_papers_y)][datasets_columns]
# have_ft = df_merge_ref[df_merge_ref.index.isin(common_papers_y)] | df_merge_abs[df_merge_abs.index.isin(common_papers_y)]
have_ft["year"] = df_merge_ref[df_merge_ref.index.isin(common_papers_y)]["year"]
have_ft["venue"] = df_merge_ref[df_merge_ref.index.isin(common_papers_y)]["venue"]
detected_papers_name = have_ft[have_ft[datasets_columns].any(axis=1)]

In [119]:
have_a_detection = have_ft[(have_ft.index.isin(detected_papers_name.index)) & have_ft["venue"].isin(venues)]
no_detection = have_ft[(~have_ft.index.isin(detected_papers_name.index)) & have_ft["venue"].isin(venues)]

In [122]:
have_a_detection.groupby("venue").count()["ACDC"]

venue
MICCAI    215
MIDL       20
Name: ACDC, dtype: int64

In [123]:
no_detection.groupby("venue").count()["ACDC"]

venue
MICCAI    2096
MIDL       190
Name: ACDC, dtype: int64

In [None]:
nb_papers = 65
for venue in venues:
    detect_venue = have_a_detection[have_a_detection["venue"] == venue]
    index_jump = math.floor(len(detect_venue)/nb_papers)
    index_jump = max(1,index_jump) # In case there is not enough paper
    names = detect_venue.iloc[::index_jump, :]["name"]
    names.to_csv(f"../../Results/analysis/detect_{venue}.csv",index=False)
    print(f"Number of screened papers with detection for {venue}: {len(names)}") 
    
    no_detect_venue = no_detection[no_detection["venue"] == venue]
    index_jump = math.floor(len(no_detect_venue)/nb_papers)
    index_jump = max(1,index_jump)
    names = no_detect_venue.iloc[::index_jump, :]["name"]
    names.to_csv(f"../../Results/analysis/no_detect_{venue}.csv",index=False)
    print(f"Number of screened papers without detection for {venue}: {len(names)}")

Number of screened papers with detection for MICCAI: 84
Number of screened papers without detection for MICCAI: 67
Number of screened papers with detection for MIDL: 50
Number of screened papers without detection for MIDL: 81


In [42]:
#Merge csv
lst_df = []
for path in glob.glob("../../Results/analysis/*.csv"):
    df = pd.read_csv(path)
    lst_df.append(df.copy())
df_concat = pd.concat(lst_df)

In [141]:
#Create folds for papers that have a detection
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)
folds = skf.split(have_a_detection, have_a_detection.venue)
with open("../../Results/folds/detect_folds.csv","w") as detect_folds_file:
    detect_folds_file.write("fold_id,pdf_name")
    for i,(_,fold_indexes) in enumerate(folds):
        for pdf in have_a_detection.iloc[fold_indexes].iterrows():
            filename = pdf[0].replace("/"," ")
            try:
                shutil.copy(f"../../Results/extraction/fulltext/{filename}.pdf",f"../../Results/folds/detect/{filename}.pdf")
                detect_folds_file.write(f'\n{i},"{filename}"')
            except:
                print("ERROR",{filename})

have_a_detection.iloc[fold_indexes].groupby("venue").count()["ACDC"]

venue
MICCAI    21
MIDL       2
Name: ACDC, dtype: int64

In [142]:
skf = StratifiedKFold(n_splits=30,shuffle=True,random_state=1907)
folds = skf.split(no_detection, no_detection.venue)
with open("../../Results/folds/no_detect_folds.csv","w") as no_detect_folds_file:
    no_detect_folds_file.write("fold_id,pdf_name")
    for i,(_,fold_indexes) in enumerate(folds):
        for pdf in no_detection.iloc[fold_indexes].iterrows():
            filename = pdf[0].replace("/"," ")
            try:
                shutil.copy(f"../../Results/extraction/fulltext/{filename}.pdf",f"../../Results/folds/no_detect/{filename}.pdf")
                no_detect_folds_file.write(f'\n{i},"{filename}"')
            except:
                print("ERROR",{filename})
no_detection.iloc[fold_indexes].groupby("venue").count()["ACDC"]

ERROR {'Learning the Latent Heat Diffusion Process through Structural Brain Network from Longitudinal β-Amyloid Data'}
ERROR {'Residual learning for 3D motion corrected quantitative MRI: Robust clinical T1 T2 and proton density mapping'}


venue
MICCAI    69
MIDL       7
Name: ACDC, dtype: int64