In [1]:
%%capture
!pip install multiversity

from multiversity.multicare_dataset import MedicalDatasetCreator

In [2]:
import os
import pandas as pd

In [3]:
current_directory = os.getcwd()
print(f"Current directory: {current_directory}")

os.chdir(f'{current_directory}')
print(f"Working directory set to: {os.getcwd()}")

Current directory: /home/onyxia/work/projet_NLP
Working directory set to: /home/onyxia/work/projet_NLP


# 1. Downloading the Whole MultiCaRe Dataset

In [5]:
mdc = MedicalDatasetCreator(directory = 'medical_datasets')

The MultiCaRe Dataset is already downloaded.
Importing and pre-processing the main files.
Done!


In [None]:
sorted(os.listdir('medical_datasets/whole_multicare_dataset'))

['PMC1',
 'PMC2',
 'PMC3',
 'PMC4',
 'PMC5',
 'PMC6',
 'PMC7',
 'PMC8',
 'PMC9',
 'abstracts.parquet',
 'captions_and_labels.csv',
 'case_images.parquet',
 'cases.parquet',
 'data_dictionary.csv',
 'metadata.parquet']

# 2. Reading of dataset

In [6]:
data_dictionary = pd.read_csv('medical_datasets/whole_multicare_dataset/data_dictionary.csv')

data_dictionary.head()

Unnamed: 0,file,field,explanation
0,captions_and_labels.csv,file_id,Primary key for each row. Each row contains on...
1,captions_and_labels.csv,file,Name of the image file. The file path can be d...
2,captions_and_labels.csv,main_image,Id from the original image (it corresponds to ...
3,captions_and_labels.csv,image_component,It is 'undivided' if the source image was not ...
4,captions_and_labels.csv,patient_id,"Id of the patient, created combining the PMC o..."


In [7]:
mdc.full_cases

{'cases': [[{'age': 53,
    'case_id': 'PMC3738355_01',
    'case_text': 'A 53-year-old woman presented with a 10-year history of intermittent abdominal pain, swelling and continuous vomiting. The patient denied presence of fever, nausea, and weight loss. There were no significant findings at physical examination. An abdominal ultrasound exam revealed a 10.4 x 10.0 cm mass of heterogeneous echogenicity in the left upper abdomen. Axial unenhanced CT scan (Fig. 1a) confirmed an ill-defined heterogeneous mass 9.1 cm in diameter that filled the left abdominal quadrant, located between the left lobe of the liver and the spleen. The CT attenuation of the mass was around 26-53 HU on non-enhanced scan. After intravenous contrast medium injection the mass enhanced gradually and heterogeneously (Fig. 1b-e), but was still hypodense relative to the spleen. There was no accompanying lymphadenopathy or evidence of malignant process elsewhere in the abdomen. Because the origin of the mass was unknown

In [8]:
nb_cases = len(mdc.full_cases['cases'])

print(f'nombre de cas cliniques disponibles : {nb_cases}')

nombre de cas cliniques disponibles : 85653


In [30]:
file_full_df = "full_df"
path = f'data/{file_full_df}'

# Vérifier si le fichier existe avant d'exécuter le traitement
if not os.path.exists(path):
    df = pd.DataFrame()
    for i in range(len(mdc.full_cases['cases'])):
        temp = pd.DataFrame(mdc.full_cases['cases'][i])
        df = pd.concat([df, temp])

    df.to_parquet(path)
    print("Fichier 'full_df' créé avec succès.")
else:
    print("Le fichier 'full_df' existe déjà. Aucune action effectuée.")

Fichier 'full_df' créé avec succès.


In [37]:
mdc.full_cases['cases'][5]

[{'age': 40,
  'case_id': 'PMC9106225_01',
  'case_text': 'A 40-year-old female patient was referred for evaluation of an asymptomatic swelling on the lower lip of 6 months duration with a history of local trauma. Clinical examination revealed a well-defined, nontender, smooth-surfaced, roughly oval, fluctuant swelling [Figure 1]. No relevant medical history was elicited. Oral hygiene was fair. A provisional diagnosis of mucocele was made; the lesion was excised under local anesthesia. Histopathological examination of the excised tissue revealed cystic lumen devoid of lining epithelium and surrounded by compressed granulation tissue and peripherally located mixed salivary glands. The lumen was filled with numerous mucinous globular structures which were oval or round in shape and of varying sizes [Figure 2]. Most of the globules present in the cystic lumen were attached to the surrounding cystic capsule and seemed dissociated from one another because of their globular organization. Som

In [9]:
df_metadata = pd.DataFrame(mdc.full_metadata['article_metadata'])
df_metadata.shape

(85653, 14)

In [36]:
df_metadata['major_mesh_terms'] = df_metadata['major_mesh_terms'].apply(lambda x: None if x == [] else x)
df_metadata.head(10)

Unnamed: 0,authors,case_amount,doi,journal,journal_detail,keywords,license,link,major_mesh_terms,mesh_terms,pmcid,pmid,title,year
0,"[Wenhua Liang, Jingjing Lu, Mingwei Qin, Xinti...",1,10.1258/arsr.2012.120031,Acta Radiol Short Rep,2012 Jul 31;1(6):arsr.2012.120031.,"[vascular tumor, diagnostic imaging, liver, pa...",CC BY-NC,https://pubmed.ncbi.nlm.nih.gov/23986846/,,[Case Reports],PMC3738355,23986846,Littoral cell angioma mimicking hepatic tumor,2012
1,"[Payal J Shah, Brian Ellis, Lauren R DiGiovine...",1,10.3205/oc000036,GMS Ophthalmol Cases,2015 Dec 2;5:Doc14.,"[mri, aortic arch, carotid artery stenosis, ca...",CC BY,https://pubmed.ncbi.nlm.nih.gov/27625958/,,[Case Reports],PMC5015624,27625958,Central retinal artery occlusion following las...,2015
2,"[Nobuhiko Seki, Maika Natsume, Ryosuke Ochiai,...",1,10.1159/000493088,Case Rep Oncol,2019 Jan 21;12(1):91-97.,"[bevacizumab, egfr mutation, erlotinib, lung c...",CC BY-NC,https://pubmed.ncbi.nlm.nih.gov/30792648/,,[Case Reports],PMC6381877,30792648,Promising Combination Therapy with Bevacizumab...,2019
3,"[Ali Raza Ghani, Faisal Inayat, Nouman Safdar ...",9,10.1177/2324709618770479,J Investig Med High Impact Case Rep,2018 Apr 18;6:2324709618770479.,"[acute coronary syndrome, diagnosis, fibromusc...",CC BY,https://pubmed.ncbi.nlm.nih.gov/29707593/,,[],PMC5912312,29707593,Spontaneous Coronary Artery Dissection: A Case...,2018
4,"[Xiangyi Kong, Dongmei Li, Yanguo Kong, Dingro...",1,10.1097/MD.0000000000005657,Medicine (Baltimore),2017 Jan;96(4):e5657.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/28121922/,"[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...","[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...",PMC5287946,28121922,Malignant adenohypophysis spindle cell oncocyt...,2017
5,"[Rutuja Gajanan Vidhale, Subraj Shetty, Nikita...",1,10.4103/jomfp.jomfp_214_21,J Oral Maxillofac Pathol,2022 Jan-Mar;26(1):101-103.,"[mucocele, mucus extravasation cyst, myxoglobu...",CC BY-NC-SA,https://pubmed.ncbi.nlm.nih.gov/35571290/,,[Case Reports],PMC9106225,35571290,A novel mucocele: Myxoglobulosis,2022
6,"[Liaqat A Khan, Ali M Al-Neami, Ayman F Solima...",1,10.1016/j.ijscr.2020.03.051,Int J Surg Case Rep,2020;71:364-366.,"[bochdalek, diaphragm, emergency care, hernia,...",CC BY,https://pubmed.ncbi.nlm.nih.gov/32506006/,,[Case Reports],PMC7276389,32506006,Bochdalek hernia with retrocardiac spleen - Di...,2020
7,"[Ryuji Kawaguchi, Naoto Furukawa, Yoshihiko Ya...",2,10.1159/000330239,Case Rep Oncol,2011;4(2):358-62.,"[alpha-fetoprotein, hepatoid adenocarcinoma, r...",CC BY-NC-ND,https://pubmed.ncbi.nlm.nih.gov/21769295/,,[Case Reports],PMC3134036,21769295,Carcinosarcoma of the uterine corpus with alph...,2011
8,"[Ludovica Fucci, Lorenzo Gensini, Ugo Coppetel...",1,10.1016/j.lrr.2022.100330,Leuk Res Rep,2022 May 30;17:100330.,"[daratumumab, multiple myeloma, real-word expe...",CC BY-NC-ND,https://pubmed.ncbi.nlm.nih.gov/35694449/,,[Case Reports],PMC9184873,35694449,Daratumumab triplet therapies in patients with...,2022
9,"[Pradyumna Agasthi, Hemalatha Narayanasamy, Da...",1,10.1155/2018/3953579,Case Rep Cardiol,2018 Sep 30;2018:3953579.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/30363961/,,[Case Reports],PMC6186336,30363961,Decitabine Induced Delayed Cardiomyopathy in H...,2018


In [32]:
print('nombre de ligne avec keywords non vide : ' + str(df_metadata.keywords.notna().sum()))
print('nombre de ligne avec major_mesh_terms non vide : ' + str(df_metadata.major_mesh_terms.notna().sum()))

nombre de ligne avec keywords non vide : 59628
nombre de ligne avec major_mesh_terms non vide : 13211


In [None]:
file = "df_metadata"
path = f'data/{file}'

if not os.path.exists(path):
    df_metadata.to_parquet(path)
else:
    print("Le fichier 'df_metadata' existe déjà. Aucune action effectuée.")


Le fichier 'df_metadata' existe déjà. Aucune action effectuée.
