In [1]:
import os
import s3fs
import pandas as pd

# 1. Chargement des fichiers

 
> Chargement, depuis le stockage S3 (AWS), du fichier full_df qui contient l'ensemble des cas cliniques au format texte ainsi qu'un "**case_id**"
>
> Chargement à partir du fichier 'df_metadata' qui contient les diagnostics et les keywords liés à chaque cas clinique ainsi qu'un "**pmcid**" (identifiant de l'article, qui correspond au préfixe du "**case_id**")



In [2]:
# Build the S3 filesystem client. The endpoint host is read from the
# AWS_S3_ENDPOINT environment variable (KeyError if it is not set).
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

In [3]:
# List the entries directly under the bucket to check what is available.
bucket_entries = fs.ls('s3://quentin1999')
print(bucket_entries)

['quentin1999/Data_Projet_NLP', 'quentin1999/full_df']


In [4]:
# S3 location of the parquet file holding the clinical cases.
BUCKET = "s3://quentin1999/Data_Projet_NLP"
FILE_KEY_S3 = "full_df"
FILE_PATH_S3 = f"{BUCKET}/{FILE_KEY_S3}"

# Open the object in binary mode and parse it as parquet; the handle is
# closed automatically when the `with` block exits.
with fs.open(FILE_PATH_S3, mode="rb") as parquet_stream:
    full_df = pd.read_parquet(parquet_stream)

In [5]:
# Preview the first rows of the clinical-case table.
full_df.head()

Unnamed: 0,age,case_id,case_text,gender
0,53.0,PMC3738355_01,A 53-year-old woman presented with a 10-year h...,Female
0,69.0,PMC5015624_01,A 69-year-old Caucasian female with coronary a...,Female
0,60.0,PMC6381877_01,A 60-year-old male smoker presented with persi...,Male
0,41.0,PMC5912312_01,A 41-year-old female with a past medical histo...,Female
1,51.0,PMC5912312_02,A 51-year-old male with a history of SCAD pres...,Male


In [11]:
# Derive the article identifier from the case identifier: a case_id looks
# like "PMC3738355_01", and the part before the first "_" is the pmcid,
# a column that also exists in df_metadata.
# The vectorised .str accessor propagates a missing case_id as NaN instead
# of raising AttributeError the way a Python-level lambda would.
full_df["pmcid"] = full_df["case_id"].str.split("_", n=1).str[0]
full_df.head(2)

Unnamed: 0,age,case_id,case_text,gender,pmcid
0,53.0,PMC3738355_01,A 53-year-old woman presented with a 10-year h...,Female,PMC3738355
0,69.0,PMC5015624_01,A 69-year-old Caucasian female with coronary a...,Female,PMC5015624


In [7]:
# Read the article-level metadata from the local parquet file.
df_metadata = pd.read_parquet('data/df_metadata')

# Count the rows whose keywords / major_mesh_terms are not null.
n_keywords = df_metadata["keywords"].notna().sum()
n_major_mesh = df_metadata["major_mesh_terms"].notna().sum()
print(f"nombre de ligne avec keywords non vide : {n_keywords}")
print(f"nombre de ligne avec major_mesh_terms non vide : {n_major_mesh}")

nombre de ligne avec keywords non vide : 59628
nombre de ligne avec major_mesh_terms non vide : 13211


In [8]:
# Preview the first rows of the article metadata table.
df_metadata.head()

Unnamed: 0,authors,case_amount,doi,journal,journal_detail,keywords,license,link,major_mesh_terms,mesh_terms,pmcid,pmid,title,year
0,"[Wenhua Liang, Jingjing Lu, Mingwei Qin, Xinti...",1,10.1258/arsr.2012.120031,Acta Radiol Short Rep,2012 Jul 31;1(6):arsr.2012.120031.,"[vascular tumor, diagnostic imaging, liver, pa...",CC BY-NC,https://pubmed.ncbi.nlm.nih.gov/23986846/,,[Case Reports],PMC3738355,23986846,Littoral cell angioma mimicking hepatic tumor,2012
1,"[Payal J Shah, Brian Ellis, Lauren R DiGiovine...",1,10.3205/oc000036,GMS Ophthalmol Cases,2015 Dec 2;5:Doc14.,"[mri, aortic arch, carotid artery stenosis, ca...",CC BY,https://pubmed.ncbi.nlm.nih.gov/27625958/,,[Case Reports],PMC5015624,27625958,Central retinal artery occlusion following las...,2015
2,"[Nobuhiko Seki, Maika Natsume, Ryosuke Ochiai,...",1,10.1159/000493088,Case Rep Oncol,2019 Jan 21;12(1):91-97.,"[bevacizumab, egfr mutation, erlotinib, lung c...",CC BY-NC,https://pubmed.ncbi.nlm.nih.gov/30792648/,,[Case Reports],PMC6381877,30792648,Promising Combination Therapy with Bevacizumab...,2019
3,"[Ali Raza Ghani, Faisal Inayat, Nouman Safdar ...",9,10.1177/2324709618770479,J Investig Med High Impact Case Rep,2018 Apr 18;6:2324709618770479.,"[acute coronary syndrome, diagnosis, fibromusc...",CC BY,https://pubmed.ncbi.nlm.nih.gov/29707593/,,[],PMC5912312,29707593,Spontaneous Coronary Artery Dissection: A Case...,2018
4,"[Xiangyi Kong, Dongmei Li, Yanguo Kong, Dingro...",1,10.1097/MD.0000000000005657,Medicine (Baltimore),2017 Jan;96(4):e5657.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/28121922/,"[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...","[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...",PMC5287946,28121922,Malignant adenohypophysis spindle cell oncocyt...,2017


# 2. Analyse MeSH terms

> Suppression des articles avec plus d'un cas clinique (case_amount > 1) du df_metadata
>
> Suppression des articles dont la colonne major_mesh_terms est vide

In [9]:
# Keep only the articles that have major MeSH terms and at most one
# clinical case. Both steps return new frames, so df_metadata itself is
# left untouched.
df_metadata_mesh = (
    df_metadata
    .dropna(subset=["major_mesh_terms"])
    .query("case_amount <= 1")
)

df_metadata_mesh.head(10)

Unnamed: 0,authors,case_amount,doi,journal,journal_detail,keywords,license,link,major_mesh_terms,mesh_terms,pmcid,pmid,title,year
4,"[Xiangyi Kong, Dongmei Li, Yanguo Kong, Dingro...",1,10.1097/MD.0000000000005657,Medicine (Baltimore),2017 Jan;96(4):e5657.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/28121922/,"[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...","[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...",PMC5287946,28121922,Malignant adenohypophysis spindle cell oncocyt...,2017
18,"[Stéphane Helleringer, Géraldine Duthé, Almamy...",1,10.1111/tmi.12012,Trop Med Int Health,2013 Jan;18(1):27-34.,,author_manuscript,https://pubmed.ncbi.nlm.nih.gov/23130912/,"[Cause of Death, Data Collection / standards, ...","[Cause of Death, Data Collection / standards, ...",PMC3678730,23130912,Misclassification of pregnancy-related deaths ...,2013
19,"[Jian-Min Jin, Peng Bai, Wei He, Fei Wu, Xiao-...",1,10.3389/fpubh.2020.00152,Front Public Health,2020 Apr 29;8:152.,"[covid-19, sars, sars-cov-2, female, gender, m...",CC BY,https://pubmed.ncbi.nlm.nih.gov/32411652/,"[COVID-19 / epidemiology, COVID-19 / mortality...","[COVID-19 / epidemiology, COVID-19 / mortality...",PMC7201103,32411652,Gender Differences in Patients With COVID-19: ...,2020
24,"[Kamal Khademvatani, Yousef Rezaei, Abdollah K...",1,10.12659/AJCR.890607,Am J Case Rep,2014 Jul 13;15:300-3.,"[echocardiography, leiomyoma, pulmonary emboli...",NO-CC CODE,https://pubmed.ncbi.nlm.nih.gov/25061497/,"[Pulmonary Embolism / etiology, Uterine Neopla...","[Pulmonary Embolism / etiology, Uterine Neopla...",PMC4108191,25061497,Acute pulmonary embolism caused by enlarged ut...,2014
34,"[Teppei Tanaka, Motoharu Hayakawa, Akiyo Sadat...",1,10.2176/nmc.cr.2013-0007,Neurol Med Chir (Tokyo),2014;54(2):155-60.,,CC BY-NC-ND,https://pubmed.ncbi.nlm.nih.gov/24418783/,"[Arteriovenous Fistula / therapy, Carotid Arte...","[Arteriovenous Fistula / therapy, Carotid Arte...",PMC4508704,24418783,Transvenous embolization for carotid-cavernous...,2014
38,"[A Katz, P Zalewski]",1,10.1038/sj.bjc.6601478,Br J Cancer,2003 Dec;89 Suppl 2(Suppl 2):S15-8.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/14661048/,"[Antineoplastic Agents / therapeutic use, Brai...","[Antineoplastic Agents / therapeutic use, Brai...",PMC2750244,14661048,Quality-of-life benefits and evidence of antit...,2003
42,"[Hamza Dergamoun, Abdelaziz El Gdaouni, Imad Z...",1,10.11604/pamj.2022.43.106.36588,Pan Afr Med J,2022 Oct 27;43:106.,"[renal cell carcinoma, case report, gallbladde...",CC BY,https://pubmed.ncbi.nlm.nih.gov/36699978/,"[Carcinoma, Renal Cell / diagnosis, Carcinoma,...","[Carcinoma, Renal Cell / diagnosis, Carcinoma,...",PMC9834804,36699978,Renal cell carcinoma with gallbladder metastas...,2022
46,"[Cindy Y G Noben, Frans J N Nijhuis, Angelique...",1,10.1186/1471-2458-12-43,BMC Public Health,2012 Jan 18;12:43.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/22257557/,"[Disabled Persons, Occupational Health Service...","[Disabled Persons, Occupational Health Service...",PMC3273437,22257557,Design of a trial-based economic evaluation on...,2012
49,"[Yukiko Mikami, Tomonori Nagai, Yousuke Gomi, ...",1,10.1186/s13256-015-0790-6,J Med Case Rep,2016 Jan 18;10:9.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/26781485/,"[Antibiotics, Antineoplastic / therapeutic use...","[Antibiotics, Antineoplastic / therapeutic use...",PMC4717627,26781485,Methotrexate and actinomycin D chemotherapy in...,2016
50,"[Bartosz Hudzik, Lech Poloński, Mariusz Gąsior]",1,10.1007/s11739-015-1384-4,Intern Emerg Med,2016 Dec;11(8):1139-1140.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/26758273/,"[Electrocardiography / standards, Tricuspid Va...","[Electrocardiography / standards, Tricuspid Va...",PMC5114320,26758273,Lancisi sign: giant C-V waves of tricuspid reg...,2016


In [10]:
# Size of the filtered metadata table as (rows, columns).
df_metadata_mesh.shape

(10718, 14)

# 3. Fusionner les 2 datasets

In [17]:
# Attach the clinical cases to their article metadata.
# The join key is named explicitly: without `on`, pandas joins on every
# column name the two frames happen to share, so a column added to either
# frame later could silently change the join. Requires the "pmcid" column
# to have been added to full_df beforehand.
# how="left" keeps every row of df_metadata_mesh, even when no case matches.
df_merged = pd.merge(df_metadata_mesh, full_df, how="left", on="pmcid")
df_merged.head(2)

Unnamed: 0,authors,case_amount,doi,journal,journal_detail,keywords,license,link,major_mesh_terms,mesh_terms,pmcid,pmid,title,year,age,case_id,case_text,gender
0,"[Xiangyi Kong, Dongmei Li, Yanguo Kong, Dingro...",1,10.1097/MD.0000000000005657,Medicine (Baltimore),2017 Jan;96(4):e5657.,,CC BY,https://pubmed.ncbi.nlm.nih.gov/28121922/,"[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...","[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...",PMC5287946,28121922,Malignant adenohypophysis spindle cell oncocyt...,2017,30.0,PMC5287946_01,A 30-year-old man came to Peking Union Medical...,Male
1,"[Stéphane Helleringer, Géraldine Duthé, Almamy...",1,10.1111/tmi.12012,Trop Med Int Health,2013 Jan;18(1):27-34.,,author_manuscript,https://pubmed.ncbi.nlm.nih.gov/23130912/,"[Cause of Death, Data Collection / standards, ...","[Cause of Death, Data Collection / standards, ...",PMC3678730,23130912,Misclassification of pregnancy-related deaths ...,2013,15.0,PMC3678730_01,"For each death, the DSS has systematically rec...",Female


In [21]:
# Columns kept in the final dataset, in the order they should appear.
columns = [
    'pmcid',
    'title',
    'gender',
    'case_text',
    'keywords',
    'major_mesh_terms',
    'mesh_terms',
    'journal',
    'doi',
]
df = df_merged.loc[:, columns]
df.head(3)

Unnamed: 0,pmcid,title,gender,case_text,keywords,major_mesh_terms,mesh_terms,journal,doi
0,PMC5287946,Malignant adenohypophysis spindle cell oncocyt...,Male,A 30-year-old man came to Peking Union Medical...,,"[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...","[Adenoma, Oxyphilic / pathology, Ki-67 Antigen...",Medicine (Baltimore),10.1097/MD.0000000000005657
1,PMC3678730,Misclassification of pregnancy-related deaths ...,Female,"For each death, the DSS has systematically rec...",,"[Cause of Death, Data Collection / standards, ...","[Cause of Death, Data Collection / standards, ...",Trop Med Int Health,10.1111/tmi.12012
2,PMC7201103,Gender Differences in Patients With COVID-19: ...,Female,The demographic and clinical characteristics a...,"[covid-19, sars, sars-cov-2, female, gender, m...","[COVID-19 / epidemiology, COVID-19 / mortality...","[COVID-19 / epidemiology, COVID-19 / mortality...",Front Public Health,10.3389/fpubh.2020.00152


# 4. Sauvegarde dataframe final 'df'

In [22]:
# Write the final dataframe to data/df_final only when that file does not
# exist yet, so an existing export is never overwritten.
file = "df_final"
path = f'data/{file}'

if os.path.exists(path):
    # The message names the file that was actually checked.
    print(f"Le fichier '{file}' existe déjà. Aucune action effectuée.")
else:
    df.to_parquet(path)