In [8]:
import pandas as pd
from modules.NER_functions import *
import warnings 
warnings.filterwarnings("ignore")

# NoteBook modélisation
Le but de ce notebook est d'extraire le nombre de conditions par critère d'inclusivité des essais cliniques. 

Voici un schéma simplifié des étapes du notebook:

![Texte alternatif](../images/schema.png)



Le modèle entrainé est un modèle BERT fine-tuné. C'est un modèle de reconnaissance d'entités nommés, il est entrainé pour assigner un label à chaque token d'une phrase qu'on lui soumet.

![Texte alternatif2](../images/ner.jpg)

Notre modèle a été entrainé spécialement sur des données d'essais cliniques et présente 5 labels:
* Mood
* Drug
* Condition
* Person
* Procedure
* Observation

Le but du modèle est ainsi de compter le nombre d'attributs recherchés pour un essai clinique afin de construire des indicateurs si un essai clinique est relativement exigeant sur sa sélection de patient.

## Predictions du modèle

In [9]:
# On charge d'abord le modèle

model = Bert_Model()

In [10]:
model.load_model('./model/model1')

In [11]:
tags = ['B-Mood',
 'O',
 'B-Drug',
 'I-Procedure',
 'B-Condition',
 'B-Person',
 'I-Condition',
 'B-Procedure',
 'B-Observation',
 'I-Drug',
 'I-Observation',
 'I-Person',
 'I-Mood',
 'PAD']

In [12]:
model.getTag(tags)

In [13]:
data = pd.read_csv("../data/clini_data.csv")

In [14]:
data['EligibilityCriteria'].iloc[3]

'Inclusion Criteria:\n\nMale and females of non-childbearing potential 18 to 75 years of age, inclusive, at the time of signing the informed consent.\nParticipants with a fasting low-density lipoprotein cholesterol (LDL-C) higher than or equal to 70 mg/dL (1.8 mmol/L) and lower than or equal to 190 mg/dL (4.9 mmol/L) at screening.\nParticipants with fasting triglycerides lower than 400 mg/dL (lower than 4.52 mmol/L) at screening.\nShould be receiving moderate or high-intensity statin therapy for more than or equal to 2 months prior to screening.\nThere should be no planned medication or dose change during study participation.\nBody mass index between 19.0 and 40.0 kg/m^2.\n\nExclusion Criteria:\n\nHistory or presence of gastrointestinal, hepatic or renal disease or any other conditions known to interfere with absorption, distribution, metabolism, or excretion of drugs.\nAny uncontrolled or serious disease, or any medical (eg, known major active infection or major hematological, renal, 

In [15]:
preds = model.predict(data['EligibilityCriteria'].iloc[80][:200])

In [16]:
# Voici un exemple de prédictions du modèle
for elem in preds[10:]:
    print(elem[0], "-"*10, elem[1])

total ---------- B-Procedure
hip ---------- I-Procedure
art ---------- I-Procedure
##hr ---------- I-Procedure
##op ---------- I-Procedure
##las ---------- I-Procedure
##ty ---------- I-Procedure
due ---------- O
to ---------- O
non ---------- O
- ---------- O
inflammatory ---------- O
de ---------- I-Condition
##gen ---------- I-Condition
##erative ---------- I-Condition
joint ---------- I-Condition
disease ---------- I-Condition
( ---------- O
e ---------- O
. ---------- O
g ---------- O
. ---------- O
, ---------- O
os ---------- B-Condition
##te ---------- B-Condition
##oa ---------- B-Condition
##rth ---------- B-Condition
##rit ---------- B-Condition
##is ---------- B-Condition
, ---------- O
traumatic ---------- B-Condition
arthritis ---------- I-Condition
, ---------- O
ava ---------- B-Condition
##scu ---------- B-Condition
##lar ---------- B-Condition
nec ---------- I-Condition
##rosis ---------- I-Condition
, ---------- O
d ---------- B-Condition
##ys ---------- B-Condition


In [144]:
# On ne veut garder que les critères d'inclusions. Pour ce faire, on ne sélectionne dans les critères d'éligibilités que le texte compris entre 
# Inclusion Criteria: et Exclusion Criteria

In [18]:
def extract_inclusion_criteria(eligibility_criteria):
    if type(eligibility_criteria)!=str:
        return None
    match = re.search(r'Inclusion Criteria:(.*?)Exclusion Criteria:', eligibility_criteria, re.DOTALL)
    
    if match:
        return match.group(1).strip()
    else:
        return None

data['InclusionCriteria'] = data['EligibilityCriteria'].apply(extract_inclusion_criteria)

In [148]:
print(data['EligibilityCriteria'].iloc[2],"\n\n", "-"*40,"\n\n", data["InclusionCriteria"].iloc[2])

Inclusion Criteria:

Patients undergoing elective heart surgery

Exclusion Criteria:

Refusal to consent
Patients undergoing emergency surgery 

 ---------------------------------------- 

 Patients undergoing elective heart surgery


In [149]:
# On réduit maintenant la taille des critères d'éligibilités, en effet le modèle entrainé peut modéliser jusqu'à 115 token et certains essais cliniques
# Ont une liste de critère très longue. Dans un soucis de simplicité, on tronquera donc certaines données

def reduce_numb_words(phrase, nombre_mots=80):
    if phrase == None:
        return None
    mots = phrase.split()[:nombre_mots]
    return ' '.join(mots)

data['InclusionReduced'] = data['InclusionCriteria'].apply(reduce_numb_words)

In [150]:
data['InclusionReduced'] = data['InclusionReduced'].fillna('')

In [151]:
#On met les critères d'inclusivités auxquels nous appliqueront le modèle dans une liste

list_inclusion = data[(data['Phase']=="Phase 3") & (data['InclusionReduced']!="None") & (data['InclusionReduced']!="")]['InclusionReduced'].tolist()

In [152]:
list_inclusion[:2]

['Male or female and 5 through 30 years of age Prader-Willi syndrome with a documented disease-causing mutation Increased appetite with decreased satiety accompanied by food seeking (consistent with PWS Nutritional Phase 3) HQ-CT total score of ≥13 at Screening and Baseline CGI-S score for hyperphagia in PWS of ≥4 at Screening and Baseline Lives with a caregiver who understands and is willing and able to adhere to study-related procedures and is willing to participate in all study visits',
 'Histologically, cytologically, or radiographically confirmed diagnosis of metastatic cancer Age ≥ 18 years Patients who have cervical, thoracic, or lumbar spine metastasis that need treatment. Patients will have 1 to 3 separate spinal sites that require treatment. Each spinal site to be treated on trial will span 1-2 contiguous vertebral levels ECOG 0-2 Negative serum or urine pregnancy test within 14 days prior to enrollment for women of childbearing potential or who are not postmenopausal Women o

In [49]:
preds = model.batch_predict(list_inclusion)

3979it [1:48:34,  1.64s/it]


## Transcription des prédictions du modèle

Maintenant qu'on a nos prédictions, on compte le nombre d'attributs mentionnée dans les critères d'éligibilité des essais cliniques. 
On créer pour chaque essais clinique un dictionnaire qui associe a un attribut le nombre de fois qu'il apparait. 

In [62]:
list_keys = [element[2:] for element in tags if element.startswith("B")]
list_keys

['Mood', 'Drug', 'Condition', 'Person', 'Procedure', 'Observation']

In [82]:
for elem in preds[0]:
    if elem[1]!='O':
        print(elem)

('female', 'B-Person')
('age', 'B-Person')
('pr', 'B-Condition')
('##ade', 'B-Condition')
('##r', 'I-Condition')
('-', 'I-Condition')
('will', 'I-Condition')
('##i', 'I-Condition')
('syndrome', 'I-Condition')
('disease', 'B-Condition')
('-', 'I-Condition')
('causing', 'I-Condition')
('mutation', 'I-Condition')
('increased', 'I-Condition')
('appetite', 'I-Condition')
('sat', 'I-Condition')
('##ie', 'I-Condition')
('##ty', 'I-Condition')
('seeking', 'I-Condition')
('hyper', 'B-Condition')
('##pha', 'B-Condition')


In [125]:
def count_attributes(pred):
    dict_clini = {}
    for elem in list_keys:
        dict_clini[elem]=0


    for i in pred:
        if i[1][0]=="B":
            if i[0][:2]!="##":
                dict_clini[i[1][2:]]+=1
    return dict_clini
count_attributes(preds[0])

{'Mood': 0,
 'Drug': 0,
 'Condition': 3,
 'Person': 2,
 'Procedure': 0,
 'Observation': 0}

In [126]:
df_augmented = data[(data['Phase']=="Phase 3") & (data['InclusionReduced']!="None") & (data['InclusionReduced']!="")].copy()
df_augmented = df_augmented.reset_index(drop=True)

In [154]:
preds_dict_list = [count_attributes(elem) for elem in preds]

df_augmented['raw_count'] = preds_dict_list

In [155]:
df_augmented[['InclusionCriteria','raw_count']]

Unnamed: 0,InclusionCriteria,raw_count
0,Male or female and 5 through 30 years of age\n...,"{'Mood': 0, 'Drug': 0, 'Condition': 3, 'Person..."
1,"Histologically, cytologically, or radiographic...","{'Mood': 0, 'Drug': 0, 'Condition': 3, 'Person..."
2,Subject must be able to understand and comply ...,"{'Mood': 0, 'Drug': 0, 'Condition': 0, 'Person..."
3,Be 3-5 years old\nExhibit no symptoms of malar...,"{'Mood': 0, 'Drug': 0, 'Condition': 1, 'Person..."
4,PMR patients who fulfilled the 1982 Chuang cri...,"{'Mood': 0, 'Drug': 2, 'Condition': 0, 'Person..."
...,...,...
3974,Histologically confirmed invasive breast carci...,"{'Mood': 0, 'Drug': 4, 'Condition': 1, 'Person..."
3975,Completed study VMDN-003-2 and consent to enro...,"{'Mood': 0, 'Drug': 0, 'Condition': 0, 'Person..."
3976,X-linked hypophosphatemia diagnosed by the doc...,"{'Mood': 0, 'Drug': 0, 'Condition': 10, 'Perso..."
3977,All patients undergoing foot and ankle surgeri...,"{'Mood': 0, 'Drug': 0, 'Condition': 0, 'Person..."


In [134]:
print(df_augmented['InclusionCriteria'].iloc[3974])
print("\n\n")
print(df_augmented['raw_count'].iloc[3974])

Histologically confirmed invasive breast carcinoma
Centrally-confirmed human epidermal growth factor receptor 2 (HER2)-positive invasive breast cancer
Centrally confirmed PD-L1 and hormone receptor status
Clinical stage at disease presentation (prior to neoadjuvant therapy): cT4/anyN/M0, any cT/N2-3/M0, or cT1-3/N0-1/M0 (participants with cT1mi/T1a/T1b/N0 are not eligible)
Completion of pre-operative systemic chemotherapy including at least 9 weeks of taxane and 9 weeks of trastuzumab (anthracycline and/or additional HER2-targeted agents are permitted)
<=12 weeks between primary surgery and randomization
Eastern Cooperative Oncology Group (ECOG) Performance Status 0 or 1
Screening left ventricular ejection fraction (LVEF) >= 50% and no decrease in LVEF by >15% from the pre-chemotherapy LVEF. If no pre-chemotherapy LVEF, screening LVEF >= 55%
Life expectancy >= 6 months
Adequate hematologic and end organ function



{'Mood': 0, 'Drug': 4, 'Condition': 1, 'Person': 0, 'Procedure': 2, 'Ob

In [156]:
# On réparti maintenant les observations dans des colonnes annexes (afin de pouvoir utiliser les variables facilement ensuite)

colonnes_separees = df_augmented['raw_count'].apply(pd.Series)
colonnes_separees.rename(columns={'Condition': 'Conditions'}, inplace=True)
df_final = pd.concat([df_augmented, colonnes_separees], axis=1)
df_final[["NCTId", "InclusionCriteria","Drug","Conditions","Procedure"]]

Unnamed: 0,NCTId,InclusionCriteria,Drug,Conditions,Procedure
0,NCT06173531,Male or female and 5 through 30 years of age\n...,0,3,0
1,NCT06173401,"Histologically, cytologically, or radiographic...",0,3,2
2,NCT06173284,Subject must be able to understand and comply ...,0,0,0
3,NCT06173206,Be 3-5 years old\nExhibit no symptoms of malar...,0,1,0
4,NCT06172361,PMR patients who fulfilled the 1982 Chuang cri...,2,0,0
...,...,...,...,...,...
3974,NCT04873362,Histologically confirmed invasive breast carci...,4,1,2
3975,NCT04873232,Completed study VMDN-003-2 and consent to enro...,0,0,0
3976,NCT04872907,X-linked hypophosphatemia diagnosed by the doc...,0,10,0
3977,NCT04872322,All patients undergoing foot and ankle surgeri...,0,0,1


In [142]:
# On sauvegarde le modèle 

df_final.to_csv('./data/Data_augmented.csv')

In [159]:
import pickle

# J'enregistre les prédictions dans un fichier pkl au cas où je souhaite les réutiliser
with open('./data/predictions.pkl', 'wb') as fichier:
    pickle.dump(preds, fichier)

In [160]:
with open('./data/predictions.pkl', 'rb') as fichier:
    predictions = pickle.load(fichier)
predictions

[[('male', 'O'),
  ('or', 'O'),
  ('female', 'B-Person'),
  ('and', 'O'),
  ('5', 'O'),
  ('through', 'O'),
  ('30', 'O'),
  ('years', 'O'),
  ('of', 'O'),
  ('age', 'B-Person'),
  ('pr', 'B-Condition'),
  ('##ade', 'B-Condition'),
  ('##r', 'I-Condition'),
  ('-', 'I-Condition'),
  ('will', 'I-Condition'),
  ('##i', 'I-Condition'),
  ('syndrome', 'I-Condition'),
  ('with', 'O'),
  ('a', 'O'),
  ('documented', 'O'),
  ('disease', 'B-Condition'),
  ('-', 'I-Condition'),
  ('causing', 'I-Condition'),
  ('mutation', 'I-Condition'),
  ('increased', 'I-Condition'),
  ('appetite', 'I-Condition'),
  ('with', 'O'),
  ('decreased', 'O'),
  ('sat', 'I-Condition'),
  ('##ie', 'I-Condition'),
  ('##ty', 'I-Condition'),
  ('accompanied', 'O'),
  ('by', 'O'),
  ('food', 'O'),
  ('seeking', 'I-Condition'),
  ('(', 'O'),
  ('consistent', 'O'),
  ('with', 'O'),
  ('p', 'O'),
  ('##ws', 'O'),
  ('nutritional', 'O'),
  ('phase', 'O'),
  ('3', 'O'),
  (')', 'O'),
  ('hq', 'O'),
  ('-', 'O'),
  ('ct', 'O')