In [1]:
import pandas as pd
from modules.NER_functions import *
import warnings 
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# NoteBook modélisation
Le but de ce notebook est d'extraire le nombre de conditions par critère d'inclusivité des essais cliniques. 

Voici un schéma simplifié des étapes du notebook:

![Texte alternatif](./images/schema.png)



Le modèle entrainé est un modèle BERT fine-tuné. C'est un modèle de reconnaissance d'entités nommés, il est entrainé pour assigner un label à chaque token d'une phrase qu'on lui soumet.

![Texte alternatif2](./images/ner.jpg)

Notre modèle a été entrainé spécialement sur des données d'essais cliniques et présente 5 labels:
* Mood
* Drug
* Condition
* Person
* Procedure
* Observation

Le but du modèle est ainsi de compter le nombre d'attributs recherchés pour un essai clinique afin de construire des indicateurs si un essai clinique est relativement exigeant sur sa sélection de patient.

## Predictions du modèle

In [2]:
# On charge d'abord le modèle

model = Bert_Model()

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 103kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.69MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 5.65MB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 2.55MB/s]


In [4]:
model.load_model('./model/model1')

In [5]:
tags = ['I-Procedure',
 'B-Drug',
 'I-Condition',
  'I-Person',
 'B-Condition',
 'O',
 'I-Observation',
 'B-Procedure',
 'B-Person',
 'B-Mood',
 'I-Mood',
 'B-Observation',
 'I-Drug',
 'PAD']

In [6]:
model.getTag(tags)

In [7]:
data = pd.read_csv("../data/clini_data.csv")

In [8]:
data = data_treatment(data)

In [9]:
data['EligibilityCriteria'].iloc[3]

'Inclusion Criteria:\n\n● Age between 18 and 60 years.\n\nWillingness and ability to fully understand the content and scope of the experiment and comply with its instructions.\nHave signed the informed consent.\n\nExclusion Criteria:\n\n● Pregnancy.\n\nOngoing chronic pain or neuromuscular disorder, or any Desis that effect the nociceptive system and not allowed to be evaluated in normal Condition\nHistory of addictive behavior, defined as abuse of alcohol, cannabis, opioids, or other drugs.\nHistory of heat sensitivity disorders.\nHistory of mental illness.\nPresence of fever, tuberculosis, malignant tumors, infectious processes, acute inflammatory processes\nImplantation of pacemakers or metal prostheses.\nUse of analgesics within 24 hours prior to participation in the experiment.\nLack of sleep (< 6 hours) the night before the experiment.\nHigh alcohol intake the evening before the experiment.'

In [10]:
preds = model.predict(data['EligibilityCriteria'].iloc[80][:200])

In [11]:
# Voici un exemple de prédictions du modèle
for elem in preds[10:]:
    print(elem[0], "-"*10, elem[1])

##met ---------- O
##ast ---------- O
##atic ---------- O
, ---------- O
localized ---------- O
, ---------- O
or ---------- O
regional ---------- O
solid ---------- I-Condition
or ---------- I-Condition
blood ---------- I-Condition
ma ---------- I-Condition
##li ---------- I-Condition
##gna ---------- I-Condition
##ncy ---------- I-Condition
( ---------- I-Condition
i ---------- I-Condition
##es ---------- I-Condition
) ---------- I-Condition
; ---------- O
( ---------- O
2 ---------- O
) ---------- O
completion ---------- O
of ---------- O
primary ---------- B-Procedure
cancer ---------- I-Procedure
treatment ---------- I-Procedure
( ---------- O
radiation ---------- B-Procedure
, ---------- O
surgery ---------- B-Procedure
, ---------- O
and ---------- O
/ ---------- O
or ---------- O
ch ---------- B-Procedure
##em ---------- B-Procedure
##otherapy ---------- B-Procedure
) ---------- O
; ---------- O
( ---------- O


In [12]:
# On ne veut garder que les critères d'inclusions. Pour ce faire, on ne sélectionne dans les critères d'éligibilités que le texte compris entre 
# Inclusion Criteria: et Exclusion Criteria

In [13]:
def extract_inclusion_criteria(eligibility_criteria):
    if type(eligibility_criteria)!=str:
        return None
    match = re.search(r'Inclusion Criteria:(.*?)Exclusion Criteria:', eligibility_criteria, re.DOTALL)
    
    if match:
        return match.group(1).strip()
    else:
        return None

data['InclusionCriteria'] = data['EligibilityCriteria'].apply(extract_inclusion_criteria)

In [14]:
print(data['EligibilityCriteria'].iloc[2],"\n\n", "-"*40,"\n\n", data["InclusionCriteria"].iloc[2])

Inclusion Criteria:

Symptomatic paroxysmal AF that are unresponsive to antiarrhythmic drugs (one or more than one).
Willing to undergo catheter ablation for AF.

Exclusion Criteria:

History of any type of catheter ablation for cardiac arrhythmias.
Sinus node dysfunction that requires permanent pacemaker implantation. 

 ---------------------------------------- 

 Symptomatic paroxysmal AF that are unresponsive to antiarrhythmic drugs (one or more than one).
Willing to undergo catheter ablation for AF.


In [15]:
# On réduit maintenant la taille des critères d'éligibilités, en effet le modèle entrainé peut modéliser jusqu'à 115 token et certains essais cliniques
# Ont une liste de critère très longue. Dans un soucis de simplicité, on tronquera donc certaines données

def reduce_numb_words(phrase, nombre_mots=80):
    if phrase == None:
        return None
    mots = phrase.split()[:nombre_mots]
    return ' '.join(mots)

data['InclusionReduced'] = data['InclusionCriteria'].apply(reduce_numb_words)

In [16]:
data['InclusionReduced'] = data['InclusionReduced'].fillna('')

In [17]:
#On met les critères d'inclusivités auxquels nous appliqueront le modèle dans une liste

list_inclusion = data[(data['Phase']=="Phase 3") & (data['InclusionReduced']!="None") & (data['InclusionReduced']!="")]['InclusionReduced'].tolist()

In [18]:
list_inclusion[:2]

['Male and female subjects aged 18 to 65 years inclusive. Written informed consent. Co-living with persons who has developed influenza or other acute respiratory viral infection, diagnosed no more than 3 days ago. No signs of acute respiratory viral infection, influenza or COVID-19 at the time of inclusion in the study. For women with preserved reproductive potential - a negative pregnancy test and consent to use approved methods of contraception during the entire period of participation in the study; for',
 "Need IV's line placement for IV fluids and/or phlebotomy"]

In [19]:
preds = model.batch_predict(list_inclusion)

6261it [19:21,  5.39it/s]


## Transcription des prédictions du modèle

Maintenant qu'on a nos prédictions, on compte le nombre d'attributs mentionnée dans les critères d'éligibilité des essais cliniques. 
On créer pour chaque essais clinique un dictionnaire qui associe a un attribut le nombre de fois qu'il apparait. 

In [20]:
list_keys = [element[2:] for element in tags if element.startswith("B")]
list_keys

['Drug', 'Condition', 'Procedure', 'Person', 'Mood', 'Observation']

In [21]:
for elem in preds[0]:
    if elem[1]!='O':
        print(elem)

('male', 'B-Person')
('female', 'B-Person')
('aged', 'B-Person')
('in', 'B-Condition')
('##fluenza', 'B-Condition')
('acute', 'B-Condition')
('respiratory', 'I-Condition')
('viral', 'I-Condition')
('infection', 'I-Condition')
('acute', 'B-Condition')
('respiratory', 'I-Condition')
('viral', 'I-Condition')
('infection', 'I-Condition')
('in', 'B-Condition')
('##fluenza', 'B-Condition')


In [22]:
def count_attributes(pred):
    dict_clini = {}
    for elem in list_keys:
        dict_clini[elem]=0


    for i in pred:
        if i[1][0]=="B":
            if i[0][:2]!="##":
                dict_clini[i[1][2:]]+=1
    return dict_clini
count_attributes(preds[0])

{'Drug': 0,
 'Condition': 4,
 'Procedure': 0,
 'Person': 3,
 'Mood': 0,
 'Observation': 0}

In [23]:
df_augmented = data[(data['Phase']=="Phase 3") & (data['InclusionReduced']!="None") & (data['InclusionReduced']!="")].copy()
df_augmented = df_augmented.reset_index(drop=True)

In [24]:
preds_dict_list = [count_attributes(elem) for elem in preds]

df_augmented['raw_count'] = preds_dict_list

In [25]:
df_augmented[['InclusionCriteria','raw_count']]

Unnamed: 0,InclusionCriteria,raw_count
0,Male and female subjects aged 18 to 65 years i...,"{'Drug': 0, 'Condition': 4, 'Procedure': 0, 'P..."
1,Need IV's line placement for IV fluids and/or ...,"{'Drug': 1, 'Condition': 0, 'Procedure': 2, 'P..."
2,Male or female ≥18 years of age.\nDocumentatio...,"{'Drug': 0, 'Condition': 6, 'Procedure': 0, 'P..."
3,Participants must be 18 years of age (when sig...,"{'Drug': 0, 'Condition': 4, 'Procedure': 1, 'P..."
4,Participants must have at least 28 teeth in th...,"{'Drug': 0, 'Condition': 0, 'Procedure': 0, 'P..."
...,...,...
6256,Persistent asthma of a minimum of six months d...,"{'Drug': 0, 'Condition': 0, 'Procedure': 1, 'P..."
6257,Male and female patients with mild to severe h...,"{'Drug': 0, 'Condition': 1, 'Procedure': 0, 'P..."
6258,Community dwelling patients 65 years of age or...,"{'Drug': 0, 'Condition': 1, 'Procedure': 0, 'P..."
6259,Clinical diagnosis of 4-10 previously untreate...,"{'Drug': 0, 'Condition': 1, 'Procedure': 0, 'P..."


In [26]:
print(df_augmented['InclusionCriteria'].iloc[3974])
print("\n\n")
print(df_augmented['raw_count'].iloc[3974])

Subject must meet Diagnostic and Statistical Manual of Mental Disorders - Fifth Edition (DSM-5) criteria for a primary diagnosis of ADHD (combined, inattentive, or hyperactive/impulsive presentation) per clinical evaluation and confirmed by the Mini International Neuropsychiatric Interview for Children and Adolescents (MINI-KID).
Subject must have a score of at least 3 (mildly ill) on the clinician-administered Clinical Global Impressions-Severity (CGI-S) scale.
Subjects who completed the efficacy study with KP415 may be rolled over into the current study.
Subject, subject's parent/legal guardian and caregiver (if applicable) must understand and be willing and able to comply with all study procedures and visit schedule.



{'Drug': 0, 'Condition': 3, 'Procedure': 2, 'Person': 0, 'Mood': 0, 'Observation': 0}


In [27]:
# On réparti maintenant les observations dans des colonnes annexes (afin de pouvoir utiliser les variables facilement ensuite)

colonnes_separees = df_augmented['raw_count'].apply(pd.Series)
colonnes_separees.rename(columns={'Condition': 'Conditions'}, inplace=True)
df_final = pd.concat([df_augmented, colonnes_separees], axis=1)
df_final[["NCTId", "InclusionCriteria","Drug","Conditions","Procedure"]]

Unnamed: 0,NCTId,InclusionCriteria,Drug,Conditions,Procedure
0,NCT06183229,Male and female subjects aged 18 to 65 years i...,0,4,0
1,NCT06182631,Need IV's line placement for IV fluids and/or ...,1,0,2
2,NCT06182319,Male or female ≥18 years of age.\nDocumentatio...,0,6,0
3,NCT06181435,Participants must be 18 years of age (when sig...,0,4,1
4,NCT06180707,Participants must have at least 28 teeth in th...,0,0,0
...,...,...,...,...,...
6256,NCT00308685,Persistent asthma of a minimum of six months d...,0,0,1
6257,NCT00307060,Male and female patients with mild to severe h...,0,1,0
6258,NCT00305604,Community dwelling patients 65 years of age or...,0,1,0
6259,NCT00304239,Clinical diagnosis of 4-10 previously untreate...,0,1,0


In [28]:
# On sauvegarde le modèle 

df_final.to_csv('./data/Data_augmented3.csv')

In [29]:
import pickle

# J'enregistre les prédictions dans un fichier pkl au cas où je souhaite les réutiliser
with open('./data/predictions3.pkl', 'wb') as fichier:
    pickle.dump(preds, fichier)

In [30]:
with open('./data/predictions3.pkl', 'rb') as fichier:
    predictions = pickle.load(fichier)
predictions[:5]

[[('male', 'B-Person'),
  ('and', 'O'),
  ('female', 'B-Person'),
  ('subjects', 'O'),
  ('aged', 'B-Person'),
  ('18', 'O'),
  ('to', 'O'),
  ('65', 'O'),
  ('years', 'O'),
  ('inclusive', 'O'),
  ('.', 'O'),
  ('written', 'O'),
  ('informed', 'O'),
  ('consent', 'O'),
  ('.', 'O'),
  ('co', 'O'),
  ('-', 'O'),
  ('living', 'O'),
  ('with', 'O'),
  ('persons', 'O'),
  ('who', 'O'),
  ('has', 'O'),
  ('developed', 'O'),
  ('in', 'B-Condition'),
  ('##fluenza', 'B-Condition'),
  ('or', 'O'),
  ('other', 'O'),
  ('acute', 'B-Condition'),
  ('respiratory', 'I-Condition'),
  ('viral', 'I-Condition'),
  ('infection', 'I-Condition'),
  (',', 'O'),
  ('diagnosed', 'O'),
  ('no', 'O'),
  ('more', 'O'),
  ('than', 'O'),
  ('3', 'O'),
  ('days', 'O'),
  ('ago', 'O'),
  ('.', 'O'),
  ('no', 'O'),
  ('signs', 'O'),
  ('of', 'O'),
  ('acute', 'B-Condition'),
  ('respiratory', 'I-Condition'),
  ('viral', 'I-Condition'),
  ('infection', 'I-Condition'),
  (',', 'O'),
  ('in', 'B-Condition'),
  ('##flu