In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=1e951e5df1b790706fc177866821f953a7c649dd39786e6c6bf5cbcde1850218
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [6]:
import os
import re
import json
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForTokenClassification
from seqeval.metrics import classification_report

# Parameters
MAX_LEN = 128
BATCH_SIZE = 16
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# Define tag mapping for clinical trial entities
TAG_MAP = {
    'O': 0,
    'B-INCLUSION': 1,
    'I-INCLUSION': 2,
    'B-EXCLUSION': 3,
    'I-EXCLUSION': 4,
    'B-CONDITION': 5,
    'I-CONDITION': 6,
    'B-TREATMENT': 7,
    'I-TREATMENT': 8,
    'B-MEASUREMENT': 9,
    'I-MEASUREMENT': 10,
    'B-DEMOGRAPHIC': 11,
    'I-DEMOGRAPHIC': 12
}

# Inverse tag map for decoding
INVERSE_TAG_MAP = {v: k for k, v in TAG_MAP.items()}

class ClinicalTrialMatcher:
    def __init__(self, model_path=None):
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

        if model_path and os.path.exists(model_path):
            self.model = AutoModelForTokenClassification.from_pretrained(model_path)
            print(f"Loaded model from {model_path}")
        else:
            # Initialize with pre-trained model
            self.model = AutoModelForTokenClassification.from_pretrained(
                MODEL_NAME,
                num_labels=len(TAG_MAP)
            )
            print("Initialized new model")

        self.model.to(DEVICE)

    def preprocess_chia_data(self, folder_path):
        """Load and preprocess CHIA dataset"""
        print("Loading CHIA dataset...")
        data = []

        for file in tqdm(os.listdir(folder_path)):
            if file.endswith(".txt"):
                # Extract trial ID from filename (assumes format like NCT03648021_exc.txt)
                trial_id_match = re.search(r'(NCT\d+)', file)
                if trial_id_match:
                    trial_id = trial_id_match.group(1)
                else:
                    trial_id = file.split('.')[0]  # Fallback to filename without extension

                # Determine if this is an inclusion or exclusion criteria file
                is_exclusion = "_exc" in file.lower()
                criteria_type = "EXCLUSION" if is_exclusion else "INCLUSION"

                txt_path = os.path.join(folder_path, file)
                ann_path = txt_path.replace(".txt", ".ann")

                with open(txt_path, "r", encoding="utf-8") as txt_file:
                    text = txt_file.read()

                # Process the text file - split by newlines to get individual criteria
                criteria_list = [line.strip() for line in text.split('\n') if line.strip()]

                annotations = []

                # If annotation file exists, use it
                if os.path.exists(ann_path):
                    with open(ann_path, "r", encoding="utf-8") as ann_file:
                        for line in ann_file:
                            parts = line.strip().split('\t')
                            if len(parts) >= 3:
                                entity_type, span_info, entity_text = parts[1], parts[1], parts[2]
                                span_parts = span_info.split(' ')

                                if len(span_parts) == 2 and all(part.isdigit() for part in span_parts):
                                    start, end = map(int, span_parts)
                                    span = f"{start} {end}"

                                    # Map entity types to our schema
                                    if "inclusion" in entity_type.lower() or not is_exclusion:
                                        mapped_type = "INCLUSION"
                                    elif "exclusion" in entity_type.lower() or is_exclusion:
                                        mapped_type = "EXCLUSION"
                                    elif any(term in entity_type.lower() for term in ["disease", "condition", "symptom"]):
                                        mapped_type = "CONDITION"
                                    elif any(term in entity_type.lower() for term in ["treatment", "medication", "therapy"]):
                                        mapped_type = "TREATMENT"
                                    elif any(term in entity_type.lower() for term in ["measurement", "value", "score"]):
                                        mapped_type = "MEASUREMENT"
                                    elif any(term in entity_type.lower() for term in ["age", "gender", "demographic"]):
                                        mapped_type = "DEMOGRAPHIC"
                                    else:
                                        mapped_type = criteria_type  # Default to file type

                                    annotations.append((mapped_type, span, entity_text))
                else:
                    # If no annotation file, create basic annotations based on file type
                    for criterion in criteria_list:
                        if criterion:
                            start = text.find(criterion)
                            if start >= 0:
                                end = start + len(criterion)
                                annotations.append((criteria_type, f"{start} {end}", criterion))

                data.append({
                    "trial_id": trial_id,
                    "criteria_type": criteria_type,
                    "text": text,
                    "annotations": annotations
                })

        return pd.DataFrame(data)

    def char_to_word_tags(self, text, annotations):
        """Convert character-level annotations to word-level tags"""
        char_tags = ['O'] * len(text)

        for tag, span, _ in annotations:
            span_parts = span.split()

            if len(span_parts) == 2 and all(part.isdigit() for part in span_parts):
                start, end = map(int, span_parts)
                if start < len(char_tags):
                    char_tags[start] = f'B-{tag}'
                    for i in range(start + 1, min(end, len(char_tags))):
                        char_tags[i] = f'I-{tag}'
            else:
                # Try to find the text in the document
                entity_text = span
                matches = list(re.finditer(re.escape(entity_text.strip()), text))
                for match in matches:
                    start, end = match.start(), match.end()
                    if start < len(char_tags):
                        char_tags[start] = f'B-{tag}'
                        for i in range(start + 1, min(end, len(char_tags))):
                            char_tags[i] = f'I-{tag}'

        words = text.split()
        word_tags = []
        char_index = 0

        for word in words:
            word_tag = 'O'
            for _ in range(min(len(word), len(text) - char_index)):
                if char_index < len(char_tags) and char_tags[char_index] != 'O':
                    word_tag = char_tags[char_index]
                    break
                char_index += 1

            # Skip any remaining characters in the word
            char_index += max(0, len(word) - (len(text) - char_index))

            word_tags.append(word_tag)

        # Ensure word_tags has same length as words
        if len(word_tags) < len(words):
            word_tags.extend(['O'] * (len(words) - len(word_tags)))

        return words, word_tags[:len(words)]

    def prepare_dataset(self, data_df):
        """Prepare dataset for training/inference"""
        processed_data = []

        for _, row in tqdm(data_df.iterrows(), total=len(data_df)):
            words, word_tags = self.char_to_word_tags(row['text'], row['annotations'])

            # Fix tags if needed
            word_tags = self.fix_bio_tags(word_tags)

            processed_data.append({
                "trial_id": row['trial_id'],
                "criteria_type": row.get('criteria_type', 'UNKNOWN'),
                "words": words,
                "tags": word_tags
            })

        return pd.DataFrame(processed_data)

    def fix_bio_tags(self, tags):
        """Fix BIO tagging issues"""
        fixed_tags = []
        prev_tag = 'O'

        for tag in tags:
            # If I- tag follows O or different entity type, convert to B-
            if tag.startswith('I-') and (prev_tag == 'O' or
                                         (prev_tag.startswith('B-') or prev_tag.startswith('I-')) and
                                         prev_tag[2:] != tag[2:]):
                tag = f'B-{tag[2:]}'

            fixed_tags.append(tag)
            prev_tag = tag

        return fixed_tags

    def tokenize_and_align_labels(self, sentence, text_labels):
        """Tokenize words and align labels"""
        tokenized_inputs = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=MAX_LEN,
            is_split_into_words=True,
            return_tensors="pt"
        )

        word_ids = tokenized_inputs.word_ids()
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens get -100
            else:
                if word_idx < len(text_labels):
                    label_ids.append(TAG_MAP.get(text_labels[word_idx], 0))
                else:
                    label_ids.append(-100)

        return tokenized_inputs, torch.tensor(label_ids)

    def create_dataloaders(self, df, batch_size=BATCH_SIZE):
        """Create PyTorch DataLoaders"""
        input_ids = []
        attention_masks = []
        token_type_ids = []
        label_ids = []
        trial_ids = []
        criteria_types = []

        for _, row in tqdm(df.iterrows(), total=len(df)):
            tokenized_inputs, labels = self.tokenize_and_align_labels(row['words'], row['tags'])

            input_ids.append(tokenized_inputs['input_ids'][0])
            attention_masks.append(tokenized_inputs['attention_mask'][0])
            token_type_ids.append(tokenized_inputs['token_type_ids'][0])
            label_ids.append(labels)
            trial_ids.append(row['trial_id'])
            criteria_types.append(row['criteria_type'])

        input_ids = torch.stack(input_ids)
        attention_masks = torch.stack(attention_masks)
        token_type_ids = torch.stack(token_type_ids)
        label_ids = torch.stack(label_ids)

        dataset = TensorDataset(input_ids, attention_masks, token_type_ids, label_ids)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        return dataloader, trial_ids, criteria_types

    def train(self, train_df, val_df, epochs=5, learning_rate=5e-5):
        """Train the model"""
        train_dataloader, _, _ = self.create_dataloaders(train_df)
        val_dataloader, _, _ = self.create_dataloaders(val_df)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)
        total_steps = len(train_dataloader) * epochs
        scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.0, total_iters=total_steps)

        for epoch in range(epochs):
            # Training
            self.model.train()
            total_loss = 0

            for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
                batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids, b_attention_mask, b_token_type_ids, b_labels = batch

                self.model.zero_grad()
                outputs = self.model(
                    input_ids=b_input_ids,
                    attention_mask=b_attention_mask,
                    token_type_ids=b_token_type_ids,
                    labels=b_labels
                )

                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()

                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

            avg_train_loss = total_loss / len(train_dataloader)
            print(f"Average training loss: {avg_train_loss:.4f}")

            # Validation
            self.model.eval()
            val_loss = 0
            predictions, true_labels = [], []

            for batch in tqdm(val_dataloader, desc="Validation"):
                batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids, b_attention_mask, b_token_type_ids, b_labels = batch

                with torch.no_grad():
                    outputs = self.model(
                        input_ids=b_input_ids,
                        attention_mask=b_attention_mask,
                        token_type_ids=b_token_type_ids,
                        labels=b_labels
                    )

                val_loss += outputs.loss.item()

                logits = outputs.logits
                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
                true_labels.extend([list(l) for l in label_ids])

            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Validation loss: {avg_val_loss:.4f}")

        print("Training complete!")
        return self.model

    def predict(self, test_df):
        """Make predictions on test data"""
        self.model.eval()
        test_dataloader, trial_ids, criteria_types = self.create_dataloaders(test_df, batch_size=BATCH_SIZE)

        predictions = []

        for batch in tqdm(test_dataloader, desc="Prediction"):
            batch = tuple(t.to(DEVICE) for t in batch)
            b_input_ids, b_attention_mask, b_token_type_ids, _ = batch

            with torch.no_grad():
                outputs = self.model(
                    input_ids=b_input_ids,
                    attention_mask=b_attention_mask,
                    token_type_ids=b_token_type_ids
                )

            logits = outputs.logits
            logits = logits.detach().cpu().numpy()
            batch_predictions = np.argmax(logits, axis=2)

            predictions.extend([list(p) for p in batch_predictions])

        return predictions, trial_ids, criteria_types

    def extract_entities(self, words, predicted_tags):
        """Extract entities from predicted tags"""
        entities = {
            "INCLUSION": [],
            "EXCLUSION": [],
            "CONDITION": [],
            "TREATMENT": [],
            "MEASUREMENT": [],
            "DEMOGRAPHIC": []
        }

        current_entity = None
        current_tokens = []

        for word, tag_id in zip(words, predicted_tags):
            if tag_id == -100:  # Special token, skip
                continue

            tag = INVERSE_TAG_MAP.get(tag_id, 'O')

            if tag.startswith('B-'):
                if current_entity and current_tokens:
                    entities[current_entity].append(' '.join(current_tokens))

                current_entity = tag[2:]
                current_tokens = [word]

            elif tag.startswith('I-') and current_tokens:
                current_tokens.append(word)

            elif tag == 'O':
                if current_entity and current_tokens:
                    entities[current_entity].append(' '.join(current_tokens))
                    current_entity = None
                    current_tokens = []

        # Add the last entity if it exists
        if current_entity and current_tokens:
            entities[current_entity].append(' '.join(current_tokens))

        return entities

    def calculate_sdi_scores(self, patient_data, trial_entities):
        """Calculate SDI (Semantic Differential Index) scores"""
        # This is a simplified scoring algorithm
        # In a real implementation, you'd want to use more sophisticated NLP techniques

        inclusion_score = 0
        exclusion_score = 0

        # Check inclusion criteria
        if trial_entities["INCLUSION"]:
            matched_inclusions = 0
            for criterion in trial_entities["INCLUSION"]:
                # Check if criterion is met in patient data
                if any(criterion.lower() in data.lower() for data in patient_data):
                    matched_inclusions += 1

            inclusion_score = matched_inclusions / len(trial_entities["INCLUSION"]) if trial_entities["INCLUSION"] else 0

        # Check exclusion criteria
        if trial_entities["EXCLUSION"]:
            matched_exclusions = 0
            for criterion in trial_entities["EXCLUSION"]:
                # Check if exclusion criterion is met (which is bad)
                if any(criterion.lower() in data.lower() for data in patient_data):
                    matched_exclusions += 1

            exclusion_score = matched_exclusions / len(trial_entities["EXCLUSION"]) if trial_entities["EXCLUSION"] else 0

        return inclusion_score, exclusion_score

    def determine_match_rating(self, inclusion_score, exclusion_score):
        """Determine overall match rating based on inclusion and exclusion scores"""
        if inclusion_score >= 0.6 and exclusion_score <= 0.2:
            return "Good Match"
        elif inclusion_score >= 0.4 and exclusion_score <= 0.3:
            return "Moderate Match"
        else:
            return "Weak Match"

    def process_patient_against_trials(self, processed_df, patient_data):
        """Process clinical trials and calculate match scores for a patient"""
        predictions, trial_ids, criteria_types = self.predict(processed_df)

        # Group by trial ID
        trial_entities = {}

        for i, (trial_id, prediction, criteria_type) in enumerate(zip(trial_ids, predictions, criteria_types)):
            words = processed_df.iloc[i]['words']
            extracted = self.extract_entities(words, prediction)

            if trial_id not in trial_entities:
                trial_entities[trial_id] = {
                    "INCLUSION": [],
                    "EXCLUSION": [],
                    "CONDITION": [],
                    "TREATMENT": [],
                    "MEASUREMENT": [],
                    "DEMOGRAPHIC": []
                }

            # Append entities to the corresponding trial
            for entity_type, entities in extracted.items():
                trial_entities[trial_id][entity_type].extend(entities)

        # Calculate match scores
        results = []

        for trial_id, entities in trial_entities.items():
            inclusion_score, exclusion_score = self.calculate_sdi_scores(patient_data, entities)
            match_rating = self.determine_match_rating(inclusion_score, exclusion_score)

            results.append({
                "Trial ID": trial_id,
                "Inclusion SDI": round(inclusion_score, 2),
                "Exclusion SDI": round(exclusion_score, 2),
                "Overall Match": match_rating
            })

        return pd.DataFrame(results)

# Main execution
if __name__ == "__main__":
    # Path to your CHIA dataset
    folder_path = "/content/drive/MyDrive/chia_with_scope"

    # Initialize the matcher
    matcher = ClinicalTrialMatcher()

    # Load and preprocess data
    data_df = matcher.preprocess_chia_data(folder_path)

    # Print some statistics
    print(f"Loaded {len(data_df)} documents from {len(data_df['trial_id'].unique())} unique trials")

    # Split data for training
    train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

    # Prepare datasets
    train_processed = matcher.prepare_dataset(train_df)
    val_processed = matcher.prepare_dataset(val_df)
    test_processed = matcher.prepare_dataset(test_df)

    # Train the model
    matcher.train(train_processed, val_processed, epochs=3)

    # Save the model
    model_path = "/content/drive/MyDrive/clinical_trial_ner_model/"
    os.makedirs(model_path, exist_ok=True)
    matcher.model.save_pretrained(model_path)
    matcher.tokenizer.save_pretrained(model_path)
    print(f"Model saved to {model_path}")

    # Example patient data (in a real scenario, this would come from patient records)
    patient_data = [
        "67-year-old male with stage 2 hypertension",
        "History of myocardial infarction 5 years ago",
        "Current medications: paracetamol, lisinopril, metoprolol",
        "Liver function tests: ASAT 45 U/L, ALAT 42 U/L, bilirubin 0.8 mg/dL",
        "Received corticosteroids 7 days ago",
        "No known drug allergies",
        "BMI 28.5, non-smoker"
    ]

    # Process trials for this patient
    results = matcher.process_patient_against_trials(test_processed, patient_data)

    # Display results
    print("\nClinical Trial Matching Results:")
    print(results.to_string(index=False))

    # Export results to CSV
    results.to_csv("/content/drive/MyDrive/clinical_trial_matches.csv", index=False)
    print("\nResults exported to /content/drive/MyDrive/clinical_trial_matches.csv")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized new model
Loading CHIA dataset...


100%|██████████| 4004/4004 [03:51<00:00, 17.30it/s] 


Loaded 2000 documents from 1000 unique trials


100%|██████████| 1280/1280 [00:00<00:00, 2138.26it/s]
100%|██████████| 320/320 [00:00<00:00, 1824.93it/s]
100%|██████████| 400/400 [00:00<00:00, 1936.08it/s]
100%|██████████| 1280/1280 [00:04<00:00, 273.39it/s]
100%|██████████| 320/320 [00:00<00:00, 596.24it/s]
Epoch 1/3: 100%|██████████| 80/80 [31:07<00:00, 23.34s/it]


Average training loss: 0.0715


Validation: 100%|██████████| 20/20 [02:12<00:00,  6.62s/it]


Validation loss: 0.0001


Epoch 2/3: 100%|██████████| 80/80 [29:19<00:00, 21.99s/it]


Average training loss: 0.0002


Validation: 100%|██████████| 20/20 [02:12<00:00,  6.64s/it]


Validation loss: 0.0001


Epoch 3/3: 100%|██████████| 80/80 [29:11<00:00, 21.89s/it]


Average training loss: 0.0002


Validation: 100%|██████████| 20/20 [02:15<00:00,  6.79s/it]


Validation loss: 0.0001
Training complete!
Model saved to /content/drive/MyDrive/clinical_trial_ner_model/


100%|██████████| 400/400 [00:00<00:00, 654.01it/s]
Prediction: 100%|██████████| 25/25 [02:52<00:00,  6.89s/it]


Clinical Trial Matching Results:
   Trial ID  Inclusion SDI  Exclusion SDI Overall Match
NCT01991743              0              0    Weak Match
NCT03472508              0              0    Weak Match
NCT02573597              0              0    Weak Match
NCT03029078              0              0    Weak Match
NCT02552459              0              0    Weak Match
NCT02541955              0              0    Weak Match
NCT03062358              0              0    Weak Match
NCT01680081              0              0    Weak Match
NCT03177811              0              0    Weak Match
NCT02570321              0              0    Weak Match
NCT03169127              0              0    Weak Match
NCT02553226              0              0    Weak Match
NCT02371200              0              0    Weak Match
NCT02781610              0              0    Weak Match
NCT03493919              0              0    Weak Match
NCT03366779              0              0    Weak Match
NCT02920177   




In [7]:
# Load the saved model
model_path = "/content/drive/MyDrive/clinical_trial_ner_model/"
matcher = ClinicalTrialMatcher(model_path=model_path)

Loaded model from /content/drive/MyDrive/clinical_trial_ner_model/


In [8]:
patient_data = [
    "55-year-old female with type 2 diabetes",
    "HbA1c level of 7.5%",
    "BMI of 30.2, classified as obese",
    "No history of cardiovascular disease",
    "Not currently pregnant or planning to become pregnant",
    "No known allergies to insulin or related medications",
    "Willing to participate in a 12-month clinical trial"
]

In [9]:
# Process the patient data against the test dataset
results = matcher.process_patient_against_trials(test_processed, patient_data)

# Display results
print("\nClinical Trial Matching Results:")
print(results.to_string(index=False))

# Export results to CSV
results.to_csv("/content/drive/MyDrive/clinical_trial_matches.csv", index=False)
print("\nResults exported to /content/drive/MyDrive/clinical_trial_matches.csv")

100%|██████████| 400/400 [00:00<00:00, 400.53it/s]
Prediction: 100%|██████████| 25/25 [02:58<00:00,  7.14s/it]


Clinical Trial Matching Results:
   Trial ID  Inclusion SDI  Exclusion SDI Overall Match
NCT01991743              0              0    Weak Match
NCT03472508              0              0    Weak Match
NCT02573597              0              0    Weak Match
NCT03029078              0              0    Weak Match
NCT02552459              0              0    Weak Match
NCT02541955              0              0    Weak Match
NCT03062358              0              0    Weak Match
NCT01680081              0              0    Weak Match
NCT03177811              0              0    Weak Match
NCT02570321              0              0    Weak Match
NCT03169127              0              0    Weak Match
NCT02553226              0              0    Weak Match
NCT02371200              0              0    Weak Match
NCT02781610              0              0    Weak Match
NCT03493919              0              0    Weak Match
NCT03366779              0              0    Weak Match
NCT02920177   




In [11]:
sample_text = "Patients with type 2 diabetes and HbA1c > 7% are eligible for this trial."
words = sample_text.split()
# Create a DataFrame with dummy trial_id and criteria_type to avoid KeyError
# This should be consistent with how your training/test data is formatted.
input_df = pd.DataFrame([{"words": words, "tags": ["O"] * len(words), "trial_id": "dummy_id", "criteria_type": "UNKNOWN"}])
predicted_tags = matcher.predict(input_df)[0][0]
entities = matcher.extract_entities(words, predicted_tags)
print(entities)

100%|██████████| 1/1 [00:00<00:00, 198.59it/s]
Prediction: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]

{'INCLUSION': [], 'EXCLUSION': [], 'CONDITION': [], 'TREATMENT': [], 'MEASUREMENT': [], 'DEMOGRAPHIC': []}





In [13]:
print(predicted_tags)

[np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)