In [14]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

# Load the models and tokenizer
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# Load the patient EHR data
patient_data_path = '/content/drive/MyDrive/CCDA_Patient_Records_Fixed.txt'
with open(patient_data_path, 'r') as file:
    patient_data = file.readlines()

# Load the clinical trial data
trial_data_path = '/content/drive/MyDrive/chia_with_scope'
with open(trial_data_path, 'r') as file:
    trial_data = file.readlines()

# Function to encode text using BERT
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate SDI
def calculate_sdi(embedding1, embedding2):
    return cosine(embedding1, embedding2)

# Function to calculate Jaccard similarity
def calculate_jaccard(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Preprocess and encode the data
patient_embeddings = {}
trial_embeddings = {}

for line in patient_data:
    patient_id, text = line.strip().split('\t')
    patient_embeddings[patient_id] = encode_text(text, tokenizer_biobert, model_biobert)

for line in trial_data:
    trial_id, text = line.strip().split('\t')
    trial_embeddings[trial_id] = encode_text(text, tokenizer_clinical, model_clinical)

# Calculate SDI and Jaccard similarity
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        sdi = calculate_sdi(patient_embedding, trial_embedding)
        jaccard = calculate_jaccard(set(patient_embedding.flatten()), set(trial_embedding.flatten()))

        # Determine the overall match
        overall_match = 'Strong Match' if sdi < 0.5 and jaccard > 0.7 else 'Weak Match'

        results.append([patient_id, trial_id, sdi, jaccard, overall_match])

# Print the results
print("Patient ID\tTrial ID\tInclusion SDI\tExclusion SDI\tOverall Match")
for result in results:
    print(f"{result[0]}\t{result[1]}\t{result[2]}\t{result[3]}\t{result[4]}")

# Determine which model is better
# This is a heuristic approach; you might need more sophisticated methods for real-world applications
biobert_scores = [result[2] for result in results if 'exc' in result[1]]
clinical_scores = [result[2] for result in results if 'inc' in result[1]]

if np.mean(biobert_scores) < np.mean(clinical_scores):
    print("Biobert model is better for exclusion criteria.")
else:
    print("Clinical trial NER model is better for inclusion criteria.")

Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IsADirectoryError: [Errno 21] Is a directory: '/content/drive/MyDrive/chia_with_scope'

In [7]:
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

# Load the models and tokenizer
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# Load the patient EHR data
patient_data_path = '/content/drive/MyDrive/CCDA_Patient_Records_Fixed.txt'
with open(patient_data_path, 'r') as file:
    patient_data = file.readlines()

# Load the clinical trial data
trial_data_path = '/content/drive/MyDrive/chia_with_scope'
with open(trial_data_path, 'r') as file:
    trial_data = file.readlines()

# Function to encode text using BERT
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate SDI
def calculate_sdi(embedding1, embedding2):
    return cosine(embedding1, embedding2)

# Function to calculate Jaccard similarity
def calculate_jaccard(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Preprocess and encode the data
patient_embeddings = {}
trial_embeddings = {}

for line in patient_data:
    patient_id, text = line.strip().split('\t')
    patient_embeddings[patient_id] = encode_text(text, tokenizer_biobert, model_biobert)

for line in trial_data:
    trial_id, text = line.strip().split('\t')
    trial_embeddings[trial_id] = encode_text(text, tokenizer_clinical, model_clinical)

# Calculate SDI and Jaccard similarity
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        sdi = calculate_sdi(patient_embedding, trial_embedding)
        jaccard = calculate_jaccard(set(patient_embedding.flatten()), set(trial_embedding.flatten()))

        # Determine the overall match
        overall_match = 'Strong Match' if sdi < 0.5 and jaccard > 0.7 else 'Weak Match'

        results.append([patient_id, trial_id, sdi, jaccard, overall_match])

# Print the results
print("Patient ID\tTrial ID\tInclusion SDI\tExclusion SDI\tOverall Match")
for result in results:
    print(f"{result[0]}\t{result[1]}\t{result[2]}\t{result[3]}\t{result[4]}")

# Determine which model is better
# This is a heuristic approach; you might need more sophisticated methods for real-world applications
biobert_scores = [result[2] for result in results if 'exc' in result[1]]
clinical_scores = [result[2] for result in results if 'inc' in result[1]]

if np.mean(biobert_scores) < np.mean(clinical_scores):
    print("Biobert model is better for exclusion criteria.")
else:
    print("Clinical trial NER model is better for inclusion criteria.")

Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IsADirectoryError: [Errno 21] Is a directory: '/content/drive/MyDrive/chia_with_scope'

In [8]:
import os
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Load the models and tokenizer
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Load the patient EHR data
patient_data_path = '/content/drive/MyDrive/CCDA_Patient_Records_Fixed.txt'
with open(patient_data_path, 'r') as file:
    patient_data = file.readlines()

# ✅ Load clinical trial data from all files in the folder
trial_data_folder = '/content/drive/MyDrive/chia_with_scope'
trial_data = []

# Iterate through all files in the folder
for filename in os.listdir(trial_data_folder):
    if filename.endswith('.txt'):  # Filter only text files
        with open(os.path.join(trial_data_folder, filename), 'r') as file:
            trial_data.extend(file.readlines())

# ✅ Function to encode text using BERT
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# ✅ Function to calculate cosine similarity (better than cosine distance)
def calculate_similarity(embedding1, embedding2):
    similarity = cosine_similarity(embedding1, embedding2)[0][0]
    return similarity

# ✅ Preprocess and encode the data
patient_embeddings = {}
trial_embeddings = {}

# Encode patients
for line in patient_data:
    if '\t' in line:
        patient_id, text = line.strip().split('\t')
        patient_embeddings[patient_id] = encode_text(text, tokenizer_biobert, model_biobert)

# Encode trials
for line in trial_data:
    if '\t' in line:
        trial_id, text = line.strip().split('\t')
        trial_embeddings[trial_id] = encode_text(text, tokenizer_clinical, model_clinical)

# ✅ Calculate similarity
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        similarity = calculate_similarity(patient_embedding, trial_embedding)

        # Threshold for match criteria
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'

        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Print the results
print("Patient ID\tTrial ID\tSimilarity\tOverall Match")
for result in results:
    print(f"{result[0]}\t{result[1]}\t{result[2]:.4f}\t{result[3]}")

# ✅ Determine which model is better
# Separate inclusion and exclusion scores based on naming convention
biobert_scores = [result[2] for result in results if 'exc' in result[1].lower()]
clinical_scores = [result[2] for result in results if 'inc' in result[1].lower()]

if np.mean(biobert_scores) > np.mean(clinical_scores):
    print("\nBioBERT model is better for exclusion criteria.")
else:
    print("\nClinical trial NER model is better for inclusion criteria.")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: not enough values to unpack (expected 2, got 1)

In [9]:
import os
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Load the models and tokenizer
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

# Load Tokenizers and Models
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Load the patient EHR data
patient_data_path = '/content/drive/MyDrive/CCDA_Patient_Records_Fixed.txt'
with open(patient_data_path, 'r') as file:
    patient_data = file.readlines()

# ✅ Load clinical trial data from all files in the folder
trial_data_folder = '/content/drive/MyDrive/chia_with_scope'
trial_data = []

# Iterate through all files in the folder
for filename in os.listdir(trial_data_folder):
    if filename.endswith('.txt'):  # Filter only text files
        with open(os.path.join(trial_data_folder, filename), 'r') as file:
            trial_data.extend(file.readlines())

# ✅ Function to encode text using BERT
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).numpy()

# ✅ Function to calculate cosine similarity
def calculate_similarity(embedding1, embedding2):
    """Calculates cosine similarity between two embeddings."""
    similarity = cosine_similarity(embedding1, embedding2)[0][0]
    return similarity

# ✅ Preprocess and encode the data with error handling
patient_embeddings = {}
trial_embeddings = {}

# Encode patients with validation
for line in patient_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:  # Ensure there are exactly 2 values
        patient_id, text = parts
        patient_embeddings[patient_id] = encode_text(text, tokenizer_biobert, model_biobert)
    else:
        print(f"Skipping invalid patient line: {line.strip()}")

# Encode trials with validation
for line in trial_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:  # Ensure there are exactly 2 values
        trial_id, text = parts
        trial_embeddings[trial_id] = encode_text(text, tokenizer_clinical, model_clinical)
    else:
        print(f"Skipping invalid trial line: {line.strip()}")

# ✅ Calculate similarity
results = []

# Match patients to trials
for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():

        # Calculate similarity
        similarity = calculate_similarity(patient_embedding, trial_embedding)

        # Determine match quality
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'

        # Store the result
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Print the results
print("\nPatient ID\tTrial ID\tSimilarity\tOverall Match")
for result in results:
    print(f"{result[0]}\t{result[1]}\t{result[2]:.4f}\t{result[3]}")

# ✅ Determine which model is better
# Separate inclusion and exclusion scores based on naming convention
biobert_scores = [result[2] for result in results if 'exc' in result[1].lower()]
clinical_scores = [result[2] for result in results if 'inc' in result[1].lower()]

# Check for empty scores to avoid division errors
if biobert_scores and clinical_scores:
    if np.mean(biobert_scores) > np.mean(clinical_scores):
        print("\nBioBERT model is better for exclusion criteria.")
    else:
        print("\nClinical trial NER model is better for inclusion criteria.")
else:
    print("\nInsufficient data to determine model performance.")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Skipping invalid patient line: Patient Health Records - CCDA Format
Skipping invalid patient line: 
Skipping invalid patient line: Patient ID: 75b72703-7516-472e-b73f-c539638a5955
Skipping invalid patient line: Full Name: Joaquin141 Rice937
Skipping invalid patient line: Gender: M
Skipping invalid patient line: Birth Date: 19620827064010
Skipping invalid patient line: Location: Springfield, Massachusetts, N/A
Skipping invalid patient line: Phone: N/A
Skipping invalid patient line: Medical Conditions: Allergy to substance, Allergy to substance, Allergy to substance, Body mass index 30+ - obesity (finding), Diabetes, Hyperglycemia (disorder), Hypertriglyceridemia (disorder), Metabolic syndrome X (disorder), Prediabetes, Anemia (disorder), Diabetic retinopathy associated with type II diabetes mellitus (disorder), Hyperlipidemia, Viral sinusitis (disorder), Acute bronchitis (disorder), Acute viral pharyngitis (disorder), Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus

In [10]:
import os
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Load the models and tokenizer
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

# Load Tokenizers and Models
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Load the patient EHR data
patient_data_path = '/content/drive/MyDrive/CCDA_Patient_Records_Fixed.txt'
with open(patient_data_path, 'r') as file:
    patient_data = file.readlines()

# ✅ Load clinical trial data from all files in the folder
trial_data_folder = '/content/drive/MyDrive/chia_with_scope'
trial_data = []

# Iterate through all files in the folder
for filename in os.listdir(trial_data_folder):
    if filename.endswith('.txt'):  # Filter only text files
        with open(os.path.join(trial_data_folder, filename), 'r') as file:
            trial_data.extend(file.readlines())

# ✅ Function to encode text using BERT
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).numpy()

# ✅ Function to calculate cosine similarity
def calculate_similarity(embedding1, embedding2):
    """Calculates cosine similarity between two embeddings."""
    similarity = cosine_similarity(embedding1, embedding2)[0][0]
    return similarity

# ✅ Preprocess and encode the data with error handling
patient_embeddings = {}
trial_embeddings = {}

# Encode patients with validation
for line in patient_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:  # Ensure there are exactly 2 values
        patient_id, text = parts
        patient_embeddings[patient_id] = encode_text(text, tokenizer_biobert, model_biobert)
    else:
        print(f"Skipping invalid patient line: {line.strip()}")

# Encode trials with validation
for line in trial_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:  # Ensure there are exactly 2 values
        trial_id, text = parts
        trial_embeddings[trial_id] = encode_text(text, tokenizer_clinical, model_clinical)
    else:
        print(f"Skipping invalid trial line: {line.strip()}")

# ✅ Calculate similarity
results = []

# Match patients to trials
for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():

        # Calculate similarity
        similarity = calculate_similarity(patient_embedding, trial_embedding)

        # Determine match quality
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'

        # Store the result
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Print the results
print("\nPatient ID\tTrial ID\tSimilarity\tOverall Match")
for result in results:
    print(f"{result[0]}\t{result[1]}\t{result[2]:.4f}\t{result[3]}")

# ✅ Model Evaluation with Robust Logic
# Initialize empty lists
biobert_scores = []
clinical_scores = []

# Automatically categorize based on trial ID patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        # If no clear pattern, consider as general match
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Fallback condition if patterns are missing
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\nModel Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    # Fallback: Display average similarity if no patterns found
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n⚠️ No clear patterns found. Average similarity across all trials: {avg_similarity:.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Skipping invalid patient line: Patient Health Records - CCDA Format
Skipping invalid patient line: 
Skipping invalid patient line: Patient ID: 75b72703-7516-472e-b73f-c539638a5955
Skipping invalid patient line: Full Name: Joaquin141 Rice937
Skipping invalid patient line: Gender: M
Skipping invalid patient line: Birth Date: 19620827064010
Skipping invalid patient line: Location: Springfield, Massachusetts, N/A
Skipping invalid patient line: Phone: N/A
Skipping invalid patient line: Medical Conditions: Allergy to substance, Allergy to substance, Allergy to substance, Body mass index 30+ - obesity (finding), Diabetes, Hyperglycemia (disorder), Hypertriglyceridemia (disorder), Metabolic syndrome X (disorder), Prediabetes, Anemia (disorder), Diabetic retinopathy associated with type II diabetes mellitus (disorder), Hyperlipidemia, Viral sinusitis (disorder), Acute bronchitis (disorder), Acute viral pharyngitis (disorder), Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [11]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Load the models and tokenizer
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

# Load Tokenizers and Models
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Load the actual clinical trial data
trial_data_folder = '/content/drive/MyDrive/chia_with_scope'
trial_data = []

# Iterate through all files in the folder
for filename in os.listdir(trial_data_folder):
    if filename.endswith('.txt'):  # Filter only text files
        with open(os.path.join(trial_data_folder, filename), 'r') as file:
            trial_data.extend(file.readlines())

# ✅ Function to encode text using BERT
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).numpy()

# ✅ Function to calculate cosine similarity
def calculate_similarity(embedding1, embedding2):
    """Calculates cosine similarity between two embeddings."""
    similarity = cosine_similarity(embedding1, embedding2)[0][0]
    return similarity

# ✅ Preprocess the real data with error handling
trial_embeddings = {}

# Process real trial data
for line in trial_data:
    parts = line.strip().split('\t')

    # Only process lines with valid format
    if len(parts) == 2:
        trial_id, text = parts
        trial_embeddings[trial_id] = encode_text(text, tokenizer_clinical, model_clinical)
    else:
        print(f"Skipping invalid trial line: {line.strip()}")

# ✅ Generate synthetic sample patient data
def generate_synthetic_patients(num_patients=5):
    """Generates synthetic patient data."""
    patients = [f"P{i+1}" for i in range(num_patients)]
    patient_data = [f"{p}\tPatient record with random symptoms {p}" for p in patients]
    return patient_data

# Generate sample patient data
patient_data = generate_synthetic_patients()

# Encode patients
patient_embeddings = {}

for line in patient_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:  # Ensure valid format
        patient_id, text = parts
        patient_embeddings[patient_id] = encode_text(text, tokenizer_biobert, model_biobert)
    else:
        print(f"Skipping invalid patient line: {line.strip()}")

# ✅ Calculate similarity between patients and trials
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():

        # Calculate similarity
        similarity = calculate_similarity(patient_embedding, trial_embedding)

        # Determine match quality
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'

        # Store the result
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Display the results in a DataFrame
results_df = pd.DataFrame(results, columns=['Patient ID', 'Trial ID', 'Similarity', 'Overall Match'])
print("\n✅ Results Summary Table:")
print(results_df)

# ✅ Model Evaluation with Robust Logic
# Initialize empty lists
biobert_scores = []
clinical_scores = []

# Automatically categorize based on trial ID patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        # If no clear pattern, consider as general match
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Fallback condition if patterns are missing
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\n✅ Model Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    # Fallback: Display average similarity if no patterns found
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n✅ Average similarity across all trials: {avg_similarity:.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping invalid trial line: 5. Active and uncontrolled malignancy
Skipping invalid trial line: 6. Pregnant or lactating
Skipping invalid trial line: 7. Unable to wean steroids to ≤0.5 mg/kg/day prednisone.
Skipping invalid trial line: 8. Patients with Grade 3 hyperbilirubinemia
Skipping invalid trial line: 
Skipping invalid trial line: 1. Diagnosis of primary immunodeficiency with established plan to undergo myeloablative or non-myeloablative allogeneic hematopoietic stem cell transplant for treatment thereof or diagnosis of a form of primary immunodeficiency for which hematopoietic stem cell transplantation is not indicated.
Skipping invalid trial line: 2. Active infection with EBV, CMV, and/or Adenovirus, unable to be successfully controlled with standard therapy.
Skipping invalid trial line: 3. Steroids less than 0.5 mg/kg/day prednisone
Skipping invalid trial line: 4. Karnofsky/Lansky score of ≥ 50
Skipping invalid t

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [12]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Load the models and tokenizer
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

# Load Tokenizers and Models
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Function to load and clean the real dataset
def load_and_clean_data(folder_path, sample_size=10):
    """Reads and cleans the real dataset from the chai_with_scope folder."""
    trial_data = []

    # Iterate through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r') as file:
                lines = file.readlines()

                # Filter valid lines containing at least 2 tab-separated values
                for line in lines:
                    if '\t' in line and len(line.strip().split('\t')) == 2:
                        trial_data.append(line.strip())

    # Use only a sample of the dataset (if large)
    if len(trial_data) > sample_size:
        trial_data = np.random.choice(trial_data, sample_size, replace=False).tolist()

    print(f"\n✅ Loaded {len(trial_data)} valid trial records.")

    return trial_data

# ✅ Function to generate synthetic patient data
def generate_synthetic_patients(num_patients=10):
    """Generates synthetic patient records."""
    patients = [f"P{i+1}" for i in range(num_patients)]
    patient_data = [f"{p}\tPatient with condition related to trial {p}" for p in patients]

    print(f"\n✅ Generated {len(patient_data)} synthetic patient records.")

    return patient_data

# ✅ Function to encode text using BERT
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()
    except Exception as e:
        print(f"⚠️ Error encoding text: {text[:30]}... => {e}")
        return None

# ✅ Function to calculate cosine similarity
def calculate_similarity(embedding1, embedding2):
    """Calculates cosine similarity between two embeddings."""
    similarity = cosine_similarity(embedding1, embedding2)[0][0]
    return similarity

# ✅ Load and clean the real trial data
trial_data_folder = '/content/drive/MyDrive/chia_with_scope'
real_trial_data = load_and_clean_data(trial_data_folder)

# ✅ Generate synthetic patient data
patient_data = generate_synthetic_patients()

# ✅ Preprocess and encode the data with error handling
patient_embeddings = {}
trial_embeddings = {}

# Encode patient data
for line in patient_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:
        patient_id, text = parts
        embedding = encode_text(text, tokenizer_biobert, model_biobert)
        if embedding is not None:
            patient_embeddings[patient_id] = embedding

# Encode trial data
for line in real_trial_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:
        trial_id, text = parts
        embedding = encode_text(text, tokenizer_clinical, model_clinical)
        if embedding is not None:
            trial_embeddings[trial_id] = embedding

# ✅ Verify Data Loading
print(f"\n✅ Patient Embeddings: {len(patient_embeddings)}")
print(f"✅ Trial Embeddings: {len(trial_embeddings)}")

# ✅ Calculate similarity between patients and trials
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        # Calculate similarity
        similarity = calculate_similarity(patient_embedding, trial_embedding)
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'

        # Store the result
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Display the results in a DataFrame
if results:
    results_df = pd.DataFrame(results, columns=['Patient ID', 'Trial ID', 'Similarity', 'Overall Match'])
    print("\n✅ Results Summary Table:")
    print(results_df)
else:
    print("\n⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.")

# ✅ Model Evaluation with Robust Logic
biobert_scores = []
clinical_scores = []

# Automatically categorize based on trial ID patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        # If no clear pattern, consider as general match
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Fallback condition if patterns are missing
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\n✅ Model Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n✅ Average similarity across all trials: {avg_similarity:.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Loaded 0 valid trial records.

✅ Generated 10 synthetic patient records.

✅ Patient Embeddings: 10
✅ Trial Embeddings: 0

⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.

✅ Average similarity across all trials: nan


In [13]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Paths
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'
cleaned_trial_data_file = '/content/drive/MyDrive/chia_with_scope_cleaned.txt'

# ✅ Load Tokenizers and Models
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Function to encode text using BERT
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()
    except Exception as e:
        print(f"⚠️ Error encoding text: {text[:30]}... => {e}")
        return None

# ✅ Load the cleaned dataset
def load_cleaned_trial_data(filepath):
    """Loads the cleaned trial data into a dictionary."""
    trial_embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                trial_id, text = parts
                embedding = encode_text(text, tokenizer_clinical, model_clinical)
                if embedding is not None:
                    trial_embeddings[trial_id] = embedding
    print(f"\n✅ Loaded {len(trial_embeddings)} valid trial records.")
    return trial_embeddings

# ✅ Generate synthetic patient data
def generate_synthetic_patients(num_patients=10):
    """Generates synthetic patient records."""
    patients = [f"P{i+1}" for i in range(num_patients)]
    patient_data = [f"{p}\tPatient with condition related to trial {p}" for p in patients]

    print(f"\n✅ Generated {len(patient_data)} synthetic patient records.")

    return patient_data

# ✅ Load trial data
trial_embeddings = load_cleaned_trial_data(cleaned_trial_data_file)

# ✅ Generate and encode synthetic patient data
patient_data = generate_synthetic_patients()
patient_embeddings = {}

for line in patient_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:
        patient_id, text = parts
        embedding = encode_text(text, tokenizer_biobert, model_biobert)
        if embedding is not None:
            patient_embeddings[patient_id] = embedding

# ✅ Calculate similarity between patients and trials
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        # Calculate similarity
        similarity = cosine_similarity(patient_embedding, trial_embedding)[0][0]
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Display the results
if results:
    results_df = pd.DataFrame(results, columns=['Patient ID', 'Trial ID', 'Similarity', 'Overall Match'])
    print("\n✅ Results Summary Table:")
    print(results_df)
else:
    print("\n⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.")

# ✅ Model Evaluation with Robust Logic
biobert_scores = []
clinical_scores = []

# Automatically categorize based on trial ID patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Fallback condition if patterns are missing
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\n✅ Model Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n✅ Average similarity across all trials: {avg_similarity:.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/chia_with_scope_cleaned.txt'

In [15]:
# ✅ Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# ✅ Step 2: Extract the ZIP File
import zipfile
import os

# Replace this with the path to your ZIP file in Google Drive
zip_path = '# ✅ Step 1: Mount Google Drive'
from google.colab import drive
drive.mount('/content/drive')

# ✅ Step 2: Extract the ZIP File
import zipfile
import os

# Replace this with the path to your ZIP file in Google Drive
zip_path = '/content/drive/MyDrive/chia_with_scope.zip'
extract_folder = '/content/drive/MyDrive/chia_with_scope_extracted'

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print(f"✅ Extracted to {extract_folder}")

# ✅ Step 3: Clean the Extracted Dataset
# Paths
output_file = '/content/drive/MyDrive/chia_with_scope_cleaned.txt'

# Function to clean and reformat the dataset
def clean_dataset(input_folder, output_file):
    """Reads and cleans the trial data."""
    cleaned_data = []

    for filename in os.listdir(input_folder):
        if filename.endswith('.txt'):
            file_path = os.path.join(input_folder, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

                for line in lines:
                    # Split line into parts
                    parts = line.strip().split('\t')

                    # Ensure it contains Trial ID and text
                    if len(parts) == 2:
                        trial_id, text = parts

                        # Remove empty or corrupted lines
                        if trial_id and text:
                            cleaned_data.append(f"{trial_id}\t{text}")
                    else:
                        # Log invalid lines
                        print(f"⚠️ Skipping invalid line in {filename}: {line.strip()}")

    # ✅ Save cleaned dataset
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for line in cleaned_data:
            out_file.write(line + '\n')

    print(f"\n✅ Cleaned {len(cleaned_data)} valid records saved to {output_file}.")

# ✅ Clean the dataset
clean_dataset(extract_folder, output_file)

# ✅ Step 4: Verify the Cleaned Dataset
# Display a sample of cleaned data
print("\n✅ Sample cleaned trial records:")
with open(output_file, 'r', encoding='utf-8') as file:
    for i in range(5):  # Display first 5 lines
        print(file.readline().strip())

# ✅ Final Message
print("\n🚀 Your `chia_with_scope` dataset is cleaned and ready for BioBERT and Clinical NER model processing!")
extract_folder = '/content/drive/MyDrive/chia_with_scope_extracted'

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print(f"✅ Extracted to {extract_folder}")

# ✅ Step 3: Clean the Extracted Dataset
# Paths
output_file = '/content/drive/MyDrive/chia_with_scope_cleaned.txt'

# Function to clean and reformat the dataset
def clean_dataset(input_folder, output_file):
    """Reads and cleans the trial data."""
    cleaned_data = []

    for filename in os.listdir(input_folder):
        if filename.endswith('.txt'):
            file_path = os.path.join(input_folder, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

                for line in lines:
                    # Split line into parts
                    parts = line.strip().split('\t')

                    # Ensure it contains Trial ID and text
                    if len(parts) == 2:
                        trial_id, text = parts

                        # Remove empty or corrupted lines
                        if trial_id and text:
                            cleaned_data.append(f"{trial_id}\t{text}")
                    else:
                        # Log invalid lines
                        print(f"⚠️ Skipping invalid line in {filename}: {line.strip()}")

    # ✅ Save cleaned dataset
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for line in cleaned_data:
            out_file.write(line + '\n')

    print(f"\n✅ Cleaned {len(cleaned_data)} valid records saved to {output_file}.")

# ✅ Clean the dataset
clean_dataset(extract_folder, output_file)

# ✅ Step 4: Verify the Cleaned Dataset
# Display a sample of cleaned data
print("\n✅ Sample cleaned trial records:")
with open(output_file, 'r', encoding='utf-8') as file:
    for i in range(5):  # Display first 5 lines
        print(file.readline().strip())

# ✅ Final Message
print("\n🚀 Your `chia_with_scope` dataset is cleaned and ready for BioBERT and Clinical NER model processing!")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
⚠️ Skipping invalid line in NCT02831166_inc.txt: Signed informed consent;
⚠️ Skipping invalid line in NCT02831166_inc.txt: Patient eligible for transradial and transfemoral primary percutaneous coronary intervention, being pre-requisites: (a) familiarity of the operator with the radial and femoral techniques using vascular closure devices, (b) agreement of the operator to use the access route determined by the randomization process.
⚠️ Skipping invalid line in NCT02833116_exc.txt: Patients with high intracranial pressure.
⚠️ Skipping invalid line in NCT02833116_exc.txt: Patients with Multiple Sclerosis.
⚠️ Skipping invalid line in NCT02833116_exc.txt: Patients with Guillain-Barré syndrome radiculopathy of vascular origin.
⚠️ Skipping invalid line in NCT02833116_exc.txt: Patients with previous lumbar surgery.
⚠️ Skipping invalid line in NCT02833116_exc.txt: Patients pregnant or lactating.
⚠️ Skipping invalid line in NCT028

In [16]:
# ✅ Step 1: Import Libraries
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Paths to cleaned dataset and models
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'
cleaned_trial_data_file = '/content/drive/MyDrive/chia_with_scope_cleaned.txt'

# ✅ Step 2: Load Tokenizers and Models
# Load BioBERT
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

# Load Clinical NER model
tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Step 3: Define Functions for Encoding and Similarity Calculation

# Function to encode text into BERT embeddings
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()
    except Exception as e:
        print(f"⚠️ Error encoding text: {text[:30]}... => {e}")
        return None

# Function to load the cleaned dataset into a dictionary
def load_trial_data(filepath, tokenizer, model):
    """Loads trial data into a dictionary with embeddings."""
    trial_embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                trial_id, text = parts
                embedding = encode_text(text, tokenizer, model)
                if embedding is not None:
                    trial_embeddings[trial_id] = embedding
    print(f"\n✅ Loaded {len(trial_embeddings)} valid trial records.")
    return trial_embeddings

# ✅ Step 4: Generate Synthetic Patient Data
def generate_synthetic_patients(num_patients=10):
    """Generates synthetic patient records."""
    patients = [f"P{i+1}" for i in range(num_patients)]
    patient_data = [f"{p}\tPatient with condition related to trial {p}" for p in patients]

    print(f"\n✅ Generated {len(patient_data)} synthetic patient records.")
    return patient_data

# ✅ Step 5: Load Trial Data
# Use Clinical NER model for trial encoding
trial_embeddings = load_trial_data(cleaned_trial_data_file, tokenizer_clinical, model_clinical)

# ✅ Step 6: Generate and Encode Synthetic Patient Data
# Use BioBERT model for patient encoding
patient_data = generate_synthetic_patients()
patient_embeddings = {}

for line in patient_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:
        patient_id, text = parts
        embedding = encode_text(text, tokenizer_biobert, model_biobert)
        if embedding is not None:
            patient_embeddings[patient_id] = embedding

# ✅ Step 7: Calculate Similarity Between Patients and Trials
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        # Calculate similarity
        similarity = cosine_similarity(patient_embedding, trial_embedding)[0][0]
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Step 8: Display the Results
if results:
    results_df = pd.DataFrame(results, columns=['Patient ID', 'Trial ID', 'Similarity', 'Overall Match'])

    print("\n✅ Results Summary Table:")
    print(results_df)
else:
    print("\n⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.")

# ✅ Step 9: Model Evaluation with Robust Logic
biobert_scores = []
clinical_scores = []

# Categorize scores based on inclusion/exclusion criteria patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Display Model Performance Comparison
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\n✅ Model Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n✅ Average similarity across all trials: {avg_similarity:.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Loaded 0 valid trial records.

✅ Generated 10 synthetic patient records.

⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.

✅ Average similarity across all trials: nan


In [17]:
# ✅ Step 1: Import Libraries
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Paths to Models and Data
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'
cleaned_trial_data_file = '/content/drive/MyDrive/chia_with_scope_cleaned.txt'
fixed_output_file = '/content/drive/MyDrive/chia_with_scope_fixed_all.txt'

# ✅ Step 2: Preprocess Dataset with All Valid Lines Included
def preprocess_all_lines(input_file, output_file):
    """
    Preprocess the dataset, assigning placeholder trial IDs for invalid lines
    to include all lines in the dataset.
    """
    fixed_data = []
    current_trial_id = "Unknown_Trial"  # Placeholder for invalid lines

    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

        for line in lines:
            parts = line.strip().split('\t')

            if len(parts) == 2:
                # Valid line with Trial ID and text
                current_trial_id, text = parts
                fixed_data.append(f"{current_trial_id}\t{text}")

            elif len(parts) == 1:
                # Assign invalid lines to the placeholder Trial ID
                text = parts[0]
                fixed_data.append(f"{current_trial_id}\t{text}")

            else:
                print(f"⚠️ Skipping empty line: {line.strip()}")

    # ✅ Save the preprocessed data
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for line in fixed_data:
            out_file.write(line + '\n')

    print(f"\n✅ Fixed dataset saved to {output_file}")
    print(f"✅ Included {len(fixed_data)} valid lines.")

# ✅ Preprocess the dataset to include all valid lines
preprocess_all_lines(cleaned_trial_data_file, fixed_output_file)

# ✅ Step 3: Load Models and Tokenizers
# Load BioBERT
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

# Load Clinical NER model
tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Step 4: Functions for Encoding and Similarity Calculation
# Function to encode text into BERT embeddings
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()
    except Exception as e:
        print(f"⚠️ Error encoding text: {text[:30]}... => {e}")
        return None

# Function to load trial data into dictionary with embeddings
def load_trial_data(filepath, tokenizer, model):
    """Loads trial data into dictionary with embeddings."""
    trial_embeddings = {}

    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                trial_id, text = parts
                embedding = encode_text(text, tokenizer, model)
                if embedding is not None:
                    trial_embeddings[trial_id] = embedding
    print(f"\n✅ Loaded {len(trial_embeddings)} valid trial records.")
    return trial_embeddings

# ✅ Step 5: Generate Synthetic Patient Data
def generate_synthetic_patients(num_patients=10):
    """Generates synthetic patient records."""
    patients = [f"P{i+1}" for i in range(num_patients)]
    patient_data = [f"{p}\tPatient with condition related to trial {p}" for p in patients]

    print(f"\n✅ Generated {len(patient_data)} synthetic patient records.")
    return patient_data

# ✅ Step 6: Load Preprocessed Trial Data (All Valid Lines)
trial_embeddings = load_trial_data(fixed_output_file, tokenizer_clinical, model_clinical)

# ✅ Step 7: Generate and Encode Synthetic Patient Data
patient_data = generate_synthetic_patients()
patient_embeddings = {}

for line in patient_data:
    parts = line.strip().split('\t')
    if len(parts) == 2:
        patient_id, text = parts
        embedding = encode_text(text, tokenizer_biobert, model_biobert)
        if embedding is not None:
            patient_embeddings[patient_id] = embedding

# ✅ Step 8: Calculate Similarity Between Patients and Trials
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        # Calculate similarity
        similarity = cosine_similarity(patient_embedding, trial_embedding)[0][0]
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Step 9: Display the Results
if results:
    results_df = pd.DataFrame(results, columns=['Patient ID', 'Trial ID', 'Similarity', 'Overall Match'])

    print("\n✅ Results Summary Table:")
    print(results_df)
else:
    print("\n⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.")

# ✅ Step 10: Model Evaluation with Robust Logic
biobert_scores = []
clinical_scores = []

# Categorize scores based on inclusion/exclusion criteria patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Display Model Performance Comparison
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\n✅ Model Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n✅ Average similarity across all trials: {avg_similarity:.4f}")



✅ Fixed dataset saved to /content/drive/MyDrive/chia_with_scope_fixed_all.txt
✅ Included 0 valid lines.


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Loaded 0 valid trial records.

✅ Generated 10 synthetic patient records.

⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.

✅ Average similarity across all trials: nan


In [18]:
# ✅ Step 1: Import Libraries
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Step 2: Load Models and Tokenizers
# Paths to Models (Use your Colab path here)
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

# Load BioBERT
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

# Load Clinical NER model
tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Step 3: Sample Data Generation
# Sample trial data (similar to `chai_with_scope`)
trial_data = [
    "NCT001_exc.txt\tHistory of heart attack or stroke in the past 6 months.",
    "NCT001_exc.txt\tCurrent use of chemotherapy or radiotherapy.",
    "NCT001_inc.txt\tPatients aged between 18 and 65 years.",
    "NCT001_inc.txt\tDiagnosed with early-stage breast cancer.",
    "NCT002_exc.txt\tChronic kidney disease (Stage 4 or higher).",
    "NCT002_exc.txt\tUse of narcotic analgesics more than twice per week.",
    "NCT002_inc.txt\tEligible participants must have a BMI below 30.",
    "NCT002_inc.txt\tNon-smokers for the past 5 years."
]

# Sample patient EHR data
patient_data = [
    "P1\tPatient with early-stage breast cancer, 45 years old, non-smoker.",
    "P2\tPatient with chronic kidney disease, stage 4, using narcotic analgesics weekly.",
    "P3\tPatient with no history of heart attack, 30 years old, diagnosed with high BMI.",
    "P4\tPatient with history of stroke in the last 6 months, undergoing radiotherapy.",
    "P5\tPatient aged 50, non-smoker, no history of chronic diseases."
]

# ✅ Step 4: Functions for Encoding and Similarity Calculation
# Function to encode text into BERT embeddings
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()
    except Exception as e:
        print(f"⚠️ Error encoding text: {text[:30]}... => {e}")
        return None

# ✅ Step 5: Generate Embeddings
# Load trial data into dictionary with embeddings
trial_embeddings = {}

for line in trial_data:
    trial_id, text = line.strip().split('\t')
    embedding = encode_text(text, tokenizer_clinical, model_clinical)
    if embedding is not None:
        trial_embeddings[trial_id] = embedding

# Load patient EHR data into dictionary with embeddings
patient_embeddings = {}

for line in patient_data:
    patient_id, text = line.strip().split('\t')
    embedding = encode_text(text, tokenizer_biobert, model_biobert)
    if embedding is not None:
        patient_embeddings[patient_id] = embedding

# ✅ Step 6: Calculate Similarity Between Patients and Trials
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        # Calculate cosine similarity
        similarity = cosine_similarity(patient_embedding, trial_embedding)[0][0]
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Step 7: Display the Results
if results:
    results_df = pd.DataFrame(results, columns=['Patient ID', 'Trial ID', 'Similarity', 'Overall Match'])

    print("\n✅ Results Summary Table:")
    print(results_df)
else:
    print("\n⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.")

# ✅ Step 8: Model Evaluation with Robust Logic
biobert_scores = []
clinical_scores = []

# Categorize scores based on inclusion/exclusion criteria patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Display Model Performance Comparison
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\n✅ Model Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n✅ Average similarity across all trials: {avg_similarity:.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Results Summary Table:
   Patient ID        Trial ID  Similarity Overall Match
0          P1  NCT001_exc.txt    0.019342    Weak Match
1          P1  NCT001_inc.txt    0.019599    Weak Match
2          P1  NCT002_exc.txt    0.016947    Weak Match
3          P1  NCT002_inc.txt    0.018551    Weak Match
4          P2  NCT001_exc.txt    0.031270    Weak Match
5          P2  NCT001_inc.txt    0.033875    Weak Match
6          P2  NCT002_exc.txt    0.029460    Weak Match
7          P2  NCT002_inc.txt    0.032021    Weak Match
8          P3  NCT001_exc.txt    0.020111    Weak Match
9          P3  NCT001_inc.txt    0.019498    Weak Match
10         P3  NCT002_exc.txt    0.018222    Weak Match
11         P3  NCT002_inc.txt    0.017672    Weak Match
12         P4  NCT001_exc.txt    0.037511    Weak Match
13         P4  NCT001_inc.txt    0.039327    Weak Match
14         P4  NCT002_exc.txt    0.035341    Weak Match
15         P4  NCT002_inc.txt    0.037453    Weak Match
16         P5  NCT001_

In [19]:
# ✅ Step 1: Import Libraries
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# ✅ Step 2: Load Models and Tokenizers
# Paths to Models (Use your Colab path here)
model_path_biobert = '/content/drive/MyDrive/biobert_chia_model'
model_path_clinical = '/content/drive/MyDrive/clinical_trial_ner_model'

# Load BioBERT
tokenizer_biobert = BertTokenizer.from_pretrained(model_path_biobert)
model_biobert = BertModel.from_pretrained(model_path_biobert)

# Load Clinical NER model
tokenizer_clinical = BertTokenizer.from_pretrained(model_path_clinical)
model_clinical = BertModel.from_pretrained(model_path_clinical)

# ✅ Step 3: Sample Data Generation
# New clinical trial data (similar to `chai_with_scope`)
trial_data = [
    "NCT003_exc.txt\tPatients with active tuberculosis or other infectious diseases.",
    "NCT003_exc.txt\tUse of immunosuppressants in the last 12 months.",
    "NCT003_inc.txt\tParticipants with Type 2 diabetes, aged 35-70 years.",
    "NCT003_inc.txt\tPatients with controlled hypertension and BMI below 28.",
    "NCT004_exc.txt\tHistory of severe allergic reactions to investigational drugs.",
    "NCT004_exc.txt\tUse of opioids for chronic pain management.",
    "NCT004_inc.txt\tParticipants aged 50 or older, diagnosed with osteoarthritis.",
    "NCT004_inc.txt\tPatients with stable cardiovascular conditions."
]

# New patient EHR data
patient_data = [
    "P1\tPatient with Type 2 diabetes, 55 years old, controlled hypertension.",
    "P2\tPatient with osteoarthritis, aged 60, stable cardiovascular condition.",
    "P3\tPatient with active tuberculosis, currently on immunosuppressants.",
    "P4\tPatient with no severe allergic reactions, history of opioid use for chronic pain.",
    "P5\tPatient aged 45, BMI below 28, no cardiovascular issues."
]

# ✅ Step 4: Functions for Encoding and Similarity Calculation
# Function to encode text into BERT embeddings
def encode_text(text, tokenizer, model):
    """Encodes text into BERT embeddings."""
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()
    except Exception as e:
        print(f"⚠️ Error encoding text: {text[:30]}... => {e}")
        return None

# ✅ Step 5: Generate Embeddings
# Load trial data into dictionary with embeddings
trial_embeddings = {}

for line in trial_data:
    trial_id, text = line.strip().split('\t')
    embedding = encode_text(text, tokenizer_clinical, model_clinical)
    if embedding is not None:
        trial_embeddings[trial_id] = embedding

# Load patient EHR data into dictionary with embeddings
patient_embeddings = {}

for line in patient_data:
    patient_id, text = line.strip().split('\t')
    embedding = encode_text(text, tokenizer_biobert, model_biobert)
    if embedding is not None:
        patient_embeddings[patient_id] = embedding

# ✅ Step 6: Calculate Similarity Between Patients and Trials
results = []

for patient_id, patient_embedding in patient_embeddings.items():
    for trial_id, trial_embedding in trial_embeddings.items():
        # Calculate cosine similarity
        similarity = cosine_similarity(patient_embedding, trial_embedding)[0][0]
        overall_match = 'Strong Match' if similarity > 0.75 else 'Weak Match'
        results.append([patient_id, trial_id, similarity, overall_match])

# ✅ Step 7: Display the Results
if results:
    results_df = pd.DataFrame(results, columns=['Patient ID', 'Trial ID', 'Similarity', 'Overall Match'])

    print("\n✅ Results Summary Table:")
    print(results_df)
else:
    print("\n⚠️ No valid patient-trial pairs found. Try with more synthetic or cleaned real data.")

# ✅ Step 8: Model Evaluation with Robust Logic
biobert_scores = []
clinical_scores = []

# Categorize scores based on inclusion/exclusion criteria patterns
for result in results:
    trial_id = result[1].lower()
    similarity = result[2]

    if 'exc' in trial_id or 'exclude' in trial_id:
        biobert_scores.append(similarity)
    elif 'inc' in trial_id or 'include' in trial_id:
        clinical_scores.append(similarity)
    else:
        biobert_scores.append(similarity)
        clinical_scores.append(similarity)

# ✅ Display Model Performance Comparison
if biobert_scores and clinical_scores:
    mean_biobert = np.mean(biobert_scores)
    mean_clinical = np.mean(clinical_scores)

    print("\n✅ Model Performance Comparison:")
    print(f"BioBERT Mean Similarity: {mean_biobert:.4f}")
    print(f"Clinical NER Mean Similarity: {mean_clinical:.4f}")

    if mean_biobert > mean_clinical:
        print("\n✅ BioBERT model is better for exclusion criteria.")
    else:
        print("\n✅ Clinical trial NER model is better for inclusion criteria.")
else:
    avg_similarity = np.mean([result[2] for result in results])
    print(f"\n✅ Average similarity across all trials: {avg_similarity:.4f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/biobert_chia_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/clinical_trial_ner_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Results Summary Table:
   Patient ID        Trial ID  Similarity Overall Match
0          P1  NCT003_exc.txt    0.024683    Weak Match
1          P1  NCT003_inc.txt    0.025385    Weak Match
2          P1  NCT004_exc.txt    0.021008    Weak Match
3          P1  NCT004_inc.txt    0.024696    Weak Match
4          P2  NCT003_exc.txt    0.013199    Weak Match
5          P2  NCT003_inc.txt    0.013903    Weak Match
6          P2  NCT004_exc.txt    0.010074    Weak Match
7          P2  NCT004_inc.txt    0.012152    Weak Match
8          P3  NCT003_exc.txt    0.035938    Weak Match
9          P3  NCT003_inc.txt    0.035167    Weak Match
10         P3  NCT004_exc.txt    0.033137    Weak Match
11         P3  NCT004_inc.txt    0.038494    Weak Match
12         P4  NCT003_exc.txt    0.015358    Weak Match
13         P4  NCT003_inc.txt    0.015333    Weak Match
14         P4  NCT004_exc.txt    0.013614    Weak Match
15         P4  NCT004_inc.txt    0.017356    Weak Match
16         P5  NCT003_