## Importing Genomic database

In [369]:
import pandas as pd

import os
from pathlib import Path

# Directory holding the exported CSVs. Override with the HACKATHON_DATA_DIR
# environment variable; the fallback is the location originally hard-coded here.
DATA_DIR = Path(os.environ.get('HACKATHON_DATA_DIR', 'C:/Users/Owner/Desktop/Hackathon'))

df1 = pd.read_csv(DATA_DIR / 'clinvar_result.csv')
df2 = pd.read_csv(DATA_DIR / 'htt.csv')

# Keep every column except those whose name starts with 'Unnamed'
# (presumably index columns written out by an earlier CSV export -- TODO confirm).
clinvar = df1.loc[:, ~df1.columns.str.startswith('Unnamed')]
genomad = df2.loc[:, ~df2.columns.str.startswith('Unnamed')]


In [370]:
# Inner join on VariationID: only rows whose VariationID appears in both
# tables survive into the combined variant table.
variant_db = clinvar.merge(genomad, how='inner', on='VariationID')
variant_db.head()

Unnamed: 0,VariationID,AlleleID(s),Germline classification,Chromosome,Position,Reference,Alternate,Allele Frequency
0,2654593,2822358,Likely benign,4,3074876,C,CCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,0.001744
1,1690808,1683216,Benign,4,3074876,C,CCAGCAGCAGCAG,0.011837
2,1687507,1679800,Likely pathogenic,4,3074876,C,CCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG...,3.5e-05
3,1301634,1291917,Uncertain significance,4,3074876,C,CCAG,0.059586
4,1174947,1164242,Benign/Likely benign,4,3074876,C,CCAGCAGCAGCAGCAG,0.00895


## Simulating patient Gene data

In [371]:
import random

def simulate_htt_variants(num_variants=None):
    """Simulates single-nucleotide variants within the HTT gene region.

    Args:
        num_variants: Number of variants to generate. When omitted (``None``),
            a count between 1 and 3 (inclusive) is drawn on every call.

    Returns:
        A list of dicts, each with the keys "Chromosome" (always "4"),
        "Position", "Reference" and "Alternate".
    """
    # Draw the count at call time. A default written as `random.randint(1, 3)`
    # in the signature is evaluated once, when the function is defined, so
    # every default call would silently share the same count.
    if num_variants is None:
        num_variants = random.randint(1, 3)

    htt_gene_start = 3042167 # range of positions found in variants on gnomAD
    htt_gene_end = 3240131
    bases = ["A", "C", "G", "T"]

    variants = []
    for _ in range(num_variants):
        position = random.randint(htt_gene_start, htt_gene_end)
        reference = random.choice(bases)
        # Pick the alternate among the three remaining bases so it always
        # differs from the reference.
        alternate = random.choice([base for base in bases if base != reference])
        variants.append({
            "Chromosome": "4",
            "Position": position,
            "Reference": reference,
            "Alternate": alternate,
        })
    return variants

## Simulating general patient data

In [372]:
import pandas as pd
import random
import numpy as np

def simulate_hd_patients(num_patients=10):
    """Build a DataFrame of randomly generated Huntington's Disease patients.

    One row is produced per patient. Columns are generated in the order they
    appear in the frame, so the sequence of random draws is fixed for a given
    RNG state.

    Args:
        num_patients: Number of patient rows to generate.

    Returns:
        A pandas DataFrame with the columns patientID, CAG_Repeats, Chorea,
        Cognitive_Decline, Psychiatric_Symptoms, Family_history, Age, Sex,
        Travel_Preference, PatientCity and Variants.
    """
    severity_scale = ["Mild", "Moderate", "Severe"]
    city_pool = ["New York", "Los Angeles", "Chicago", "Florida", "San Jose"]

    def per_patient(sampler):
        # One independent draw per patient.
        return [sampler() for _ in range(num_patients)]

    def floored_gauss(mean, spread, floor):
        # Normal draw clipped from below, then truncated to an int.
        return int(max(random.gauss(mean, spread), floor))

    def pick_severity():
        return random.choice(severity_scale)

    columns = {
        "patientID": [f"HD-P{number:03d}" for number in range(1, num_patients + 1)],
        "CAG_Repeats": per_patient(lambda: floored_gauss(40, 10, 20)),
        "Chorea": per_patient(pick_severity),
        "Cognitive_Decline": per_patient(pick_severity),
        "Psychiatric_Symptoms": per_patient(pick_severity),
        # 1 is drawn with probability 0.7, 0 with probability 0.3.
        "Family_history": per_patient(lambda: np.random.choice([0, 1], p=[0.3, 0.7])),
        "Age": per_patient(lambda: floored_gauss(50, 15, 15)),
        "Sex": per_patient(lambda: random.choice(["M", "F"])),
        "Travel_Preference": per_patient(lambda: random.choice(["Yes", "No"])),
        "PatientCity": per_patient(lambda: random.choice(city_pool)),
        "Variants": per_patient(lambda: simulate_htt_variants()),
    }

    return pd.DataFrame(columns)

# Example usage: simulate ten patients and print the first rows in full,
# without truncating the wide "Variants" column.
patients_df = simulate_hd_patients(num_patients=10)
print(
    patients_df
    .head()
    .to_string(max_colwidth=None)
)

  patientID  CAG_Repeats    Chorea Cognitive_Decline Psychiatric_Symptoms  Family_history  Age Sex Travel_Preference  PatientCity                                                                                                                                                      Variants
0   HD-P001           31      Mild              Mild             Moderate               1   48   M                No      Florida  [{'Chromosome': '4', 'Position': 3180968, 'Reference': 'C', 'Alternate': 'G'}, {'Chromosome': '4', 'Position': 3110845, 'Reference': 'T', 'Alternate': 'A'}]
1   HD-P002           33    Severe              Mild             Moderate               1   30   F               Yes  Los Angeles  [{'Chromosome': '4', 'Position': 3085746, 'Reference': 'A', 'Alternate': 'G'}, {'Chromosome': '4', 'Position': 3113915, 'Reference': 'G', 'Alternate': 'T'}]
2   HD-P003           49      Mild            Severe                 Mild               1   35   M               Yes     San Jose  [{'Ch

## Example trial

In [None]:
# Example trial definitions consumed by match_patients_to_trials.
# The eligibility text is built from adjacent string literals, which Python
# joins with no separator -- each fragment therefore ends with an explicit
# space so that sentences do not run together in the final string.
trials = [
    {
        "NCTId": "NCT01234567",
        "BriefTitle": "Huntington's Disease Trial 1",
        "EligibilityCriteria": "Inclusion Criteria: Confirmed diagnosis of Huntington's disease via CAG-repeat length analysis, "
                               "Higher CAG_repeats will lead to higher probability of having the disease. "
                               "Patient must be between 18 and 74 years of age. Mild to moderate chorea, "
                               "exclusion criteria: No severe psychiatric disorders. No depression.",
        "StudyStatus": "Recruiting",
        "TrialCity": "New York",
        "Travel_Required": "Yes" #Trial travel requirement.
    }
]

## Defining the eligibility score

In [374]:
import pandas as pd
from transformers import pipeline
import os

def calculate_hd_clinical_match(patient):
    """Return a clinical match score for one patient, starting from 1.0.

    Two penalties are applied independently:
      * 0.4 is subtracted when "CAG_Repeats" is below 36;
      * 0.6 is subtracted when "Age" is below 18 or above 74.

    Args:
        patient: Mapping providing the "CAG_Repeats" and "Age" values.

    Returns:
        The score after any penalties have been subtracted.
    """
    repeats, age = patient["CAG_Repeats"], patient["Age"]

    score = 1.0
    # Penalties are subtracted one at a time, repeat count first.
    if repeats < 36:
        score -= 0.4
    outside_age_window = age < 18 or age > 74
    if outside_age_window:
        score -= 0.6
    return score

def calculate_variant_novelty(variant, variant_db):
    """Score how novel a variant is relative to the reference variant table.

    The variant is looked up by exact chromosome, position, reference and
    alternate allele. When several rows match, only the first is used.

    Scoring:
      * no matching row                          -> 1.0, "Not Found"
      * classification "Uncertain significance"  -> +0.5
      * allele frequency below 0.0001            -> +0.5
    The result is capped at 1.0.

    Args:
        variant: Mapping with the keys "Chromosome", "Position", "Reference"
            and "Alternate".
        variant_db: DataFrame with those four columns plus
            "Germline classification" and "Allele Frequency".

    Returns:
        A ``(novelty_score, clinvar_classification)`` tuple.
    """
    # Compare chromosomes as strings: simulated variants carry "4" (str) while
    # a chromosome column parsed from CSV is numeric. A str-vs-int comparison
    # is False for every row, which would report every variant as "Not Found".
    same_chromosome = variant_db["Chromosome"].astype(str) == str(variant["Chromosome"])

    matching_variants = variant_db[
        same_chromosome &
        (variant_db["Position"] == variant["Position"]) &
        (variant_db["Reference"] == variant["Reference"]) &
        (variant_db["Alternate"] == variant["Alternate"])
    ]

    if matching_variants.empty:
        # Variant not found is a score of 1.0.
        return 1.0, "Not Found"

    clinvar_classification = matching_variants["Germline classification"].iloc[0]
    gnomad_frequency = matching_variants["Allele Frequency"].iloc[0]

    novelty_score = 0.0
    if clinvar_classification == "Uncertain significance":
        novelty_score += 0.5
    # A missing (NaN) frequency compares False here and so adds nothing.
    if gnomad_frequency is not None and gnomad_frequency < 0.0001:
        novelty_score += 0.5

    return min(novelty_score, 1.0), clinvar_classification

def match_patients_to_trials(patients_df, trials, variant_db):
    """Score every patient against every trial.

    For each patient the mean variant novelty is computed first, then, per
    trial, three components are combined:

        final = 0.4 * clinical_match + 0.3 * model "Yes" score + 0.3 * novelty

    The result is multiplied by 0.2 when the patient lives in a different
    city than the trial, the trial requires travel ("Yes") and the patient's
    travel preference is "No".

    Side effect: the columns "variant_novelty_score" and
    "clinvar_classification" are added to (or overwritten on) ``patients_df``
    in place.

    Args:
        patients_df: DataFrame with at least the columns "patientID",
            "Variants", "CAG_Repeats" and "Age"; "PatientCity" and
            "Travel_Preference" fall back to "Unknown" when absent.
        trials: Iterable of dicts with the keys "NCTId",
            "EligibilityCriteria", "TrialCity" and "Travel_Required".
        variant_db: Reference variant table passed through to
            ``calculate_variant_novelty``.

    Returns:
        Dict mapping each trial's NCTId to a list of per-patient dicts with
        the keys "patientID", "score", "clinical", "novelty", "clinvar" and
        "reasoning" (the raw classifier output).
    """
    ranker = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device='cpu') #or use another model.
    results = {}

    # Calculate variant novelty scores. Values are collected into lists and
    # assigned as whole columns afterwards, so the frame is not modified
    # while it is being iterated.
    novelty_scores = []
    classification_labels = []
    for _, patient in patients_df.iterrows():
        patient_variants = patient["Variants"]
        total_novelty_score = 0.0
        clinvar_classifications = []
        for variant in patient_variants:
            novelty_score, clinvar_classification = calculate_variant_novelty(variant, variant_db)
            total_novelty_score += novelty_score
            clinvar_classifications.append(clinvar_classification)
        # A patient with no variants has nothing novel to report; guarding
        # here avoids a ZeroDivisionError on the mean.
        if len(patient_variants) > 0:
            novelty_scores.append(total_novelty_score / len(patient_variants))
        else:
            novelty_scores.append(0.0)
        classification_labels.append(", ".join(clinvar_classifications))
    patients_df["variant_novelty_score"] = novelty_scores
    patients_df["clinvar_classification"] = classification_labels

    for trial in trials:
        trial_id = trial["NCTId"]
        criteria = trial["EligibilityCriteria"]
        trial_city = trial["TrialCity"]
        trial_travel = trial["Travel_Required"]
        results[trial_id] = []

        for _, patient in patients_df.iterrows():
            patient_data = patient.to_dict()
            clinical_match = calculate_hd_clinical_match(patient_data)
            patient_city = patient_data.get("PatientCity", "Unknown")
            patient_travel = patient_data.get("Travel_Preference", "Unknown")
            variant_novelty_score = patient["variant_novelty_score"]
            clinvar_classification = patient["clinvar_classification"]

            prompt = f"""
            Patient Data: {patient_data}.
            Variant Novelty Score: {variant_novelty_score}.
            ClinVar Classification: {clinvar_classification}.
            Trial Eligibility Criteria: {criteria}.
            Trial Location: {trial_city}.
            Travel Required: {trial_travel}.

            Instructions:
            1. Analyze the patient data against each eligibility criterion.
            2. Consider the patient's variant novelty and ClinVar classification.
            3. Consider the patient's location and travel preference.
            4. Determine if the patient is a suitable candidate for the trial.
            """
            # The classifier scores the whole prompt against the labels
            # "Yes"/"No"; the score attached to "Yes" is the model component.
            model_output = ranker(prompt, candidate_labels=["Yes", "No"])
            yes_score = model_output["scores"][model_output["labels"].index("Yes")]

            final_score = (0.4 * clinical_match) + (0.3 * yes_score) + (0.3 * variant_novelty_score)

            # Heavy penalty when the patient would have to travel but will not.
            if patient_city != trial_city and trial_travel == "Yes" and patient_travel == "No":
                final_score *= 0.2

            results[trial_id].append({
                "patientID": patient["patientID"],
                "score": final_score,
                "clinical": clinical_match,
                "novelty": variant_novelty_score,
                "clinvar": clinvar_classification,
                "reasoning": model_output
            })

    return results

In [375]:
# Score every simulated patient against every example trial.
matching_results = match_patients_to_trials(
    patients_df=patients_df,
    trials=trials,
    variant_db=variant_db,
)

Device set to use cpu


## Calculating the eligibility score

In [376]:
# 5. Display Results (Simplified)
# Each trial's list is sorted in place, highest score first, then printed
# one patient per line with the score rounded to two decimals.
for trial_id, patient_scores in matching_results.items():
    print(f"\nTrial: {trial_id}")
    patient_scores.sort(reverse=True, key=lambda entry: entry["score"])
    for entry in patient_scores:
        print(f"  Patient: {entry['patientID']}, Score: {entry['score']:.2f}")


Trial: NCT01234567
  Patient: HD-P008, Score: 0.82
  Patient: HD-P009, Score: 0.82
  Patient: HD-P003, Score: 0.82
  Patient: HD-P005, Score: 0.67
  Patient: HD-P002, Score: 0.66
  Patient: HD-P004, Score: 0.66
  Patient: HD-P010, Score: 0.66
  Patient: HD-P006, Score: 0.15
  Patient: HD-P007, Score: 0.15
  Patient: HD-P001, Score: 0.12
