<a href="https://colab.research.google.com/github/S7mitbarua/Clinical-Trial-Matching-System/blob/main/Clinical_Trial_Matching_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from faker import Faker  # Install using !pip install faker

# Fetch every NLTK resource this notebook uses. The stopword list alone is not
# enough: preprocess_text calls nltk.word_tokenize, which depends on the Punkt
# tokenizer data and raises LookupError on a fresh runtime without it.
# NOTE(review): recent NLTK releases look up 'punkt_tab' while older ones use
# 'punkt'; both are requested so either version finds its data. A resource
# unknown to the installed version is expected to be reported by
# nltk.download, not raised -- confirm against the pinned NLTK version.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Step 3: Simulate Clinical Trials Data
fake = Faker()
# Seed BOTH random sources. np.random.seed only affects the NumPy draws below;
# Faker uses its own generator, so without Faker.seed the generated titles,
# eligibility text, names and ages would differ on every run.
Faker.seed(42)
np.random.seed(42)

num_trials = 100
# One row per synthetic trial: a sequential id, a random sentence as the
# title, and a random paragraph standing in for the eligibility criteria.
clinical_trials_data = pd.DataFrame({
    'trial_id': range(1, num_trials + 1),
    'title': [fake.sentence() for _ in range(num_trials)],
    'eligibility_criteria': [fake.paragraph() for _ in range(num_trials)]
})

# Step 4: Simulate Hospital Data
num_hospitals = 5

# Generate the names first so the table construction below stays declarative.
hospital_names = [f"{fake.company()} Hospital" for _ in range(num_hospitals)]

# Every simulated hospital is placed in the same city.
hospital_data = pd.DataFrame({
    'hospital_id': range(1, num_hospitals + 1),
    'hospital_name': hospital_names,
    'location': ['Boston' for _ in range(num_hospitals)],
})

# Step 5: Simulate Patient Data with Various Health Conditions
num_patients = 50
health_conditions = [
    'Heart Disease',
    'Diabetes',
    'Asthma',
    'Hypertension',
    'Arthritis',
    'Cancer',
    'Migraine',
    'Depression',
]

# The columns are drawn in a fixed order (names, ages, conditions, hospitals)
# so the sequence of random draws is the same as building them inline.
patient_names = [fake.name() for _ in range(num_patients)]
patient_ages = [fake.random_int(18, 80) for _ in range(num_patients)]
patient_conditions = np.random.choice(health_conditions, size=num_patients)
# Each patient is assigned to one of the simulated hospitals at random.
patient_hospital_ids = np.random.choice(hospital_data['hospital_id'], size=num_patients)

patient_data = pd.DataFrame({
    'patient_id': range(1, num_patients + 1),
    'patient_name': patient_names,
    'age': patient_ages,
    'health_condition': patient_conditions,
    'hospital_id': patient_hospital_ids,
})

# Step 6: Preprocess and Tokenize Text
# Built once as a set so the per-token membership test in preprocess_text is cheap.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalize free text into a space-separated string of content words.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric (e.g. punctuation) or that appear in
    ``stop_words`` are dropped. The surviving tokens are joined with single
    spaces, in their original order.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Clean every trial's eligibility text and keep it alongside the raw column.
criteria_text = clinical_trials_data['eligibility_criteria']
clinical_trials_data['processed_text'] = criteria_text.apply(preprocess_text)

# Step 7: Vectorize Text Using TF-IDF
# The vectorizer is fitted on the trial corpus only; patient profiles are
# later projected into this same vocabulary with vectorizer.transform.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(clinical_trials_data['processed_text'])

# Step 8: Match Patients with Clinical Trials
def match_patient_with_trials(patient_profile, tfidf_matrix, vectorizer, top_n=5, trials=None):
    """Return the trials whose processed text is most similar to a patient profile.

    The profile is cleaned with ``preprocess_text``, projected into the TF-IDF
    space of ``vectorizer``, and compared to every row of ``tfidf_matrix`` by
    cosine similarity. Rows are ranked purely by similarity, so trials with a
    similarity of zero can still appear in the result when fewer than
    ``top_n`` trials share any vocabulary with the profile.

    Args:
        patient_profile: Free-text description of the patient.
        tfidf_matrix: TF-IDF matrix with one row per trial, in the same row
            order as ``trials``.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.
        top_n: Maximum number of trials to return (default 5).
        trials: DataFrame of trials aligned row-for-row with ``tfidf_matrix``.
            Defaults to the module-level ``clinical_trials_data`` so existing
            three-argument calls keep working.

    Returns:
        The selected rows of ``trials``, most similar first.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if trials is None:
        trials = clinical_trials_data

    processed_patient_profile = preprocess_text(patient_profile)
    patient_vector = vectorizer.transform([processed_patient_profile])
    similarities = cosine_similarity(patient_vector, tfidf_matrix).flatten()
    # Reverse the ascending argsort and take the head. This selects the same
    # rows in the same order as slicing the tail and reversing, but also
    # behaves for top_n == 0 (a tail slice of [-0:] would return every row).
    top_indices = similarities.argsort()[::-1][:top_n]
    return trials.iloc[top_indices]

# Step 9: Example Usage
patient_profile = "Patient with chronic health condition seeking relevant clinical trial."
matching_trials = match_patient_with_trials(patient_profile, tfidf_matrix, vectorizer)

# Show only the human-readable columns of the matched trials.
display_columns = ['trial_id', 'title', 'eligibility_criteria']
print("Matching Trials:")
print(matching_trials[display_columns])

# Dump the simulated reference tables, each under its own heading.
for heading, table in (
    ("\nPatient Data:", patient_data),
    ("\nHospital Data:", hospital_data),
):
    print(heading)
    print(table)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Matching Trials:
    trial_id                                              title  \
2          3                                  Rich apply court.   
88        89                        Certainly certainly friend.   
63        64                           Film night remain movie.   
80        81        Art or skin red sometimes attack structure.   
24        25  Professor everything experience operation othe...   

                                 eligibility_criteria  
2   History their might Congress degree enough. Go...  
88                          Rich health reality trip.  
63    Station unit outside his health exist possible.  
80  Condition particular simple rule. Growth a any...  
24  Result direction reach she total question quit...  

Patient Data:
    patient_id         patient_name  age health_condition  hospital_id
0            1         Amanda Smith   71         Migraine            3
1            2         Scott Thomas   44     Hypertension            4
2            3  