# Search Engine for Clinical Trials

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
from pathlib import Path

import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords # used for preprocessing
from nltk.stem import WordNetLemmatizer # used for preprocessing
from sklearn.model_selection import train_test_split
# Fetch the NLTK resources this notebook relies on (each call is a no-op when
# the package is already up to date, as the cell output shows):
#   stopwords - English stop-word list filtered out during preprocessing
#   wordnet   - corpus presumably backing WordNetLemmatizer
#   punkt     - tokenizer model presumably backing word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# stored on Drive can be read through an ordinary file path.
# NOTE(review): google.colab is presumably only importable inside Google Colab;
# running this notebook elsewhere needs a different way to locate the data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Location of the COVID-19 clinical-trials CSV on the mounted Google Drive
filepath = '/content/drive/MyDrive/clinical-trail-search-engine/Data/Covid19_clinical_trials.csv'

# Parse the CSV into a DataFrame using pandas' default options
trial_data = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows
trial_data.head(n=5)

Unnamed: 0,Date added,Trial ID,Title,Brief title,Acronym,Abstract,Publication date,Active years,Phase,Conditions,...,Gender,Registry,Investigators/Contacts,Sponsors/Collaborators,GRID IDs,Country of Sponsor/Collaborator,Collaborating Funders,Funder Country,Source Linkout,Dimensions URL
0,2021-09-01,NCT05029245,"The 8-week, Prospective, Randomized Controlled...",IntraDermal Versus Intramuscular Comirnaty® Ef...,PRIDE,"The 8-week, Prospective, Randomized controlled...",2021-08-31,2021; 2022,Phase 3,Covid19 Vaccine; Covid19,...,All,ClinicalTrials.gov,,Rajavithi Hospital,grid.415633.6,Thailand,,,https://clinicaltrials.gov/show/NCT05029245,https://app.dimensions.ai/details/clinical_tri...
1,2021-09-01,NCT05029037,High-dose Intravenous Vitamin C (HDIVC) as Adj...,High-dose Intravenous Vitamin C (HDIVC) as Adj...,HDIVC,The objective of this study is to evaluate the...,2021-09-15,2021; 2022,Phase 3,Covid19,...,All,ClinicalTrials.gov,,,,,,,https://clinicaltrials.gov/show/NCT05029037,https://app.dimensions.ai/details/clinical_tri...
2,2021-09-01,NCT05029011,Low-cost Sensor System for COVID-19 Patient Mo...,Low-cost Sensor System for COVID-19 Patient Mo...,,"The MediByte VTS is a low-cost, portable devic...",2021-09-01,2021; 2022,,The Focus is How Well the MediByte VTS Will Mo...,...,All,ClinicalTrials.gov,Helen Driver,Queen's University; National Research Council ...,grid.410356.5; grid.24433.32,Canada; Canada,National Research Council Canada,Canada,https://clinicaltrials.gov/show/NCT05029011,https://app.dimensions.ai/details/clinical_tri...
3,2021-09-01,NCT05028998,The Impact of COVID-19-related Medication Assi...,Covid-related Opioid Treatment Policy Evaluation,COPE,Our nation is facing the COVID-19 pandemic dur...,2021-09-01,2021; 2022; 2023,,Opioid-use Disorder; Alcohol Use Disorder,...,All,ClinicalTrials.gov,,Boston University; Patient-Centered Outcomes R...,grid.189504.1; grid.430109.f; grid.417499.6; g...,United States; United States; United States; U...,Patient-Centered Outcomes Research Institute; ...,United States; United States,https://clinicaltrials.gov/show/NCT05028998,https://app.dimensions.ai/details/clinical_tri...
4,2021-09-01,NCT05028881,Serological Responses to SARS-CoV-2 and Their ...,Hong Kong HIV SARS-CoV-2 Serology,,Immunodeficiency associated with human immunod...,2020-05-16,2020; 2021; 2022; 2023,,HIV Infections; SARS-CoV-2 Infection; Vaccinat...,...,All,ClinicalTrials.gov,Shui Shan Lee,Chinese University of Hong Kong,grid.10784.3a,China,,,https://clinicaltrials.gov/show/NCT05028881,https://app.dimensions.ai/details/clinical_tri...


In [24]:
# Dimensions of the frame as (rows, columns)
display(trial_data.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
trial_data.info()

(12129, 21)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12129 entries, 0 to 12128
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Date added                       12129 non-null  object
 1   Trial ID                         12129 non-null  object
 2   Title                            12129 non-null  object
 3   Brief title                      11696 non-null  object
 4   Acronym                          4051 non-null   object
 5   Abstract                         12127 non-null  object
 6   Publication date                 12129 non-null  object
 7   Active years                     11061 non-null  object
 8   Phase                            4719 non-null   object
 9   Conditions                       11911 non-null  object
 10  Intervention                     8947 non-null   object
 11  Gender                           12129 non-null  object
 12  Registry                        

None

In [34]:
# Show the raw abstract text of the row labelled 0, before any preprocessing
str(trial_data.loc[0, 'Abstract'])

'The 8-week, Prospective, Randomized controlled of IntraDermal administration of Comirnaty® 6 microgram compare to Intramuscular Comirnaty® 30 microgram by 28 days interval Efficacy Study in 4 groups of healthy volunteer ( 1 people who complete sinovac vaccination 2 people who received 1 dosage of AstraZeneca vaccine 3 naive vaccination 4 any other vaccination not in 1-3 with anti Spike antibody less than 650 AU/ ml) . Comparison of antibody level and T cell response to SAR-CoV-2 antigen in vitro after 28 day post vaccination is primary outcome and the side effect as well as infection rate in 8 weeks is secondary outcomes.\n\nDetailed Description\nThe 8-week, Prospective, Randomized controlled of IntraDermal administration of Comirnaty® 6 microgram compare to Intramuscular Comirnaty® 30 microgram by 28 days interval Efficacy Study in healthy volunteer.To compare the AntiSpike antibody, ( Anti RBD ) neutralized antibody ( if possible) of SAR-CoV-2 and T-cell response after injection wit

### Data Preprocessing

In [68]:
import re
import string

# Single WordNet lemmatizer instance shared by every lemmatize() call
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Lemmatize each element of an iterable of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize, in order.

    Returns
    -------
    list of str
        The result of ``lemmatizer.lemmatize`` for each token, in input order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Matches, tried in this order at each position: an @mention, any single
# character that is not an ASCII letter/digit/space/tab (this includes
# newlines and punctuation), or a scheme://... URL. Written as a raw string so
# the regex escapes (\w, \/, \S) are not treated as invalid string escapes.
_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
# Matches runs of digits so numbers can be stripped from the text.
_DIGITS_RE = re.compile(r"\d+")
# English stop words; filled on first use by _english_stopwords().
_ENGLISH_STOPWORDS = None


def _english_stopwords():
  """Return the NLTK English stop words as a set, loading the list only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


#function for data preprocessing of texts
def pre_processing(text):
  """Normalize a piece of free text into a cleaned, space-joined token string.

  Steps, in order: coerce to ``str`` and lowercase; replace @mentions, URLs and
  every character outside ``[0-9A-Za-z \\t]`` with a space and collapse
  whitespace; delete digits; tokenize with ``word_tokenize``; drop tokens found
  in ``string.punctuation`` or in the English stop-word list; lemmatize the
  remaining tokens with ``lemmatize``; join them with single spaces.

  Parameters
  ----------
  text : object
      Text to clean. It is converted with ``str()`` before any string method is
      used, so non-string values no longer fail on ``.lower()``.

  Returns
  -------
  str
      The cleaned tokens joined by single spaces (may be empty).
  """
  text = str(text).lower()
  # Newlines are outside the allowed character class, so this substitution
  # together with split()/join already turns them into single spaces.
  text = ' '.join(_NOISE_RE.sub(' ', text).split())
  text = _DIGITS_RE.sub('', text) #remove numbers
  tokens = word_tokenize(text)
  # Set lookup instead of re-reading the stop-word list for every token.
  stop_words = _english_stopwords()
  tokens = [
      token for token in tokens
      if token not in string.punctuation and token not in stop_words
  ]
  return ' '.join(lemmatize(tokens))

In [69]:
#Applying preprocessing for Abstract and Title columns

# Each column is cleaned as a whole and assigned back in one step. The previous
# element-wise `trial_data[col][i] = ...` was chained assignment, which is not
# guaranteed to write through to the frame (and does not under pandas
# Copy-on-Write); it also assumed the index labels were exactly 0..n-1.
for column in ('Abstract', 'Title'):
  trial_data[column] = (
      trial_data[column]
      .fillna('')            # missing text becomes an empty string
      .astype(str)           # make sure every value is a str before cleaning
      .map(pre_processing)
  )

trial_data.head(5)

Unnamed: 0,Date added,Trial ID,Title,Brief title,Acronym,Abstract,Publication date,Active years,Phase,Conditions,...,Gender,Registry,Investigators/Contacts,Sponsors/Collaborators,GRID IDs,Country of Sponsor/Collaborator,Collaborating Funders,Funder Country,Source Linkout,Dimensions URL
0,2021-09-01,NCT05029245,week prospective randomized controlled intrade...,IntraDermal Versus Intramuscular Comirnaty® Ef...,PRIDE,week prospective randomized controlled intrade...,2021-08-31,2021; 2022,Phase 3,Covid19 Vaccine; Covid19,...,All,ClinicalTrials.gov,,Rajavithi Hospital,grid.415633.6,Thailand,,,https://clinicaltrials.gov/show/NCT05029245,https://app.dimensions.ai/details/clinical_tri...
1,2021-09-01,NCT05029037,high dose intravenous vitamin c hdivc adjuvant...,High-dose Intravenous Vitamin C (HDIVC) as Adj...,HDIVC,objective study evaluate impact hdivc therapy ...,2021-09-15,2021; 2022,Phase 3,Covid19,...,All,ClinicalTrials.gov,,,,,,,https://clinicaltrials.gov/show/NCT05029037,https://app.dimensions.ai/details/clinical_tri...
2,2021-09-01,NCT05029011,low cost sensor system covid patient monitorin...,Low-cost Sensor System for COVID-19 Patient Mo...,,medibyte vt low cost portable device developed...,2021-09-01,2021; 2022,,The Focus is How Well the MediByte VTS Will Mo...,...,All,ClinicalTrials.gov,Helen Driver,Queen's University; National Research Council ...,grid.410356.5; grid.24433.32,Canada; Canada,National Research Council Canada,Canada,https://clinicaltrials.gov/show/NCT05029011,https://app.dimensions.ai/details/clinical_tri...
3,2021-09-01,NCT05028998,impact covid related medication assisted treat...,Covid-related Opioid Treatment Policy Evaluation,COPE,nation facing covid pandemic ongoing opioid ep...,2021-09-01,2021; 2022; 2023,,Opioid-use Disorder; Alcohol Use Disorder,...,All,ClinicalTrials.gov,,Boston University; Patient-Centered Outcomes R...,grid.189504.1; grid.430109.f; grid.417499.6; g...,United States; United States; United States; U...,Patient-Centered Outcomes Research Institute; ...,United States; United States,https://clinicaltrials.gov/show/NCT05028998,https://app.dimensions.ai/details/clinical_tri...
4,2021-09-01,NCT05028881,serological response sars cov temporal pattern...,Hong Kong HIV SARS-CoV-2 Serology,,immunodeficiency associated human immunodefici...,2020-05-16,2020; 2021; 2022; 2023,,HIV Infections; SARS-CoV-2 Infection; Vaccinat...,...,All,ClinicalTrials.gov,Shui Shan Lee,Chinese University of Hong Kong,grid.10784.3a,China,,,https://clinicaltrials.gov/show/NCT05028881,https://app.dimensions.ai/details/clinical_tri...


In [None]:
# Split every abstract in the Abstract column into its list of word tokens
tokens = list(map(word_tokenize, trial_data['Abstract']))