## Goals of Notebook

* Combine relevant text columns into new column entitled "search_terms"
* Tokenize search terms so they can later be ranked in our search engine

In [23]:
import pandas as pd
import spacy
import re
from collections import OrderedDict

In [24]:

!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [25]:
# Import cleaned csv
df = pd.read_csv('../clinical_trial.csv')

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,agency,brief_title,brief_summary,city,condition,country,detailed_description,eligibility,gender,keyword,mesh_term,official_title,overall_status,phase,state,url
0,0,Fraunhofer Institute for Molecular Biology and...,PREVALENCE OF Anti-CCP POSITIVITY AND SUBCLINI...,"Non-interventional, prospective, observational...",Frankfurt am Main,Rheumatoid Arthritis,Germany,Studies of early arthritis cohorts have shown ...,population without pre-classified RA but new o...,All,Anti-CCP,Arthritis,PREVALENCE OF ANTI-CYCLIC CITRULLINATED PEPTID...,Recruiting,Missing,Hessia,https://clinicaltrials.gov/show/NCT03267147
1,1,University of Washington,Aldesleukin and Pembrolizumab in Treating Pati...,This phase I trial studies the side effects an...,Seattle,Clear Cell Renal Cell Carcinoma,United States,PRIMARY OBJECTIVES: I. To evaluate the safety ...,Inclusion Criteria: - Be willing and able to ...,All,Missing,Carcinoma,A Phase I Trial of Interleukin-2 (Aldesleukin)...,Recruiting,Phase 1,Washington,https://clinicaltrials.gov/show/NCT03260504
2,3,"CSPC ZhongQi Pharmaceutical Technology Co., Ltd.",Nab-Paclitaxel Versus Paclitaxel Plus Carbopla...,"This is a randomized, multicenter, open, contr...",Missing,Non-small Cell Lung Cancer,Missing,Subjects will receive one of two treatment reg...,Inclusion Criteria: - Accepted the purpose of...,All,Missing,Lung Neoplasms,"The Randomized,Open, Multicenter Phase III Stu...",Not yet recruiting,Phase 3,Missing,https://clinicaltrials.gov/show/NCT03262948
3,4,Fraunhofer Institute for Molecular Biology and...,PREVALENCE OF Anti-CCP POSITIVITY AND SUBCLINI...,"Non-interventional, prospective, observational...",Frankfurt am Main,Rheumatoid Arthritis,Germany,Studies of early arthritis cohorts have shown ...,population without pre-classified RA but new o...,All,Anti-CCP,Arthritis,PREVALENCE OF ANTI-CYCLIC CITRULLINATED PEPTID...,Recruiting,Missing,Hessia,https://clinicaltrials.gov/show/NCT03267147
4,7,Chang Gung Memorial Hospital,Analgesic Effect of Low Level Laser for Proced...,Neonates undergo several painful procedures an...,Kaohsiung,Procedural Pain,Taiwan,"This open-label, randomized controlled trial i...",Inclusion Criteria: - healthy fullterm neonat...,All,neonates,"Pain, Procedural",Analgesic Effect of Low Level Laser for Proced...,Recruiting,Missing,Missing,https://clinicaltrials.gov/show/NCT03268148


In [27]:
# Create search_terms column containing relevant text columns 
df['search_terms'] = df['mesh_term'].str.lower() + ' ' + df['condition'].str.lower() + ' ' + df['keyword'].str.lower()

### Functions

In [28]:
# Remove duplicate words from string
def set_o_strings(strings):
    new = OrderedDict()
    for string in strings.split():
        new[string] = ''
    return u" ".join(new.keys())

In [29]:
# Remove missing from search terms, replace with blank space
def replace_missing(string):
    new = re.sub(' missing', '', string)
    return u"{}".format(new)

In [30]:
# Apply above functions to search terms
df['search_terms'] = df['search_terms'].apply(replace_missing)
df['search_terms'] = df['search_terms'].apply(set_o_strings)

In [31]:
df['search_terms'].head(20)

0                         arthritis rheumatoid anti-ccp
1                            carcinoma clear cell renal
2                  lung neoplasms non-small cell cancer
3                         arthritis rheumatoid anti-ccp
4                        pain, procedural pain neonates
5                 retinal diseases diabetic retinopathy
6                                  kidney diseases ldkt
7                      glaucoma artificial intelligence
8                 spinal cord injuries neuropathic pain
9     behavior, addictive behavior problem smartphon...
10        tacrolimus kidney transplantation envarsus xr
11                missing advanced cancer immunotherapy
12                                hemorrhage antepartum
13             progesterone abortion in first trimester
14                     brain neoplasms tumor tumor, mri
15                  disease major depression stem cells
16                             diabetes mellitus type 1
17            infarction risk stratification pre

In [32]:
# Tokenize search terms
df['keyword_tokens'] = df['search_terms'].apply(lambda x: nlp.tokenizer(x))

In [33]:
df['keyword_tokens'].head(50)

0                 (arthritis, rheumatoid, anti, -, ccp)
1                       (carcinoma, clear, cell, renal)
2        (lung, neoplasms, non, -, small, cell, cancer)
3                 (arthritis, rheumatoid, anti, -, ccp)
4                 (pain, ,, procedural, pain, neonates)
5            (retinal, diseases, diabetic, retinopathy)
6                              (kidney, diseases, ldkt)
7                  (glaucoma, artificial, intelligence)
8           (spinal, cord, injuries, neuropathic, pain)
9     (behavior, ,, addictive, behavior, problem, sm...
10    (tacrolimus, kidney, transplantation, envarsus...
11           (missing, advanced, cancer, immunotherapy)
12                             (hemorrhage, antepartum)
13       (progesterone, abortion, in, first, trimester)
14             (brain, neoplasms, tumor, tumor, ,, mri)
15            (disease, major, depression, stem, cells)
16                        (diabetes, mellitus, type, 1)
17       (infarction, risk, stratification, pred

In [33]:
df.to_csv("clinial_trial_tokenized")