# NLP Pipeline 

In [157]:
from pymongo import MongoClient
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import Normalizer
import itertools

## Connect to the Mongo clinical_trials database 

In [5]:
def connect_to_mongo(database, collection):
    
    """
    Opens a connection to a specified Mongo DB location
    
    Input Parameters:
    database: name of database to connect to or create (str)
    collection: name of collection to connect to or create (str)
    
    Returns:
    The connection object for the database without a collection specified
    The connection object for a specific Mongo location (database & collection)
    """
    
    client = MongoClient()
    db = client[database]
    mongo_loc = db[collection]
    return db, mongo_loc

In [130]:
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')

### Create a Mongo cursor 

In [131]:
doc_cursor = eligibility_loc.find()

## Clean the text 

### Test on 5 entires 

In [282]:
doc_cursor = eligibility_loc.find().limit(5)
stemmer = PorterStemmer()
doc_count = 1

for doc in doc_cursor:
    inclusion_criteria = doc['inclusion_criteria']
    print(doc['study_id'])
    clean_criteria_list = []
    for criteria in inclusion_criteria:
        print("\nCriteria:\n -------------\n", criteria)
        remove_comma = re.sub(',', '', criteria)
        remove_equals = re.sub('/=', '', remove_comma)
        remove_period_space = re.sub('\. ', ' ', remove_equals)
        remove_less_than = re.sub('less than', '<', remove_period_space)
        remove_less_than_equal = re.sub('less than or equal to', '<', remove_less_than)
        remove_greater_than = re.sub('greater than', '>', remove_less_than_equal)
        remove_greater_than_equal = re.sub('greater than or equal to', '>', remove_greater_than)
        remove_gt_symbol = re.sub('≥', '>', remove_greater_than_equal)
        remove_lt_symbolr = re.sub('≤', '<', remove_gt_symbol)
        remove_semicolon = re.sub(';', '', remove_gt_symbol)
        remove_colon = re.sub(':', '', remove_semicolon)
        remove_lparen = re.sub('\(', '', remove_colon)
        remove_rparen = re.sub('\)', '', remove_lparen)
        same_dash = re.sub('–', '-', remove_rparen)
        clean_crit = re.sub('\.$', '', same_dash)
        stem_crit = stemmer.stem(clean_crit)
        clean_criteria_list.append(stem_crit)
    print("\n Cleaned Criteria:\n-----------\n", clean_criteria_list)
    eligibility_loc.update_one({'study_id':doc['study_id']}, {"$set": {"cleaned_inclusion": clean_criteria_list}}, upsert=False)
    doc_count += 1
#     if doc_count%2000 == 0:
#         print(f'\nCleaning doc {doc_count}')
print('\nDone.')

NCT00864942

Criteria:
 -------------
 Documented relapsed or refractory B-cell NHL; CD-20 positive tumor. Indolent NHL: follicular B-cell lymphoma, diffuse small lymphocytic lymphoma, lymphoplasmacytic lymphoma, marginal zone lymphoma, transformed aggressive lymphomas, mantle cell lymphoma and chronic lymphocytic leukemia

Criteria:
 -------------
 Maximum of 6 prior chemotherapy regimens. Prior rituximab is allowed.

Criteria:
 -------------
 Bidimensionally measurable disease

Criteria:
 -------------
 ECOG performance status 0-2

Criteria:
 -------------
 Absolute neutrophil count >/= 1000 and platelet count >/= 50,000

Criteria:
 -------------
 Serum creatinine </= 1.5 mg/dL

Criteria:
 -------------
 Adequate hepatic function

Criteria:
 -------------
 Estimated life expectancy of at least 3 months

Criteria:
 -------------
 All study participants must be registered into the mandatory RevAssist program and be willing and able to comply with the requirements of RevAssist

Criteria

### Clean inclusion criteria for 20000 documents 

In [292]:
doc_cursor = eligibility_loc.find().limit(20000)
stemmer = PorterStemmer()
doc_count = 1

for doc in doc_cursor:
    inclusion_criteria = doc['inclusion_criteria']
#     print(doc['study_id'])
    clean_criteria_list = []
    for criteria in inclusion_criteria:
#         print("\nCriteria:\n -------------\n", criteria)
        remove_comma = re.sub(',', '', criteria)
        remove_equals = re.sub('/=', '', remove_comma)
        remove_period_space = re.sub('\. ', ' ', remove_equals)
        remove_less_than = re.sub('less than', '<', remove_period_space)
        remove_less_than_equal = re.sub('less than or equal to', '<', remove_less_than)
        remove_greater_than = re.sub('greater than', '>', remove_less_than_equal)
        remove_greater_than_equal = re.sub('greater than or equal to', '>', remove_greater_than)
        remove_gt_symbol = re.sub('≥', '>', remove_greater_than_equal)
        remove_lt_symbolr = re.sub('≤', '<', remove_gt_symbol)
        remove_semicolon = re.sub(';', '', remove_gt_symbol)
        remove_colon = re.sub(':', '', remove_semicolon)
        remove_lparen = re.sub('\(', '', remove_colon)
        remove_rparen = re.sub('\)', '', remove_lparen)
        same_dash = re.sub('–', '-', remove_rparen)
        clean_crit = re.sub('\.$', '', same_dash)
        stem_crit = stemmer.stem(clean_crit)
        clean_criteria_list.append(stem_crit)
#     print("\n Cleaned Criteria:\n-----------\n", clean_criteria_list)
    eligibility_loc.update_one({'study_id':doc['study_id']}, {"$set": {"cleaned_inclusion": clean_criteria_list}}, upsert=False)
    doc_count += 1
    if doc_count%2000 == 0:
        print(f'\nCleaning doc {doc_count}')
print('\nDone.')


Cleaning doc 2000

Cleaning doc 4000

Cleaning doc 6000

Cleaning doc 8000

Cleaning doc 10000

Cleaning doc 12000

Cleaning doc 14000

Cleaning doc 16000

Cleaning doc 18000

Cleaning doc 20000

Done.


## Vectorize 

### Unpack inclusion critera so we can fit each one to the vectorizer separately 

In [227]:
single_doc = [['crit1', 'crit2', 'crit3'], ['crita', 'critb']]
list(itertools.chain(*single_doc))
# use mongo explode next time

['crit1', 'crit2', 'crit3', 'cirta', 'critb']

### Test that unpacking works with the cursor to create a list the vectorizer will accept

In [234]:
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')
doc_cursor = eligibility_loc.find().limit(2)

unpacked_criteria = list(itertools.chain(*(doc['cleaned_inclusion'] for doc in doc_cursor)))
unpacked_criteria

['documented relapsed or refractory b-cell nhl cd-20 positive tumor indolent nhl follicular b-cell lymphoma diffuse small lymphocytic lymphoma lymphoplasmacytic lymphoma marginal zone lymphoma transformed aggressive lymphomas mantle cell lymphoma and chronic lymphocytic leukemia',
 'maximum of 6 prior chemotherapy regimens prior rituximab is allow',
 'bidimensionally measurable diseas',
 'ecog performance status 0-2',
 'absolute neutrophil count > 1000 and platelet count > 50000',
 'serum creatinine < 1.5 mg/dl',
 'adequate hepatic funct',
 'estimated life expectancy of at least 3 month',
 'all study participants must be registered into the mandatory revassist program and be willing and able to comply with the requirements of revassist',
 'able to take aspirin 81 mg daily as prophylactic anticoagul',
 'patient was diagnosed as nosocomial infection defined according to criteria established by the us cdc the diagnosis criteria for ventilator-associated pneumonia are modified from those e

### Create vectorizer 

In [293]:
count_vectorizer = CountVectorizer(ngram_range=(2, 4),  
                                   stop_words='english', 
                                   token_pattern="[a-zA-Z0-9_\-ï/><\.]+",
                                   lowercase=True,
                                   max_df = 0.6,
                                   min_df = 2)

### Fit vectorizer on 1000 docs

In [315]:
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')
doc_cursor = eligibility_loc.find().limit(1000)

X = count_vectorizer.fit(list(itertools.chain(*(doc['cleaned_inclusion'] for doc in doc_cursor))))

In [321]:
X.get_feature_names()

['- 1',
 '- 10',
 '- 2',
 '- 30',
 '- 30 kg/m2',
 '- 35',
 '- 35 kg/m2',
 '- 4.5',
 '- 50',
 '- 60',
 '- 70',
 '- 70 year',
 '- 80',
 '- 80 year',
 '- agree',
 '- agree utilize',
 '- agree utilize following',
 '- revised',
 '- revised cdrs-r',
 '- revised cdrs-r total',
 '- surgically',
 '- surgically sterile',
 '- surgically sterile years',
 '--- agree',
 '--- agree utilize',
 '--- agree utilize following',
 '--- surgically',
 '--- surgically sterile',
 '--- surgically sterile years',
 '-b -drb1',
 '/- additional',
 '0 -',
 '0 - 1',
 '0 - 2',
 '0 1',
 '0 1 2',
 '0 1 screen',
 '0 2',
 '0 screening',
 '0 screening period',
 '0.50 0.81',
 '0.50 0.81 previous',
 '0.81 previous',
 '01 2',
 '0501 nct00412360',
 '1 2',
 '1 2 3',
 '1 21',
 '1 21 years',
 '1 21 years old',
 '1 3',
 '1 alcoholic',
 '1 alcoholic drink',
 '1 alcoholic drink day',
 '1 cm',
 '1 cm spiral',
 '1 cm spiral ct',
 '1 diabetes',
 '1 diabetes >',
 '1 diabetes > 5',
 '1 diabetes using',
 '1 diabetes using usual',
 '1 dose'

### Fit vectorizer on 20000 docs 

In [245]:
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')
doc_cursor = eligibility_loc.find().limit(20000)

count_vectorizer.fit(list(itertools.chain(*(doc['cleaned_inclusion'] for doc in doc_cursor))))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=1,
        ngram_range=(2, 4), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='[a-zA-Z0-9_\\-ï/><\\.]+',
        tokenizer=None, vocabulary=None)

In [249]:
len(count_vectorizer.get_feature_names())

1050130

In [164]:
doc_cursor.close()

### Transform 1000 docs

In [317]:
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')
doc_cursor = eligibility_loc.find().limit(1000)

X_trans = count_vectorizer.transform(' '.join(doc['cleaned_inclusion']) for doc in doc_cursor)

In [318]:
X_trans

<1000x8907 sparse matrix of type '<class 'numpy.int64'>'
	with 29687 stored elements in Compressed Sparse Row format>

In [256]:
X_array = X.toarray()

In [260]:
X_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Fit model 

### Fit model to 1k transformed dataset 

In [319]:
n_topics = 6
n_iter = 10
lda = LatentDirichletAllocation(n_topics=n_topics,
                                max_iter=n_iter,
                                random_state=42,
                               learning_method='online')
data = lda.fit_transform(X_trans)
data[0]



array([ 0.98714798,  0.00256454,  0.00257188,  0.00257726,  0.00256543,
        0.00257291])

In [320]:
def display_topics(model, feature_names, no_top_words):
    for ix, topic in enumerate(model.components_):
        print("Topic ", ix)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(lda,X.get_feature_names(),20)

Topic  0
performance status 18 years years old pregnancy test informed consent written informed informed cons 18 years old ecog performance > 18 ecog performance status male female childbearing potential count > years age age > x uln written informed consent upper limit measurable disease
Topic  1
18 years informed consent years ag < equal informed cons years age > 18 score > total score male female > equal 18 years ag > 18 years consent form informed consent form sign informed patients > legal guardian female patients sign informed cons
Topic  2
informed cons written informed informed consent years old written informed cons age 18 type 2 signed informed written informed consent 2 diabetes type 2 diabetes years ag 3 month signed informed cons consent form men women signed informed consent patient s 3 months 6 month
Topic  3
body mass index mass index body mass informed consent index bmi mass index bmi body mass index bmi medical history informed cons written informed bmi > birth contro

# Next steps 

## Improving the pipeline 

* Refactor `connect_to_mongo()` function so we don't have to reconnect when switching databases (might be able to pull this into other functions later on)
* Scrutinize stopwords - currently using default, there may be some words worth including such as 'not'

## Error checking and production aspects to add   

* Error messages
* docstrings
* create functions
* comments to explain hard-coding choices or why an approach was used
* Add test cases for cleaning