# NLP Pipeline 

In [90]:
from pymongo import MongoClient
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import Normalizer

## Connect to the Mongo clinical_trials database 

In [3]:
def connect_to_mongo(database, collection, client=None):
    """
    Opens a connection to a specified Mongo DB location

    Input Parameters:
    database: name of database to connect to or create (str)
    collection: name of collection to connect to or create (str)
    client: optional, already-open client to reuse. When None (the default)
        a new MongoClient() is created with no arguments, exactly as before.

    Returns:
    The database object (no collection specified)
    The collection object for the specific Mongo location (database & collection)
    """

    # Reuse the caller's client when one is supplied so that switching
    # databases or collections does not open a fresh connection every call.
    if client is None:
        client = MongoClient()
    db = client[database]
    mongo_loc = db[collection]
    return db, mongo_loc

In [77]:
# connect_to_mongo returns (database, collection): trials_loc is the
# 'clinical_trials' database handle and eligibility_loc is its
# 'eligibilities' collection.
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')

### Create a Mongo cursor 

In [78]:
# Cursor over eligibility_loc with no filter or sort applied, capped at
# 10000 documents.
doc_cursor = eligibility_loc.find().limit(10000)

## Clean the text 

Regex to add:
1. remove commas
2. remove slash-equals '/=' and replace with nothing
3. remove period and space '. ' replace with space
4. remove parentheses '(', ')', replace with nothing
5. replace 'less than', 'less than or equal to', '</=' with '<'
6. replace 'greater than', 'greater than or equal to', '>/=' with '>'
7. remove ';' and replace with nothing
8. remove period from end of each string - '\.$' and replace with nothing

Token:
1. single space, or \[a-zA-Z0-9_></-\]+

Stopwords:
1. maybe include 'within' as an important word
2. include 'not'

Segmenting the 'doc':
1. Consider vectorizing each individual criterion separately; currently all criteria for a study are vectorized at once. The current approach produces many run-on bigrams and trigrams that span adjacent criteria and are not meaningful, though these will perhaps be less common across all docs, since the order of criteria may not be very important.

Test cases to add:
1. all of them - show example and output

## Vectorize 

In [79]:
# Bag-of-n-grams model over the inclusion criteria:
#   - text is lowercased and split into tokens of two or more letters a-z
#   - scikit-learn's built-in English stop words are dropped
#   - features are word n-grams of length 2 through 4
#   - n-grams present in more than 60% of the documents are discarded
count_vectorizer = CountVectorizer(
    lowercase=True,
    token_pattern="[a-z][a-z]+",
    stop_words='english',
    ngram_range=(2, 4),
    max_df=0.6,
)

# One document per study: all of its inclusion criteria joined by spaces.
X = count_vectorizer.fit_transform(
    [' '.join(trial['inclusion_criteria']) for trial in doc_cursor]
)

In [86]:
# Number of distinct n-gram features learned by the fitted vectorizer.
# vocabulary_ maps each kept term to its column index, so its length is the
# feature count. This avoids get_feature_names(), which was removed in
# scikit-learn 1.2.
len(count_vectorizer.vocabulary_)

629900

1. get the doc
2. decide how to segment the eligibility data - first pass, try to do all inclusion data for a study as one entry, all exclusion data for a study as an entry. This might require concatenating all the elements in the inclusion_criteria into one long element:  
`" ".join(my_list)`
3. separately clean inclusion_criteria and exclusion_criteria
4. add the cleaned_inclusion_criteria and cleaned_exclusion_criteria as part of that doc record - find it by the study_id

In [71]:
# NLTK's English stop-word list as a set for fast membership tests.
# NOTE(review): the CountVectorizer in this notebook is configured with
# stop_words='english' (scikit-learn's built-in list), not this set.
stop_words = set(stopwords.words('english'))

In [72]:
# Display the stop-word set for inspection.
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [164]:
# Explicitly close the Mongo cursor once it is no longer needed.
doc_cursor.close()

## Fit model 

# Next steps 

## Improving the pipeline 

* Refactor `connect_to_mongo()` function so we don't have to reconnect when switching databases (might be able to pull this into other functions later on)

## Error checking and production aspects to add   

* Error messages
* docstrings
* create functions
* comments to explain hard-coding choices or why an approach was used