# NLP Pipeline 

In [3]:
from pymongo import MongoClient
import re
import string

## Connect to the Mongo clinical_trials database 

In [10]:
def connect_to_mongo(database, collection, client=None):

    """
    Opens a connection to a specified Mongo DB location

    Input Parameters:
    database: name of database to connect to or create (str)
    collection: name of collection to connect to or create (str)
    client: optional, already-created client to reuse (e.g. a MongoClient).
        When omitted (None), a new client is created with MongoClient()'s
        default settings. Pass the same client on repeated calls to switch
        database or collection without opening a new connection each time.

    Returns:
    A tuple (db, mongo_loc):
    db: the database object, without a collection specified
    mongo_loc: the collection object for the specific Mongo location
        (database & collection)
    """

    # Only build a new client when the caller did not supply one, so a single
    # client can be shared across several database/collection lookups.
    if client is None:
        client = MongoClient()
    db = client[database]
    mongo_loc = db[collection]
    return db, mongo_loc

In [11]:
# connect_to_mongo returns (database object, collection object):
# trials_loc is the 'clinical_trials' database itself (not a collection),
# eligibility_loc is the 'eligibilities' collection inside it.
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')

### Create a Mongo cursor 

In [13]:
# find() is called with no filter, so the cursor ranges over the documents
# in the eligibilities collection.
doc_cursor = eligibility_loc.find()

In [14]:
# Peek at the first document from the cursor to inspect the record structure
# (fields such as study_id, inclusion_criteria appear in the output below).
print(doc_cursor[0])

{'_id': ObjectId('5b734462aded604ea198f141'), 'study_id': 'NCT00864942', 'minimum_age': '18 Years', 'maximum_age': 'N/A', 'gender': 'All', 'inclusion_criteria': ['Documented relapsed or refractory B-cell NHL; CD-20 positive tumor. Indolent NHL: follicular B-cell lymphoma, diffuse small lymphocytic lymphoma, lymphoplasmacytic lymphoma, marginal zone lymphoma, transformed aggressive lymphomas, mantle cell lymphoma and chronic lymphocytic leukemia', 'Maximum of 6 prior chemotherapy regimens. Prior rituximab is allowed.', 'Bidimensionally measurable disease', 'ECOG performance status 0-2', 'Absolute neutrophil count >/= 1000 and platelet count >/= 50,000', 'Serum creatinine </= 1.5 mg/dL', 'Adequate hepatic function', 'Estimated life expectancy of at least 3 months', 'All study participants must be registered into the mandatory RevAssist program and be willing and able to comply with the requirements of RevAssist', 'Able to take aspirin 81 mg daily as prophylactic anticoagulation'], 'exclu

## Clean the text 

1. get the doc
2. decide how to segment the eligibility data. As a first pass, treat all inclusion criteria for a study as one entry and all exclusion criteria for a study as another entry. This might require concatenating all the elements in `inclusion_criteria` into one long string:  
`" ".join(my_list)`
3. separately clean inclusion_criteria and exclusion_criteria
4. add `cleaned_inclusion_criteria` and `cleaned_exclusion_criteria` to that study's document record, looking the record up by its `study_id`

In [15]:
# TODO: loop over the documents and clean each one, e.g. `for doc in doc_cursor:`

In [16]:
# Close the cursor now that we are done reading from it.
doc_cursor.close()

# Next steps 

## Improving the pipeline 

* Refactor `connect_to_mongo()` function so we don't have to reconnect when switching databases (might be able to pull this into other functions later on)

## Error checking and production aspects to add   

* Error messages
* docstrings
* create functions
* comments to explain hard-coding choices or why an approach was used