# Connect to the AACT clinical trials database from local machine

## Import packages 

In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import pandas as pd
import pickle
import re
import json
from pymongo import MongoClient

## Input AACT connection arguments 

###### TODO: Refactor the login code. Is there a better way to load the login credentials from a text file or another kind of file? Can the entire connection process be wrapped in a function?

In [23]:
# login.csv has no header row; its first row holds the AACT username in
# column 0 and the password in column 1.
login = pd.read_csv('login.csv', header=None)
user, password = login.iloc[0, 0], login.iloc[0, 1]

In [24]:
# Keyword arguments later unpacked into pg.connect(): the AACT host,
# database name and port, plus the credentials held in `user`/`password`.
connection_args = dict(
    host='aact-db.ctti-clinicaltrials.org',
    user=user,
    password=password,
    dbname='aact',
    port=5432,
)

## Connect to AACT database 

In [25]:
# Open a psycopg2 connection using the keyword arguments in connection_args.
connection = pg.connect(**connection_args)

## Execute queries for eligibility criteria

### Eligibility criteria: inclusion and exclusion criteria

In [None]:
# Fetch one row from the eligibilities table into a DataFrame.
query = "SELECT * FROM eligibilities LIMIT 1;"

data = pd_sql.read_sql(sql=query, con=connection)

In [None]:
# Print the query result as plain text.
print(data)

In [None]:
# Save the query result to a local pickle file (test_data.pkl).
data.to_pickle("test_data.pkl")

In [None]:
# test_load = pd.read_pickle("test_data.pkl")
# test_load

In [None]:
# Bare expression: in a notebook cell this displays the DataFrame.
data

## Create a dictionary for each study to load into MongoDB 

### Get a single record and create an empty dictionary 

In [None]:
# The query that produced `data` uses LIMIT 1, so the DataFrame has a single
# row with label 0. Indexing with 1 raises KeyError, and the cell that builds
# `document` already reads the other fields from row 0.
study_id = data.nct_id[0]
eligibility = data.criteria[0]
print(eligibility)

In [None]:
# Start the MongoDB document with the study identifier and the demographic
# limits taken from the first row of the query result.
document = {
    'study_id': study_id,
    'minimum_age': data.minimum_age[0],
    'maximum_age': data.maximum_age[0],
    'gender': data.gender[0],
}

### Split inclusion and exclusion criteria 

##### TODO: Add a test to check whether both inclusion and exclusion criteria are present in every study; some studies lack exclusion criteria and list only inclusion criteria.

In [None]:
# partition() always returns three parts, so a study with no
# 'Exclusion Criteria:' heading gives an empty `exclusion` string instead of
# the ValueError raised by unpacking the result of split().
inclusion, _marker, exclusion = eligibility.partition('Exclusion Criteria:')
inclusion

### Find inclusion criteria 

In [None]:
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
# The pattern captures the text after "-  " up to the blank line that ends
# each criterion.
regex = r'-\s\s(.+)\n\n'
inclusion_criteria = re.findall(regex, inclusion)
document['inclusion_criteria'] = inclusion_criteria
document

### Find exclusion criteria 

In [None]:
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
# The pattern captures the text after "-  " up to the blank line that ends
# each criterion.
regex = r'-\s\s(.+)\n\n'
exclusion_criteria = re.findall(regex, exclusion)
document['exclusion_criteria'] = exclusion_criteria
document

### Function to create a dictionary record from a SQL query 

In [4]:
def create_document(record):
    """Build a MongoDB-ready dictionary from a one-row eligibilities result.

    Args:
        record: Object whose ``nct_id``, ``minimum_age``, ``maximum_age``,
            ``gender`` and ``criteria`` attributes can each be indexed with
            ``0`` (for example a DataFrame returned by ``read_sql``).

    Returns:
        dict with the keys ``study_id``, ``minimum_age``, ``maximum_age``,
        ``gender``, ``inclusion_criteria`` and ``exclusion_criteria``. The
        last two are lists of criterion strings; either list is empty when
        no matching criteria are found.
    """
    document = {
        'study_id': record.nct_id[0],
        'minimum_age': record.minimum_age[0],
        'maximum_age': record.maximum_age[0],
        'gender': record.gender[0],
    }

    eligibility = record.criteria[0]
    # A missing criteria value (None/NaN) is treated as "no criteria".
    if not isinstance(eligibility, str):
        eligibility = ''
    # Join criteria that were wrapped onto an indented continuation line.
    eligibility = eligibility.replace('\n             ', ' ')

    # partition() always yields three parts, so studies without an
    # 'Exclusion Criteria:' heading produce an empty exclusion section
    # instead of raising ValueError on unpacking.
    inclusion, _marker, exclusion = eligibility.partition('Exclusion Criteria:')

    # Each criterion is "-  <text>" followed by a blank line.
    criterion_pattern = r'-\s\s(.+)\n\n'
    document['inclusion_criteria'] = re.findall(criterion_pattern, inclusion)
    document['exclusion_criteria'] = re.findall(criterion_pattern, exclusion)
    return document

## Create a cursor and iterate through queries

### Update record cleaner function for SQL cursor queries

The SQL cursor returns each row as a tuple, so the record cleaner must index fields by position instead of by the labeled columns of a DataFrame.

In [29]:
def clean_record(record):
    """Convert one eligibilities row from a SQL cursor into a document dict.

    Joins criteria text that was wrapped onto indented continuation lines and
    splits it into inclusion and exclusion criteria.

    Args:
        record: Row sequence indexed by position: 1 is the study id,
            3 gender, 4 minimum age, 5 maximum age and 8 the criteria text.

    Returns:
        dict with the keys ``study_id``, ``minimum_age``, ``maximum_age``,
        ``gender``, ``inclusion_criteria`` and ``exclusion_criteria``, ready
        to be sent to mongodb. The last two are lists of criterion strings;
        either list is empty when no matching criteria are found.
    """
    document = {
        'study_id': record[1],
        'minimum_age': record[4],
        'maximum_age': record[5],
        'gender': record[3],
    }

    eligibility = record[8]
    # A missing criteria value (e.g. None) is treated as "no criteria".
    if not isinstance(eligibility, str):
        eligibility = ''
    eligibility = eligibility.replace('\n             ', ' ')

    # partition() always yields three parts, so a study without an
    # 'Exclusion Criteria:' heading gets an empty exclusion section instead
    # of raising ValueError on unpacking.
    inclusion, _marker, exclusion = eligibility.partition('Exclusion Criteria:')

    # Each criterion is "-  <text>" followed by a blank line.
    criterion_pattern = r'-\s\s(.+)\n\n'
    document['inclusion_criteria'] = re.findall(criterion_pattern, inclusion)
    document['exclusion_criteria'] = re.findall(criterion_pattern, exclusion)
    return document

In [30]:
def send_to_mongodb(document, database, collection):
    """Insert one document into a MongoDB collection.

    Connects with a default ``MongoClient()``, inserts the document and
    closes the client again, even if the insert fails.

    Args:
        document: Dictionary to insert.
        database: Name of the target database, as a string.
        collection: Name of the target collection, as a string.

    Raises:
        ValueError: If ``database`` or ``collection`` is empty or None.
    """
    # Fail early, before opening a connection, when a target name is missing.
    for label, name in (('database', database), ('collection', collection)):
        if not name:
            raise ValueError(
                "{} must be a non-empty string, got {!r}".format(label, name)
            )

    client = MongoClient()
    try:
        client[database][collection].insert_one(document)
    finally:
        # Always release the client, including when insert_one raises.
        client.close()

In [36]:
# another way to read in the login credentials:
# the context manager closes login.txt once it has been parsed
with open("login.txt") as login_file:
    connection_args = json.load(login_file)

Create a login.txt file with the database and login credentials, adding in your specific username and password:

{"host": "aact-db.ctti-clinicaltrials.org", "user": "username", "password": "password", "dbname": "aact", "port": 5432}


In [None]:
# Read the credentials from the first row of login.csv (no header row).
login = pd.read_csv('login.csv', header=None)
user, password = login.iloc[0, 0], login.iloc[0, 1]

# Keyword arguments for pg.connect().
connection_args = dict(
    host='aact-db.ctti-clinicaltrials.org',
    user=user,
    password=password,
    dbname='aact',
    port=5432,
)

# Open the connection and create a cursor on it.
connection = pg.connect(**connection_args)
cursor = connection.cursor()

In [40]:
def sql_to_mongo(query, login, database, collection):
    """SQL to MongoDB pipeline.

    Runs ``query``, converts every returned row into a dictionary with
    ``clean_record`` and inserts it with ``send_to_mongodb``.

    Args:
        query: SQL query string to execute.
        login: Path to a JSON file holding the keyword arguments for
            ``pg.connect`` (host, user, password, dbname, port).
        database: MongoDB database name, as a string.
        collection: MongoDB collection name, as a string.
    """
    # Close the credentials file as soon as it has been parsed.
    with open(login) as login_file:
        connection_args = json.load(login_file)

    connection = pg.connect(**connection_args)
    try:
        # The cursor context manager closes the cursor when the block exits.
        with connection.cursor() as cursor:
            cursor.execute(query)
            for result in cursor:
                document = clean_record(result)
                send_to_mongodb(document, database, collection)
    finally:
        # Close the connection even if a query, clean or insert step fails.
        connection.close()

In [41]:
# Run the pipeline on three eligibilities rows, using the credentials in
# login.txt and writing to test_database.trials.
query = "SELECT * FROM eligibilities LIMIT 3;"

sql_to_mongo(
    query,
    login='login.txt',
    database='test_database',
    collection='trials',
)

## Close connection to AACT 

In [22]:
# Close the module-level `connection` object.
connection.close()