# Connect to the AACT clinical trials database from local machine

## Import packages 

In [209]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import pandas as pd
import pickle
import re
import json
from pymongo import MongoClient

## Input AACT connection arguments 

###### Refactor the login code. Is there a better way to get a text file or other file in with the login credentials?  Can I make the entire connection process a function?

In [193]:
login = pd.read_csv('login.csv', header=None)
user = login.iloc[0,0]
password = login.iloc[0,1]

In [194]:
connection_args = {'host': 'aact-db.ctti-clinicaltrials.org', 
                   'user': user, 
                   'password': password, 
                   'dbname': 'aact',
                   'port': 5432}

## Connect to AACT database 

In [176]:
connection = pg.connect(**connection_args)

## Execute queries for eligibility criteria

### Eligibility criteria: inclusion and exclusion criteria

In [182]:
query = "SELECT * FROM eligibilities LIMIT 1;"

data = pd_sql.read_sql(query, connection)

In [184]:
print(data)

        id       nct_id sampling_method gender minimum_age maximum_age  \
0  1573978  NCT00147654                   Male    40 Years         N/A   

  healthy_volunteers population  \
0                 No              

                                            criteria gender_description  \
0  \n        Inclusion Criteria:\n\n          -  ...                      

  gender_based  
0         None  


In [14]:
data.to_pickle("test_data.pkl")

In [15]:
# test_load = pd.read_pickle("test_data.pkl")
# test_load

In [17]:
data

Unnamed: 0,id,nct_id,sampling_method,gender,minimum_age,maximum_age,healthy_volunteers,population,criteria,gender_description,gender_based
0,1472568,NCT01488188,,All,20 Years,49 Years,Accepts Healthy Volunteers,,\n Inclusion Criteria:\n\n - ...,,
1,1472569,NCT01488175,,All,18 Years,90 Years,No,,\n Inclusion Criteria:\n\n - ...,,
2,1472570,NCT01488162,Probability Sample,All,18 Years,,No,Adult patients with relapsing or refractory ch...,\n Inclusion Criteria:\n\n - ...,,


## Create a dictionary for each study to load into MongoDB 

### Get a single record and create an empty dictionary 

In [100]:
study_id = data.nct_id[1]
eligibility = data.criteria[1]
print(eligibility)


        Inclusion Criteria:

          -  clinical diagnosis og osteoarthritis of the knee

          -  patients must be 18 years or older

          -  patients must understand and speak danish

          -  must be able to give signed consent

        Exclusion Criteria:

          -  severe medical illness

          -  documented osteoporosis

          -  rheumatoid arthritis

          -  prior surgery in the knee

          -  neuropathy
      


In [123]:
document = {}
document['study_id'] = study_id
document['minimum_age'] = data.minimum_age[0]
document['maximum_age'] = data.maximum_age[0]
document['gender'] = data.gender[0]

### Split inclusion and exclusion criteria 

##### Add a test to see if Inclusion and Exclusion criteria are included in every study - some studies lack exclusion criteria and only have inclusion 

In [101]:
inclusion, exclusion = eligibility.split('Exclusion Criteria:')
inclusion

'\n        Inclusion Criteria:\n\n          -  clinical diagnosis og osteoarthritis of the knee\n\n          -  patients must be 18 years or older\n\n          -  patients must understand and speak danish\n\n          -  must be able to give signed consent\n\n        '

### Find inclusion criteria 

In [106]:
regex = '-\s\s(.+)\n\n'
inclusion_criteria = re.findall(regex, inclusion)
document['inclusion_criteria'] = inclusion_criteria
document

{'study_id': 'NCT01488175',
 'inclusion_criteria': ['clinical diagnosis og osteoarthritis of the knee',
  'patients must be 18 years or older',
  'patients must understand and speak danish',
  'must be able to give signed consent'],
 'exclusion_criteria': ['severe medical illness',
  'documented osteoporosis',
  'rheumatoid arthritis',
  'prior surgery in the knee']}

### Find exclusion criteria 

In [105]:
regex = '-\s\s(.+)\n\n'
exclusion_criteria = re.findall(regex, exclusion)
document['exclusion_criteria'] = exclusion_criteria
document

{'study_id': 'NCT01488175',
 'inclusion_criteria': ['clinical diagnosis og osteoarthritis of the knee',
  'patients must be 18 years or older',
  'patients must understand and speak danish',
  'must be able to give signed consent'],
 'exclusion_criteria': ['severe medical illness',
  'documented osteoporosis',
  'rheumatoid arthritis',
  'prior surgery in the knee']}

### Function to create a dictionary record from a SQL query 

In [177]:
def create_document(record):
    document = {}
    document['study_id'] = record.nct_id[0]
    document['minimum_age'] = record.minimum_age[0]
    document['maximum_age'] = record.maximum_age[0]
    document['gender'] = record.gender[0]
    
    # need to test if there is 'Exclusion Criteria:' in the dataset
    eligibility = record.criteria[0]
    eligibility = eligibility.replace('\n             ', ' ') 
    inclusion, exclusion = eligibility.split('Exclusion Criteria:')
    regex = '-\s\s(.+)\n\n'
    clean_inclusion = re.findall(regex, inclusion)
    clean_exclusion = re.findall(regex, exclusion)
    document['inclusion_criteria'] = clean_inclusion
    document['exclusion_criteria'] = clean_exclusion
    return document

## Create a cursor and iterate through queries

### Update create document for SQL cursor queries

The SQL cursor returns a tuple, so we must update the indexing for tuples, instead of dataframes with labeled columns

In [206]:
def clean_record(record):
    
    """Takes an AACT database read from an SQL cursor and produces a dictionary. 
    Removes new lines and extra spaces from eligibility criteria. 
    Returns a dictionary in document form to be sent to mongodb."""
    
    document = {}
    document['study_id'] = record[1]
    document['minimum_age'] = record[4]
    document['maximum_age'] = record[5]
    document['gender'] = record[3]
    
    eligibility = record[8]
    eligibility = eligibility.replace('\n             ', ' ') 
    # need to test if there is 'Exclusion Criteria:' in the dataset
    inclusion, exclusion = eligibility.split('Exclusion Criteria:')
    regex = '-\s\s(.+)\n\n'
    clean_inclusion = re.findall(regex, inclusion)
    clean_exclusion = re.findall(regex, exclusion)
    document['inclusion_criteria'] = clean_inclusion
    document['exclusion_criteria'] = clean_exclusion
    return document

In [None]:
def send_to_mongodb(document, database, collection):
    
    """Takes a dictionary in document form and sends it to the specified database and collection in mongodb."""
    
    # Create an error message if there is no database or collection specified 
    client = MongoClient() # Connect to/close mongo outside function?
    db = client.database # can I use variable like this? Can test that...
    eligibility_criteria = db.eligibility  
    db.eligibility.insertOne(document)
    client.close()

In [195]:
connection = pg.connect(**connection_args)
cursor = connection.cursor()

In [None]:
# Function (SQL_query, data_cleaner_function, send_to_mongo):
    # connect to database
    # cursor.execute(SQL_query)
    # for result in cursor:
        # document = data_cleaner_function(result)
        # send_to_mongo(document)
        
        
# where to send in mongo: clinical_trials, collection = 

In [None]:
def sql_to_mongo(query, ):

In [208]:
# function that takes query, data cleaning function, specifications for where to go in mongo (collection to go to in mongo)

query = "SELECT * FROM eligibilities LIMIT 1;"
cursor.execute(query)

for result in cursor:
    print(clean_document(result))

{'study_id': 'NCT02554435', 'minimum_age': '55 Years', 'maximum_age': '74 Years', 'gender': 'All', 'inclusion_criteria': ['physically inactive (less than 60 minutes per week)', 'BMI between 25-35', 'in good health measured by Par-Q+', 'access to a smart phone'], 'exclusion_criteria': ['physical activity is inadvisable by their doctor', 'involved in another physical activity intervention within the past 6 months', 'used an activity monitor in the past 6 months', 'unwilling to travel for scheduled visits', 'currently taking medications that affect body composition', 'current smoker', 'report alcohol or drug problem', 'institutionalized for psychiatric illness within the last year']}


## Close connection to AACT 

In [118]:
connection.close()