# Connect to the AACT clinical trials database from local machine

## Import packages 

In [65]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import pandas as pd
import pickle
import re

## Input AACT connection arguments 

###### Refactor the login code. Is there a better way to get a text file or other file in with the login credentials?  Can I make the entire connection process a function?

In [107]:
login = pd.read_csv('login.csv', header=None)
user = login.iloc[0,0]
password = login.iloc[0,1]

In [108]:
connection_args = {'host': 'aact-db.ctti-clinicaltrials.org', 
                   'user': user, 
                   'password': password, 
                   'dbname': 'aact',
                   'port': 5432}

## Connect to AACT database 

In [109]:
connection = pg.connect(**connection_args)

## Execute queries for eligibility criteria

### Eligibility criteria: inclusion and exclusion criteria

In [114]:
query = "SELECT * FROM eligibilities LIMIT 1;"

data = pd_sql.read_sql(query, connection)

In [115]:
data

Unnamed: 0,id,nct_id,sampling_method,gender,minimum_age,maximum_age,healthy_volunteers,population,criteria,gender_description,gender_based
0,1472568,NCT01488188,,All,20 Years,49 Years,Accepts Healthy Volunteers,,\n Inclusion Criteria:\n\n - ...,,


In [14]:
data.to_pickle("test_data.pkl")

In [15]:
# test_load = pd.read_pickle("test_data.pkl")
# test_load

In [17]:
data

Unnamed: 0,id,nct_id,sampling_method,gender,minimum_age,maximum_age,healthy_volunteers,population,criteria,gender_description,gender_based
0,1472568,NCT01488188,,All,20 Years,49 Years,Accepts Healthy Volunteers,,\n Inclusion Criteria:\n\n - ...,,
1,1472569,NCT01488175,,All,18 Years,90 Years,No,,\n Inclusion Criteria:\n\n - ...,,
2,1472570,NCT01488162,Probability Sample,All,18 Years,,No,Adult patients with relapsing or refractory ch...,\n Inclusion Criteria:\n\n - ...,,


## Create a dictionary for each study to load into to MongoDB 

### Get a single record and create an empty dictionary 

In [100]:
study_id = data.nct_id[1]
eligibility = data.criteria[1]
print(eligibility)


        Inclusion Criteria:

          -  clinical diagnosis og osteoarthritis of the knee

          -  patients must be 18 years or older

          -  patients must understand and speak danish

          -  must be able to give signed consent

        Exclusion Criteria:

          -  severe medical illness

          -  documented osteoporosis

          -  rheumatoid arthritis

          -  prior surgery in the knee

          -  neuropathy
      


In [98]:
document = {}
document['study_id'] = study_id

### Split inclusion and exclusion criteria 

##### Add a test to see if Inclusion and Exclusion criteria are included in every study - some studies lack exclusion criteria and only have inclusion 

In [101]:
inclusion, exclusion = eligibility.split('Exclusion Criteria:')
inclusion

'\n        Inclusion Criteria:\n\n          -  clinical diagnosis og osteoarthritis of the knee\n\n          -  patients must be 18 years or older\n\n          -  patients must understand and speak danish\n\n          -  must be able to give signed consent\n\n        '

### Find inclusion criteria 

In [106]:
regex = '-\s\s(.+)\n\n'
inclusion_criteria = re.findall(regex, inclusion)
document['inclusion_criteria'] = inclusion_criteria
document

{'study_id': 'NCT01488175',
 'inclusion_criteria': ['clinical diagnosis og osteoarthritis of the knee',
  'patients must be 18 years or older',
  'patients must understand and speak danish',
  'must be able to give signed consent'],
 'exclusion_criteria': ['severe medical illness',
  'documented osteoporosis',
  'rheumatoid arthritis',
  'prior surgery in the knee']}

### Find exclusion criteria 

In [105]:
regex = '-\s\s(.+)\n\n'
exclusion_criteria = re.findall(regex, exclusion)
document['exclusion_criteria'] = exclusion_criteria
document

{'study_id': 'NCT01488175',
 'inclusion_criteria': ['clinical diagnosis og osteoarthritis of the knee',
  'patients must be 18 years or older',
  'patients must understand and speak danish',
  'must be able to give signed consent'],
 'exclusion_criteria': ['severe medical illness',
  'documented osteoporosis',
  'rheumatoid arthritis',
  'prior surgery in the knee']}

## Close connection to AACT 

In [29]:
connection.close()