# Using the remote AACT database to load Clinical Trials data
For the script to work, you have to sign up for AACT database access (free) on their webpage.

In [2]:
# Importing libraries
import os
import re
from datetime import datetime as dt
from getpass import getpass

import numpy as np
import pandas as pd
import psycopg2 # This package allows you to connect to PostgreSQL database
from psycopg2 import sql
# import graph_tool.all as gt
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Our stuff:
from alltrials.etl_utils import column_is_empty, column_contains_text, column_is_numeric, column_is_categorical

# Row cap used in LIMIT clauses when sampling a table from the remote database
n_samples = 10000
# %%


## Database credentials setup
Use your own AACT credentials here.

In [3]:

# Set your connection parameters.
# Credentials are taken from the environment (AACT_USER / AACT_PASSWORD) so they
# are never stored in the notebook; when a variable is missing you are prompted
# for the value instead.
# NOTE(review): a real password used to be hardcoded in this cell - treat it as
# leaked and change it on the AACT webpage.
db_params = {
    'dbname': os.environ.get('AACT_DBNAME', 'aact'), # Default
    'user': os.environ.get('AACT_USER') or input('AACT username: '), # This is your username on the aact webpage
    'password': os.environ.get('AACT_PASSWORD') or getpass('AACT password: '), # The password you defined while signing up on the aact webpage
    'host': os.environ.get('AACT_HOST', 'aact-db.ctti-clinicaltrials.org'),
    'port': int(os.environ.get('AACT_PORT', 5432)),  # Default PostgreSQL port
}
# Connect to the database
try:
    conn = psycopg2.connect(**db_params)
    cursor = conn.cursor()
    print("Connected to the database!")
except Exception as error:
    # Report the problem, then re-raise: without a live `conn` every later cell
    # would only fail with a confusing NameError.
    print("Error:", error)
    raise


Connected to the database!


## ctgov schema tables
While there are multiple tables available in the database, we will focus on the ctgov schema.

In [4]:

# %%
# Ask the information schema which tables live in the ctgov schema
cursor = conn.cursor()
list_tables_query = """
   SELECT table_name
   FROM information_schema.tables
   WHERE table_schema = 'ctgov';
"""
cursor.execute(list_tables_query)

# Each fetched row holds a single value: the table name
tables = cursor.fetchall()
print("Here are the tables in the ctgov schema:")
ctgov_table_names = [row[0] for row in tables]
print(ctgov_table_names)


Here are the tables in the ctgov schema:
['id_information', 'drop_withdrawals', 'reported_event_totals', 'browse_conditions', 'browse_interventions', 'countries', 'design_outcomes', 'overall_officials', 'all_design_outcomes', 'all_facilities', 'all_group_types', 'all_id_information', 'all_intervention_types', 'all_interventions', 'all_keywords', 'reported_events', 'provided_documents', 'interventions', 'search_results', 'documents', 'pending_results', 'retractions', 'baseline_measurements', 'design_groups', 'keywords', 'conditions', 'calculated_values', 'intervention_other_names', 'design_group_interventions', 'mesh_headings', 'links', 'brief_summaries', 'sponsors', 'baseline_counts', 'eligibilities', 'detailed_descriptions', 'designs', 'facility_contacts', 'outcome_analyses', 'responsible_parties', 'outcome_counts', 'study_references', 'result_groups', 'milestones', 'central_contacts', 'facility_investigators', 'result_agreements', 'study_searches', 'studies', 'participant_flows', 'ou

### Loading an example table from ctgov schema

In [5]:
# Fetch a sample of the conditions table. The row cap is passed as a bound
# query parameter instead of being formatted into the SQL text.
cursor.execute("SELECT * FROM ctgov.conditions LIMIT %s", (n_samples,))
result = cursor.fetchall()
# cursor.description has one entry per result column; item 0 is the column name
column_names = [desc[0] for desc in cursor.description]

# Convert the result to a DataFrame with column names
df = pd.DataFrame(result, columns=column_names)
print(df.head())

         id       nct_id                        name  \
0  65954254  NCT03968679        Head and Neck Cancer   
1  65954337  NCT03650322                      Cancer   
2  65954583  NCT02098252                         AVM   
3  65954667  NCT00719888  Small Lymphocytic Lymphoma   
4  65954669  NCT00489307                Solid Tumors   

                downcase_name  
0        head and neck cancer  
1                      cancer  
2                         avm  
3  small lymphocytic lymphoma  
4                solid tumors  


## Let's now access all tables and combine them using the nct_id as the key.

In the intermediate steps we will also conduct some data cleanup, attempting to constrain the data to the columns/tables and rows that seem useful. We are trying to strip off rows and columns with missing data, duplicated ids, etc.

In [4]:
# Analyze missing data for each column in a table
def _classify_columns(table_df):
    """Group the columns of ``table_df`` by the kind of content they hold.

    Every column is tested with the ``alltrials.etl_utils`` predicates in a
    fixed order (empty, categorical, numeric, text); the first predicate that
    accepts the column decides its bucket, and a column accepted by none of
    them is filed under ``'gibberish_columns'``.

    Parameters
    ----------
    table_df : pandas.DataFrame
        Table whose columns should be classified.

    Returns
    -------
    dict
        Maps 'text_columns', 'categorical_columns', 'numerical_columns',
        'gibberish_columns' and 'mostly_empty_columns' to lists of column names.
    """
    buckets = {
        'text_columns': [],
        'categorical_columns': [],
        'numerical_columns': [],
        'gibberish_columns': [],
        'mostly_empty_columns': [],
    }
    for column in table_df.columns:
        values = table_df[column]
        if column_is_empty(values):
            bucket = 'mostly_empty_columns'
        elif column_is_categorical(values):  # Detect categorical columns
            bucket = 'categorical_columns'
        elif column_is_numeric(values):  # Detect numerical columns
            bucket = 'numerical_columns'
        elif column_contains_text(values):  # Detect text columns
            bucket = 'text_columns'
        else:
            bucket = 'gibberish_columns'
        buckets[bucket].append(column)
    return buckets


all_tables_dict = dict()
df_list = list()

try:
    for table_row in tqdm(tables):
        table = table_row[0]  # rows fetched from information_schema hold only the table name

        # 1. Load the table into a DataFrame. The table name is quoted as an SQL
        #    identifier and the row cap (n_samples) is sent as a bound parameter.
        query = sql.SQL("SELECT * FROM ctgov.{} LIMIT %s").format(sql.Identifier(table))
        cursor.execute(query, (n_samples,))
        result = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(result, columns=column_names)

        # 2. Only keep tables that have a non-empty nct_id column with one row per
        #    nct_id; those can be indexed (and later aligned) by nct_id.
        if len(df) == 0 or "nct_id" not in df.columns or df["nct_id"].nunique() != len(df):
            continue
        df = df.set_index('nct_id').drop(columns='id', errors='ignore')
        # Generic "name"/"names" columns are renamed after their table so they
        # stay distinguishable once all tables are combined.
        df = df.rename(columns={"name": table, "names": table})

        # 3. Check for useful columns
        column_groups = _classify_columns(df)
        all_tables_dict[table] = column_groups
        useful_columns = (column_groups['categorical_columns']
                          + column_groups['numerical_columns']
                          + column_groups['text_columns'])
        df_list.append(df[useful_columns])
finally:
    # Dont forget to close the connection - also when a query above failed
    cursor.close()
    conn.close()

# Aggregate and clean rows
non_empty_frames = [add_df for add_df in df_list if len(add_df) > 0]
if non_empty_frames:
    # Column-wise concat aligns the tables on their shared nct_id index
    all_tables_df = pd.concat(non_empty_frames, axis=1)
    # Keep only rows that have values in at least a fifth of the columns
    all_tables_df = all_tables_df.dropna(axis=0, thresh=int(all_tables_df.shape[1] / 5))
else:
    # No table qualified; pd.concat would raise on an empty list
    all_tables_df = pd.DataFrame()
