In [1]:
import warnings
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import scipy.sparse as sp
import matplotlib.pyplot as plt
import numpy as np
import requests
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import spacy

# Fetch the NLTK resources used by the tokenizer, stopword filter and lemmatizer.
for corpus_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(corpus_name)

# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Small English spaCy pipeline, loaded once for the notebook.
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Endpoint queried on every page request
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Accumulates one dict per study across all pages
data_list = []

# Query-string parameters; starts empty and later carries the paging token
params = {}

In [5]:
# Seconds to wait for a single page before giving up, so one stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 60


def _extract_study_record(study):
    """Flatten one study payload into the flat dict stored in ``data_list``.

    Parameters
    ----------
    study : dict
        One element of the ``studies`` list in the API response.

    Returns
    -------
    dict
        Maps the DataFrame column names to strings. Every nested lookup uses
        ``.get`` with a placeholder default, so a study that lacks a module
        yields placeholder text instead of raising ``KeyError``.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    organization = identification.get('organization', {})
    status = protocol.get('statusModule', {})
    sponsor = protocol.get('sponsorCollaboratorsModule', {})

    # The previous guard compared the responsibleParty object to the string
    # 'SPONSOR'; that never matched, so the value was either undefined or
    # carried over from an earlier study. Read the nested type unconditionally.
    responsibleParty = sponsor.get('responsibleParty', {}).get('type', 'Unknown')

    if 'conditionsModule' in protocol:
        conditions = ', '.join(protocol['conditionsModule'].get('conditions', ['Unknown']))
    else:
        conditions = 'No conditions listed'

    # Intervention names and descriptions come from the same list; fetch it once.
    interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    if interventions_list:
        interventions = ', '.join([item.get('name', 'Unknown') for item in interventions_list])
        interventionDesc = ', '.join([item.get('description', 'Unknown') for item in interventions_list])
    else:
        interventions = "Unknown"
        interventionDesc = "Unknown"

    # Study type, phases and primary purpose all live under designModule.
    if 'designModule' in protocol:
        design = protocol['designModule']
        studyType = design.get('studyType', 'Unknown')
        phases = ', '.join(design.get('phases', ['Unknown']))
        primaryPurpose = design.get('designInfo', {}).get('primaryPurpose', 'Unknown')
    else:
        studyType = 'Unknown'
        phases = 'No phases listed'
        primaryPurpose = 'Unknown'

    outcome_list = protocol.get('outcomesModule', {}).get('primaryOutcomes', [])
    if outcome_list:
        outcomes = ' '.join([outcome.get('measure', 'Unknown') for outcome in outcome_list])
    else:
        outcomes = "Unknown"

    medSubHead_list = study.get('derivedSection', {}).get('conditionBrowseModule', {}).get('meshes', [])
    if medSubHead_list:
        medSubHeads = ' '.join([f"{mesh.get('term', 'Unknown')}" for mesh in medSubHead_list])
    else:
        medSubHeads = "Unknown"

    # Joins to an empty string when no standard ages are listed.
    standardAge = ' '.join(protocol.get('eligibilityModule', {}).get('stdAges', []))

    return {
        "Organization Full Name": organization.get('fullName', 'Unknown'),
        "Organization Class": organization.get('class', 'Unknown'),
        "Responsible Party": responsibleParty,
        "Brief Title": identification.get('briefTitle', 'Unknown'),
        "Full Title": identification.get('officialTitle', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown'),
        "Standard Age": standardAge,
        "Conditions": conditions,
        "Primary Purpose": primaryPurpose,
        "Interventions": interventions,
        "Intervention Description": interventionDesc,
        "Study Type": studyType,
        "Phases": phases,
        "Outcome Measure": outcomes,
        "Medical Subject Headings": medSubHeads
    }


# Page through the API until the response carries no nextPageToken.
# NOTE: `params` and `data_list` are mutated here; re-running this cell without
# re-running the cell that defines them continues from the stored pageToken and
# appends to the rows already collected.
while True:
    # Print the current URL (for debugging purposes)
    print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

    # Send a GET request to the API; treat network errors like a failed status.
    try:
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print("Failed to fetch data. Request error:", exc)
        break

    if response.status_code != 200:
        print("Failed to fetch data. Status code:", response.status_code)
        break

    data = response.json()  # Parse JSON response
    studies = data.get('studies', [])  # Extract the list of studies

    for study in studies:
        data_list.append(_extract_study_record(study))

    # Check for nextPageToken and update the params or stop paging
    nextPageToken = data.get('nextPageToken')
    if not nextPageToken:
        break
    params['pageToken'] = nextPageToken  # Set the pageToken for the next request


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV117JOEkvg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV157paClPEg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV167JqHlPEg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV195pWClvEg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV1955SCkvg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV1-75CGm_Eg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV146pKAkfEg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV157pGClfEg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV147ZuDmvEg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV195pqFlfEg
Fetching data from: https://clinicaltrials.gov/api/v2/studies?pageToken=KV195pSOlfEg
Fe

In [6]:

# Build the working DataFrame from the list of per-study dicts in data_list.
clin_trials = pd.DataFrame(data_list)

In [7]:
# Remove the column cap so wide frames render every column (global pandas option).
pd.set_option('display.max_columns', None)
# Number of rows in the frame.
print(len(clin_trials))
# Show the first row as a sanity check of the extracted fields.
clin_trials.head(1)

496615


Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Medical Subject Headings
0,Montefiore Medical Center,OTHER,SPONSOR,Kinesiotape for Edema After Bilateral Total Kn...,"Effect of Kinesiotaping on Edema Management, P...",COMPLETED,2021-10-18,ADULT OLDER_ADULT,"Arthroplasty Complications, Arthroplasty, Repl...",TREATMENT,Kinesio(R)Tape for edema control,"Kinesio(R)Tape is an elastic, cotton tape with...",INTERVENTIONAL,,Change from baseline and during 1-2-day time i...,Edema


In [None]:
# Per-column summary statistics.
clin_trials.describe()

In [None]:
# Column dtypes and non-null counts.
clin_trials.info()

In [None]:
# NOTE(review): identical to the previous cell; looks like a leftover duplicate.
clin_trials.info()

In [None]:
# Frequency of each organization class.
clin_trials['Organization Class'].value_counts()

In [None]:
# Frequency of each responsible-party value.
clin_trials['Responsible Party'].value_counts()

In [None]:
# Drop the 'Responsible Party' column before building the combined text.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
clin_trials = clin_trials.drop(columns=['Responsible Party'], errors='ignore')
clin_trials.head(1)

In [None]:
# Frequency of each primary-purpose value.
clin_trials['Primary Purpose'].value_counts()

In [None]:
# Frequency of each distinct intervention-description string.
clin_trials['Intervention Description'].value_counts()

In [None]:
# Frequency of each distinct outcome-measure string.
clin_trials['Outcome Measure'].value_counts()

In [None]:
# Column names currently in the frame.
clin_trials.columns

In [None]:
# Every column except the organization name feeds the combined text field.
combined_prep_cols = clin_trials.columns.drop('Organization Full Name')
combined_prep_cols

In [None]:
# Concatenate the selected text columns row by row, separated by single spaces.
cols = list(combined_prep_cols)
clin_trials['combined_text'] = clin_trials[cols].agg(' '.join, axis=1)
clin_trials.head(1)

In [None]:
# Keep only the organization name and the combined text; the source columns
# are discarded from clin_trials from this point on.
clin_trials = clin_trials[['Organization Full Name', 'combined_text']]
clin_trials.head(1)


In [None]:
# Lazily-filled cache of the objects preprocess_text needs on every call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build the stopword set, lemmatizer and punctuation table once and reuse them."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives O(1) membership tests; the original re-read the
        # stopword list for every single token.
        _PREPROCESS_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        # string.punctuation already contains '|', so no extra character is needed.
        _PREPROCESS_RESOURCES['punct_table'] = str.maketrans('', '', string.punctuation)
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalize a text string for vectorization.

    Steps, in order: strip punctuation, lowercase, tokenize with
    ``word_tokenize``, drop English stopwords, lemmatize each token with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    resources = _get_preprocess_resources()
    stop_set = resources['stopwords']
    lemmatizer = resources['lemmatizer']

    # Remove punctuation (including '|') and lowercase
    text = text.translate(resources['punct_table']).lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_set]
    return ' '.join(tokens)

In [None]:
# Run preprocess_text on every row; the raw combined text is overwritten.
clin_trials['combined_text'] = clin_trials['combined_text'].apply(preprocess_text)
clin_trials.head(1)

In [None]:
# Row count before rows are grouped by organization.
len(clin_trials)

In [None]:
# Number of rows per organization, stored in an 'entry_count' column.
sponsor_counts = clin_trials.groupby('Organization Full Name').size().reset_index(name='entry_count')
sponsor_counts.head()


In [None]:
# Collapse to one row per organization by joining all of its texts with spaces.
text_by_org = clin_trials.groupby('Organization Full Name')['combined_text']
clin_trials = text_by_org.apply(' '.join).reset_index()
clin_trials.head(1)


In [None]:
# Row count after grouping (one row per organization).
len(clin_trials)

In [None]:
# Attach each organization's entry_count to its combined text.
# Any entry_count column left by a previous run of this cell is dropped first;
# otherwise the merge would emit entry_count_x / entry_count_y and the later
# clin_trials['entry_count'] lookup would fail.
clin_trials = pd.merge(
    clin_trials.drop(columns=['entry_count'], errors='ignore'),
    sponsor_counts,
    on='Organization Full Name',
    how='outer',
)
clin_trials.head(1)

In [None]:
# Cap the vocabulary at the number of rows divided by 1.25, truncated to an integer.
max_features = int(len(clin_trials) / 1.25)
print(max_features)
vectorizer = TfidfVectorizer(max_features=max_features)

In [None]:
# Learn the vocabulary and compute the TF-IDF matrix (one row per organization).
tfidf_vectors = vectorizer.fit_transform(clin_trials['combined_text'])

In [None]:
# tfidf_vectors is already sparse, so take it as CSR directly. The previous
# version expanded it into a dense DataFrame and converted it straight back,
# allocating a full rows x features array for no benefit.
tfidf_df = sp.csr_matrix(tfidf_vectors)

In [None]:
# Scale the per-organization entry counts to [0, 1] and prepend them as the
# first feature column of the TF-IDF matrix.
count_array = clin_trials['entry_count'].values.reshape(-1, 1)
scaler_count = MinMaxScaler()
scaled_count = sp.csr_matrix(scaler_count.fit_transform(count_array))
# Stack onto tfidf_vectors (not tfidf_df) so re-running this cell does not
# keep adding another count column to an already-stacked matrix.
tfidf_df = hstack([scaled_count, tfidf_vectors], format='csr')

In [None]:
# The preceding cell already ends with this stacking step; stacking onto
# tfidf_df again would insert the scaled count column a second time. Rebuild
# from tfidf_vectors so the feature matrix holds exactly one count column.
tfidf_df = hstack([scaled_count, tfidf_vectors], format='csr')

In [None]:
# Minimum gain in cumulative explained variance needed to keep adding components.
threshold = .01

# Candidate component counts: 1 up to the smaller dimension of the feature matrix.
components_range = range(1, min(tfidf_df.shape) + 1)
print(components_range)

In [None]:
# Grow the number of SVD components until one more component adds no more than
# `threshold` to the cumulative explained variance ratio.
explained_variance_ratio = []
fin_component = 0
for n_components in components_range:
    # Fixed seed so the selected component count is reproducible across runs.
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    svd.fit(tfidf_df)
    explained_variance_ratio.append(svd.explained_variance_ratio_.sum())
    # Track the latest count on every pass. If the loop runs out of candidates
    # without meeting the stop condition, fin_component is then the last count
    # tried rather than 0, which TruncatedSVD would reject.
    fin_component = n_components
    if n_components > 1:
        gain = explained_variance_ratio[-1] - explained_variance_ratio[-2]
        if not (gain > threshold):
            break

In [None]:
# Display the component count chosen by the selection loop.
fin_component

# This plot is for review purposes. The code automatically chooses the number of components, but the user should review this modified elbow graph for extreme oddities. The threshold can be adjusted quickly if the chosen cut-off looks wrong.

In [None]:
# Entry i of explained_variance_ratio was measured with i + 1 components, so
# the x-axis must start at 1. range(fin_component) started at 0 and shifted
# every point one component to the left. Deriving the axis from the list
# length also keeps x and y the same size.
component_counts = range(1, len(explained_variance_ratio) + 1)
plt.plot(component_counts, explained_variance_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Modified Elbow Method for Optimal Number of Components')
plt.grid(True)
plt.show()

In [None]:
# Project the feature matrix onto the selected number of components.
# Seeded so the reduced matrix, and the clustering built on it, is reproducible.
svd = TruncatedSVD(n_components=fin_component, random_state=42)
svd_matrix = svd.fit_transform(tfidf_df)

In [None]:
# Score a K-Means clustering of the reduced matrix for every candidate cluster
# count from 2 up to fin_component (inclusive).
silhouette_scores = []
max_clusters = int(round(fin_component)) + 1

for n_clusters in range(2, max_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(svd_matrix)
    silhouette_avg = silhouette_score(svd_matrix, cluster_labels)
    silhouette_scores.append(silhouette_avg)


In [None]:
# Highest average silhouette score among the candidates.
# NOTE(review): only the score is shown; entry i of silhouette_scores
# corresponds to n_clusters = i + 2, so the best cluster count is not reported.
max(silhouette_scores)