# In [10]:
# Import libraries
import pandas as pd
import numpy as np
import os
import re
import ast
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

# In [6]:
# Collection of helper functions iterated from a previous group's work. Changes have been made for clarity and performance optimization.
def convert_to_datetime(date_str):
    """Parse a ``YYYY-MM-DD`` or ``YYYY-MM`` date string into a Timestamp.

    The value is first tried as a full date. If that fails, ``-01`` is
    appended so a month-only value resolves to the first day of that month.
    When neither attempt parses, ``pd.NaT`` is returned.
    """
    day_format = '%Y-%m-%d'

    try:
        return pd.to_datetime(date_str, format=day_format)
    except ValueError:
        pass  # fall through to the month-only interpretation

    try:
        return pd.to_datetime(date_str + '-01', format=day_format)
    except ValueError:
        return pd.NaT
        
def count_loc(value):
    """Count the ", "-separated fragments of *value* that contain "facility".

    Returns NaN when *value* is missing, otherwise an integer count.
    """
    if pd.isna(value):
        return np.nan

    fragments = value.split(", ")
    # Summing booleans counts the fragments that mention a facility.
    return sum("facility" in fragment for fragment in fragments)
    
def trial_loc(value):
    """Classify a trial's sites as 'USA', 'non-USA' or 'USA & non-USA'.

    *value* is the locations field as text. It is split on ", " and every
    fragment containing "country" is treated as one site's country entry.

    Returns NaN when *value* is missing. A trial with no country fragments
    at all is reported as 'non-USA'.
    """
    if pd.isna(value):
        return np.nan

    country_parts = [part for part in value.split(", ") if "country" in part]
    us_flags = ["United States" in part for part in country_parts]

    if not any(us_flags):
        return "non-USA"

    # Decide per fragment instead of comparing fragments for equality: two
    # US entries can differ textually (e.g. a trailing "}" when "country" is
    # the last key of a record), which previously mislabelled an all-US trial
    # as "USA & non-USA".
    if all(us_flags):
        return "USA"
    return "USA & non-USA"
    
def extract_measures(outcomes):
    """Return the 'measure' text of every outcome in *outcomes*.

    Each outcome is read with ``.get``; one without a 'measure' key
    contributes a placeholder. Input that is not a list yields a
    one-element list holding that same placeholder.
    """
    placeholder = 'No description available'
    if not isinstance(outcomes, list):
        return [placeholder]

    measures = []
    for outcome in outcomes:
        measures.append(outcome.get('measure', placeholder))
    return measures


def extract_timeframes(outcomes):
    """Return the 'timeFrame' text of every outcome in *outcomes*.

    Each outcome is read with ``.get``; one without a 'timeFrame' key
    contributes a placeholder. Input that is not a list yields a
    one-element list holding that same placeholder.
    """
    placeholder = 'No description available'
    if not isinstance(outcomes, list):
        return [placeholder]

    frames = []
    for outcome in outcomes:
        frames.append(outcome.get('timeFrame', placeholder))
    return frames

def extract_time_length_from_list(timeFrames):
    """Collect every (amount, unit) duration mentioned in a list of texts.

    Three spellings are recognised, case-insensitively:
      * digits then unit      -- "12 months", "1.5 hours"
      * number word then unit -- "two years" (words "one" .. "twenty")
      * unit then digits      -- "Day 28"

    Falsy entries of *timeFrames* are skipped. Digit amounts are returned as
    floats, number words as ints; the unit is lower-cased and keeps a plural
    "s" when the text had one.
    """
    word_values = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
        "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
        "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18,
        "nineteen": 19, "twenty": 20
    }

    number_pattern = r"(\d+(\.\d+)?)"
    written_number_pattern = r"(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty)"
    unit_pattern = r"(hour|day|week|month|year|minute)s?"
    reversed_pattern = rf"(?P<unit3>{unit_pattern})\s+(?P<number2>\d+(\.\d+)?)"

    matcher = re.compile(
        rf"""
        (?:
            (?P<number>{number_pattern})\s*(?P<unit1>{unit_pattern}) |
            (?P<written>{written_number_pattern})\s*(?P<unit2>{unit_pattern}) |
            {reversed_pattern}
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    durations = []
    for text in timeFrames:
        if not text:
            continue
        for hit in matcher.finditer(text):
            # Exactly one alternative took part in the match; the named
            # groups tell which one.
            if hit.group('number'):
                amount = float(hit.group('number'))
                unit = hit.group('unit1')
            elif hit.group('written'):
                amount = word_values[hit.group('written').lower()]
                unit = hit.group('unit2')
            else:
                amount = float(hit.group('number2'))
                unit = hit.group('unit3')
            durations.append((amount, unit.lower()))
    return durations

def convert_to_days(amount, unit):
    """Convert *amount* of *unit* into days.

    Minutes and hours are divided down and rounded up with ``np.ceil``.
    Weeks, months and years are multiplied by 7, 30 and 365. Days are
    returned unchanged. An unrecognised unit gives 0.
    """
    divide_and_round_up = {
        'minute': 1440, 'minutes': 1440,
        'hour': 24, 'hours': 24,
    }
    multiply = {
        'week': 7, 'weeks': 7,
        'month': 30, 'months': 30,
        'year': 365, 'years': 365,
    }

    if unit in divide_and_round_up:
        return np.ceil(amount / divide_and_round_up[unit])
    if unit in multiply:
        return amount * multiply[unit]
    if unit in ('day', 'days'):
        return amount
    return 0
    
def find_max_duration(durations):
    """Return the longest of the (amount, unit) pairs in *durations*, in days.

    Each pair is converted with ``convert_to_days``. An empty or otherwise
    falsy *durations* gives NaN.
    """
    if durations:
        return max(convert_to_days(quantity, unit) for quantity, unit in durations)
    return np.nan

def remove_special_chars(col):
    """Return *col* with each of the characters ``[ ] ' ,`` replaced by "_"."""
    # Build the table from two equal-length strings: every listed character
    # maps to a single underscore.
    underscore_table = str.maketrans("[]',", "____")
    return col.translate(underscore_table)

def count_criteria(criteria):
    """Count bullet-like markers in eligibility text.

    Returns ``(num_inclusion, num_exclusion)``. The text is lower-cased and
    split at the first "exclusion criteria"; markers before it count as
    inclusion, markers after it as exclusion. If that phrase is absent, all
    markers count as inclusion and the exclusion count is NaN. Missing input
    gives ``(NaN, NaN)``.

    A marker is any of: a newline, optional whitespace, one or two word
    characters and a period; an asterisk; a hyphen.
    NOTE(review): the hyphen alternative is not anchored to a line start, so
    hyphens inside words (e.g. "follow-up") are counted too -- confirm that
    this is intended.
    """
    if pd.isna(criteria):
        return np.nan, np.nan

    pattern = r'\n\s*\w{1,2}\.|[*]|\-'

    def tally(text):
        return len(re.findall(pattern, text))

    before, marker, after = criteria.lower().partition("exclusion criteria")
    if not marker:
        # No exclusion section: count over the text as given.
        return tally(criteria), np.nan
    return tally(before), tally(after)

def conditions_map(condition):
    """Map a condition description to a coarse category label.

    The text is lower-cased and checked against each category's keywords in
    the order listed below. The first category with a keyword occurring as a
    substring wins, and 'other' is returned when nothing matches.
    """
    text = condition.lower()

    # Order matters: earlier categories take precedence over later ones.
    keyword_groups = (
        ('squamous cell', ('cell lung', 'head and neck', 'squamous cell', 'small cell', 'lung', 'keratosis', 'nsclc')),
        ('myeloma', ('myeloma',)),
        ('sarcoma', ('sarcoma',)),
        ('lymphoma', ('lymphoma', 'lymphoid')),
        ('brain', ('brain cancer', 'glioblastoma')),
        ('melanoma', ('melanoma',)),
        ('adeno', ('adenocarcinoma', 'prostate cancer', 'colorectal', 'kidney', 'renal', 'cervix', 'cervical', 'liver', 'hepatic', 'hepatocellular', 'thyroid')),
        ('ductal', ('breast',)),
        ('leukemia', ('leukemia', 'hematopoietic')),
        ('pain', ('pain',)),
        ('carcinoma', ('carcinoma',)),
    )

    for label, needles in keyword_groups:
        for needle in needles:
            if needle in text:
                return label
    return 'other'

def list_to_lower_string(lst):
    """Return the items of *lst* as one lower-cased, ", "-joined string.

    Any iterable is accepted and each item is converted with ``str``. A
    plain string is lower-cased as a whole: it is iterable too, and joining
    it would insert ", " between every character. Input that is not
    iterable (e.g. NaN) gives "".
    """
    if isinstance(lst, str):
        return lst.lower()
    if hasattr(lst, '__iter__'):
        return ", ".join(map(str, lst)).lower()
    return ""



def drop_outliers(df, threshold=5):
    """Return the rows of *df* in which no value is an outlier.

    A value is an outlier when its absolute z-score, computed against its
    own column's mean and standard deviation, exceeds *threshold*.
    """
    centered = df - df.mean()
    abs_z = (centered / df.std()).abs()

    # A row is discarded as soon as one of its columns is beyond the threshold.
    has_outlier = (abs_z > threshold).any(axis=1)
    return df[~has_outlier]



# In [None]:
# Start data processing
# Load the studies CSV into the module-level frame `df`. The path is absolute
# and specific to one machine.
# NOTE(review): presumably a ClinicalTrials.gov export, judging by the file
# name and the `protocolSection_*` columns used below -- confirm.
df = pd.read_csv(r'C:\Users\Spenc\OneDrive\Documents\Masters Thesis\ctg-studies.csv')
# Number of rows in the raw frame, taken before any processing step runs.
rows0 = len(df)

def process_study_duration(df):
    """Add parsed date columns and study durations (in days) to *df*.

    The three status-module date columns are converted with
    ``convert_to_datetime``. Both durations are measured from 'start_date'.
    *df* is modified in place and also returned.
    """
    date_sources = {
        'start_date': 'protocolSection_statusModule_startDateStruct_date',
        'primary_completion_date': 'protocolSection_statusModule_primaryCompletionDateStruct_date',
        'completion_date': 'protocolSection_statusModule_completionDateStruct_date',
    }
    for target, source in date_sources.items():
        df[target] = df[source].apply(convert_to_datetime)

    started = df['start_date']
    df['primary_study_duration_days'] = (df['primary_completion_date'] - started).dt.days
    df['study_duration_days'] = (df['completion_date'] - started).dt.days

    return df

def clean_and_report_missing(df):
    """Drop rows lacking any of the columns required for prediction.

    Returns ``(cleaned_copy, dropped_row_count)`` and prints how many rows
    were removed. *df* itself is left untouched.
    """
    need_cols = [
        'primary_study_duration_days',
        'study_duration_days',
        'protocolSection_designModule_enrollmentInfo_count']

    # Keep only the rows where every required column is present.
    complete_rows = df[need_cols].notna().all(axis=1)
    df_cleaned = df[complete_rows].copy()

    dropped_rows = len(df) - len(df_cleaned)
    print(f"{dropped_rows} rows were dropped due to missing values in one of: {need_cols}")

    return df_cleaned, dropped_rows

def create_bins_and_count_criteria(df, n_intervals=5):
    """Add quantile-bin codes and eligibility-criteria counts to *df*.

    Parameters:
        df: frame holding 'study_duration_days',
            'primary_study_duration_days' and the eligibility-criteria text.
        n_intervals: number of equal-frequency bins passed to ``pd.qcut``.

    Columns written (in place):
        study_eq_bins / study_eq_labels      -- integer bin code per row
        primary_eq_bins / primary_eq_labels  -- same for the primary duration
        num_inclusion / num_exclusion        -- from ``count_criteria``

    The "*_bins" and "*_labels" columns hold the same integer codes, exactly
    as before. Returns *df*.
    """
    study_days = df['study_duration_days']
    primary_days = df['primary_study_duration_days']

    # One qcut per duration column; the codes are reused for both the
    # "*_bins" and "*_labels" columns instead of being recomputed.
    study_codes = pd.qcut(study_days, q=n_intervals, labels=False)
    primary_codes = pd.qcut(primary_days, q=n_intervals, labels=False)

    df['study_eq_bins'] = study_codes
    df['primary_eq_bins'] = primary_codes
    df['study_eq_labels'] = study_codes
    df['primary_eq_labels'] = primary_codes

    # Report the actual interval behind each code. Previously the dict was
    # built from two identical code columns, so it mapped every code to
    # itself and showed no interval bounds at all.
    study_intervals = pd.qcut(study_days, q=n_intervals).cat.categories
    bins_dict = {label: str(interval) for label, interval in enumerate(study_intervals)}
    msg = f"Bin labels and their corresponding intervals for study duration are: {bins_dict}"
    print(msg)

    # Extract number of inclusion and exclusion criteria
    df[['num_inclusion', 'num_exclusion']] = df['protocolSection_eligibilityModule_eligibilityCriteria'].apply(count_criteria).apply(pd.Series)

    return df

def map_sponsor_type(df):
    """Add 'sponsor_type_class' and a numeric 'sponsor_type' to *df*.

    Several lead-sponsor classes are folded into 'OTHER'. The result is then
    encoded as INDUSTRY -> 1 and OTHER -> 0, and any other class becomes NaN
    in 'sponsor_type'. *df* is modified in place and returned.
    """
    folded_into_other = dict.fromkeys(
        ['OTHER_GOV', 'NETWORK', 'NIH', 'FED', 'INDIV'], 'OTHER'
    )
    numeric_code = {"INDUSTRY": 1, "OTHER": 0}

    lead_class = df['protocolSection_sponsorCollaboratorsModule_leadSponsor_class']
    df['sponsor_type_class'] = lead_class.replace(folded_into_other)
    df['sponsor_type'] = df['sponsor_type_class'].map(numeric_code)

    return df


def _parse_conditions_cell(cell):
    """Parse a string cell as a Python literal; return any other value unchanged."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


def process_conditions_and_model(df):
    """Add 'number_of_conditions' and a numeric 'intervention_model' to *df*.

    The conditions column is parsed from its string form and written back.
    Only string cells are parsed, so a missing value or an already-parsed
    list (for example when the step is run twice on the same frame) no
    longer raises. A string that is not a valid literal still raises, as
    before.

    'number_of_conditions' is the length of the parsed value, or 0 when the
    cell has no length (e.g. NaN). The intervention model is encoded as
    SINGLE_GROUP -> 0, PARALLEL -> 1, and CROSSOVER / SEQUENTIAL / FACTORIAL /
    OTHER -> 2; anything else becomes NaN.

    *df* is modified in place and returned.
    """
    conditions_col = 'protocolSection_conditionsModule_conditions'

    # Number of conditions, integer
    df[conditions_col] = df[conditions_col].apply(_parse_conditions_cell)
    df['number_of_conditions'] = df[conditions_col].apply(
        lambda x: len(x) if hasattr(x, '__len__') else 0
    )

    # Intervention model, categorical (mapped to int/float)
    intervention_model_map = {
        "CROSSOVER": "OTHER",
        "SEQUENTIAL": "OTHER",
        "FACTORIAL": "OTHER"
    }

    intervention_model_map2 = {
        "SINGLE_GROUP": 0,
        "PARALLEL": 1,
        "OTHER": 2
    }

    # Apply mappings for intervention model
    df['intervention_model_mapped'] = df['protocolSection_designModule_designInfo_interventionModel'].replace(intervention_model_map)
    df['intervention_model'] = df['intervention_model_mapped'].map(intervention_model_map2)

    # Drop intermediate column
    df.drop(columns=['intervention_model_mapped'], inplace=True)

    return df

def _parse_design_cell(cell, strict=True):
    """Parse a string cell as a Python literal; return other values unchanged.

    With ``strict=False`` a string that is not a valid literal is returned
    as-is instead of raising.
    """
    if not isinstance(cell, str):
        return cell
    try:
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        if strict:
            raise
        return cell


def _intervention_types(interventions):
    """Return the distinct type names in one parsed interventions cell, in first-seen order."""
    if not isinstance(interventions, (list, tuple)):
        return []
    kinds = []
    for entry in interventions:
        # An entry may be a record carrying a 'type' key or a bare type name.
        kind = entry.get('type') if isinstance(entry, dict) else entry
        if isinstance(kind, str) and kind not in kinds:
            kinds.append(kind)
    return kinds


def process_study_design(df):
    """Add purpose flags, intervention-type flags and group counts to *df*.

    Columns written (in place):
        *_purpose                     -- 1 if the keyword occurs in the primary purpose
        *_intervention                -- 1 if the trial has an intervention of that type
        number_of_groups              -- length of the arm-groups list, 0 if not a list
        intervention_types            -- list of distinct type names (only if absent)
        number_of_intervention_types  -- length of 'intervention_types'

    Fixes relative to the earlier version:
      * 'intervention_types' was read but never created, so the function
        raised KeyError; it is now derived when the column is missing.
      * The type flags tested ``'DRUG' in x`` on the parsed list, which can
        never match when the entries are records. Types are now taken from
        each record's 'type' key, and bare-string entries still work.
        NOTE(review): the 'type' key is assumed from the ClinicalTrials.gov
        study schema -- confirm against the CSV.
      * Arm groups were never parsed from their string form, so every row
        counted 0 groups. They are now parsed leniently for counting; the
        column itself is left as it was.
      * Only string cells are parsed, so missing values and already-parsed
        frames no longer raise.

    Returns *df*.
    """
    purpose_col = 'protocolSection_designModule_designInfo_primaryPurpose'
    interventions_col = 'protocolSection_armsInterventionsModule_interventions'
    arm_groups_col = 'protocolSection_armsInterventionsModule_armGroups'

    # Primary purpose, bool/int
    df[purpose_col] = df[purpose_col].fillna('')
    purpose_flags = {
        'treatment_purpose': 'TREATMENT',
        'diagnostic_purpose': 'DIAGNOSTIC',
        'prevention_purpose': 'PREVENTION',
        'supportive_purpose': 'SUPPORTIVE_CARE',
    }
    for column, keyword in purpose_flags.items():
        # kw=keyword binds the current keyword; a bare closure would see the last one.
        df[column] = df[purpose_col].apply(lambda text, kw=keyword: 1 if kw in text else 0)

    # Intervention type, bool/int
    df[interventions_col] = df[interventions_col].apply(_parse_design_cell)
    types = df[interventions_col].apply(_intervention_types)
    intervention_flags = {
        'procedure_intervention': 'PROCEDURE',
        'device_intervention': 'DEVICE',
        'behavioral_intervention': 'BEHAVIORAL',
        'drug_intervention': 'DRUG',
        'radiation_intervention': 'RADIATION',
        'biological_intervention': 'BIOLOGICAL',
    }
    for column, keyword in intervention_flags.items():
        df[column] = types.apply(lambda kinds, kw=keyword: 1 if kw in kinds else 0)

    # Number of groups, int
    arm_groups = df[arm_groups_col].apply(lambda cell: _parse_design_cell(cell, strict=False))
    df['number_of_groups'] = arm_groups.apply(lambda x: len(x) if isinstance(x, list) else 0)

    # Number of intervention types, int
    if 'intervention_types' not in df.columns:
        df['intervention_types'] = types
    df['number_of_intervention_types'] = df['intervention_types'].apply(len)

    return df

def map_age_groups(df):
    """Add 'age_group0' (label) and 'age_group' (numeric code) to *df*.

    The standard-ages cell is matched as an exact string against the known
    combinations below; anything else becomes NaN in both columns. Codes are
    youth -> 0, adult -> 1, all -> 2. *df* is modified in place and returned.
    """
    combos_by_label = {
        "adult": ["['ADULT', 'OLDER_ADULT']", "['ADULT']", "['OLDER_ADULT']"],
        "youth": ["['CHILD']", "['CHILD', 'ADULT']"],
        "all": ["['CHILD', 'ADULT', 'OLDER_ADULT']"],
    }
    # Invert to a cell-text -> label lookup.
    label_for_combo = {
        combo: label
        for label, combos in combos_by_label.items()
        for combo in combos
    }
    code_for_label = {"youth": 0, "adult": 1, "all": 2}

    df["age_group0"] = df["protocolSection_eligibilityModule_stdAges"].map(label_for_combo)
    df["age_group"] = df["age_group0"].map(code_for_label)

    return df

def process_location_data(df):
    """Add site-count and site-region columns to *df*.

    'num_locations' comes from ``count_loc`` and 'location0' from
    ``trial_loc``, both applied to the locations column. 'location' encodes
    'location0' as USA -> 0, non-USA -> 1, USA & non-USA -> 2.
    *df* is modified in place and returned.
    """
    locations = df["protocolSection_contactsLocationsModule_locations"]

    df["num_locations"] = locations.apply(count_loc)
    df['location0'] = locations.apply(trial_loc)

    region_code = {"USA": 0, "non-USA": 1, "USA & non-USA": 2}
    df['location'] = df['location0'].map(region_code)

    return df

def _outcomes_as_list(cell):
    """Return a stringified outcomes cell as a list of dicts when it parses to one.

    Any other value, including text that is not a valid literal or does not
    parse to a list of dicts, is returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell
    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return parsed
    return cell


def process_outcome_measures(df):
    """Add outcome-measure flags and maximum time frames to *df*.

    Columns written to *df* in place:
        outcome_measures      -- lower-cased primary + secondary outcome text
        os_/ae_/dor_outcome_measure -- 1 if the keyword occurs in that text
        primary_* / secondary_* -- measure list, timeFrame list, parsed
                                   durations and the longest duration in days

    Fixes relative to the earlier version:
      * The final drop named two columns that are never created and raised
        KeyError; absent columns are now ignored.
      * A missing primary or secondary cell turned the combined text into
        "nan"; each side is now blanked before concatenation.
      * The extractors only accept lists, but cells read from CSV are
        strings, so every row fell back to the placeholder. Cells that parse
        to a list of dicts are now parsed before extraction.
        NOTE(review): assumes the cells are Python-literal lists of records,
        as the other parsed columns in this file are -- confirm.

    Returns the frame produced by the final ``drop`` (a new object).
    """
    primary_col = 'protocolSection_outcomesModule_primaryOutcomes'
    secondary_col = 'protocolSection_outcomesModule_secondaryOutcomes'

    # Combine outcome measures
    combined = df[primary_col].fillna('').astype('str') + df[secondary_col].fillna('').astype('str')
    df['outcome_measures'] = combined.str.lower()

    # OS outcome measure
    df['os_outcome_measure'] = df['outcome_measures'].apply(lambda x: 1 if ' os ' in x or 'overall survival' in x else 0)

    # Adverse event outcome measure
    df['ae_outcome_measure'] = df['outcome_measures'].apply(lambda x: 1 if 'adverse event' in x else 0)

    # DOR outcome measure
    df['dor_outcome_measure'] = df['outcome_measures'].apply(lambda x: 1 if ' dor ' in x or 'duration of response' in x else 0)

    # Max timeframe from primary, then secondary, outcome measures
    for prefix, column in (('primary', primary_col), ('secondary', secondary_col)):
        outcomes = df[column].apply(_outcomes_as_list)
        df[f'{prefix}_measure'] = outcomes.apply(extract_measures)
        df[f'{prefix}_timeFrame'] = outcomes.apply(extract_timeframes)
        df[f'{prefix}_duration'] = df[f'{prefix}_timeFrame'].apply(extract_time_length_from_list)
        df[f'{prefix}_max_days'] = df[f'{prefix}_duration'].apply(find_max_duration)

    # Drop intermediate columns if an earlier step left them behind
    df = df.drop(columns=['os_outcome_measure2', 'dor_outcome_measure2'], errors='ignore')

    return df

def process_oversight_and_responsible_party(df):
    """Add numeric 'has_dmc' and 'resp_party' columns to *df*.

    The has-DMC column is overwritten with its lower-cased string form, then
    encoded as 'true' -> 1 and 'false' -> 0. The responsible-party type is
    encoded as PRINCIPAL_INVESTIGATOR -> 0, SPONSOR -> 1,
    SPONSOR_INVESTIGATOR -> 2. Unmapped values become NaN.
    *df* is modified in place and returned.
    """
    dmc_col = 'protocolSection_oversightModule_oversightHasDmc'
    party_col = 'protocolSection_sponsorCollaboratorsModule_responsibleParty_type'

    lowered = df[dmc_col].astype(str).str.lower()
    df[dmc_col] = lowered
    df['has_dmc'] = df[dmc_col].map({'true': 1, 'false': 0})

    party_code = {
        "PRINCIPAL_INVESTIGATOR": 0,
        "SPONSOR": 1,
        "SPONSOR_INVESTIGATOR": 2,
    }
    df['resp_party'] = df[party_col].map(party_code)

    return df

# Ordered (category, keywords) table used by _categorize_condition. Earlier
# categories take precedence. It mirrors the keyword table of the
# `conditions_map` function defined earlier in this file.
_CONDITION_KEYWORDS = (
    ('squamous cell', ('cell lung', 'head and neck', 'squamous cell', 'small cell', 'lung', 'keratosis', 'nsclc')),
    ('myeloma', ('myeloma',)),
    ('sarcoma', ('sarcoma',)),
    ('lymphoma', ('lymphoma', 'lymphoid')),
    ('brain', ('brain cancer', 'glioblastoma')),
    ('melanoma', ('melanoma',)),
    ('adeno', ('adenocarcinoma', 'prostate cancer', 'colorectal', 'kidney', 'renal', 'cervix', 'cervical', 'liver', 'hepatic', 'hepatocellular', 'thyroid')),
    ('ductal', ('breast',)),
    ('leukemia', ('leukemia', 'hematopoietic')),
    ('pain', ('pain',)),
    ('carcinoma', ('carcinoma',)),
)


def _categorize_condition(condition):
    """Return the first category whose keyword occurs in *condition*, else 'other'."""
    text = condition.lower()
    for category, keywords in _CONDITION_KEYWORDS:
        if any(keyword in text for keyword in keywords):
            return category
    return 'other'


def process_design_and_conditions(df):
    """Add numeric allocation, masking and condition-category columns to *df*.

    Columns written (in place):
        allocation               -- NON_RANDOMIZED -> 0, RANDOMIZED -> 1
        masking                  -- NONE .. QUADRUPLE -> 0 .. 4
        conditions               -- conditions joined by ``list_to_lower_string``
        conditions_category      -- keyword-based category label
        conditions_category_num  -- integer code of that label

    Unmapped allocation or masking values become NaN.

    The category lookup uses the private helper above rather than calling
    ``conditions_map``: later in this file that name is rebound to a dict,
    so calling it from here raised "'dict' object is not callable".

    Returns *df*.
    """
    # Allocation mapping
    allo_map = {
        'NON_RANDOMIZED': 0,
        'RANDOMIZED': 1
    }
    df['allocation'] = df['protocolSection_designModule_designInfo_allocation'].map(allo_map)

    # Masking mapping
    mask_map = {
        "NONE": 0,
        "SINGLE": 1,
        "DOUBLE": 2,
        "TRIPLE": 3,
        "QUADRUPLE": 4
    }
    df['masking'] = df['protocolSection_designModule_designInfo_maskingInfo_masking'].map(mask_map)

    # Convert conditions to lowercase string and map to categories
    df['conditions'] = df['protocolSection_conditionsModule_conditions'].apply(list_to_lower_string)
    df['conditions_category'] = df['conditions'].apply(_categorize_condition)

    # Category mapping dictionary
    category_map = {
        'myeloma': 0, 'squamous cell': 1, 'adeno': 2, 'carcinoma': 3,
        'leukemia': 4, 'ductal': 5, 'sarcoma': 6, 'lymphoma': 7,
        'melanoma': 8, 'brain': 9, 'pain': 10, 'other': 11
    }
    df['conditions_category_num'] = df['conditions_category'].map(category_map)

    return df

# Per-category reference values, consolidated into a single dictionary.
# Keys are the category labels used for 'conditions_category'; each value
# holds three numbers that `apply_conditions_mapping` expands into columns
# named after the inner keys.
# NOTE(review): this assignment rebinds the module-level name
# `conditions_map`, which earlier in this file is a function. Once this
# statement has run, any call such as `conditions_map(x)` hits this dict.
# NOTE(review): the durations are presumably in days and the survival figure
# a fraction between 0 and 1 -- confirm the units and the source of the numbers.
conditions_map = {
    'myeloma': {
        'survival_5yr_relative': 0.598,
        'max_treatment_duration': 180,
        'min_treatment_duration': 90
    },
    'squamous cell': {
        'survival_5yr_relative': 0.99,
        'max_treatment_duration': 49,
        'min_treatment_duration': 14
    },
    'adeno': {
        'survival_5yr_relative': 0.175,
        'max_treatment_duration': 1080,
        'min_treatment_duration': 360
    },
    'carcinoma': {
        'survival_5yr_relative': 0.99,
        'max_treatment_duration': 1440,
        'min_treatment_duration': 360
    },
    'leukemia': {
        'survival_5yr_relative': 0.65,
        'max_treatment_duration': 1095,
        'min_treatment_duration': 730
    },
    'ductal': {
        'survival_5yr_relative': 0.99,
        'max_treatment_duration': 1825,
        'min_treatment_duration': 365
    },
    'sarcoma': {
        'survival_5yr_relative': 0.65,
        'max_treatment_duration': 1825,
        'min_treatment_duration': 240
    },
    'lymphoma': {
        'survival_5yr_relative': 0.83,
        'max_treatment_duration': 730,
        'min_treatment_duration': 180
    },
    'melanoma': {
        'survival_5yr_relative': 0.94,
        'max_treatment_duration': 730,
        'min_treatment_duration': 150
    },
    'brain': {
        'survival_5yr_relative': 0.326,
        'max_treatment_duration': 4320,
        'min_treatment_duration': 1080
    },
    # 'pain' and 'other' carry identical values.
    'pain': {
        'survival_5yr_relative': 0.68,
        'max_treatment_duration': 4320,
        'min_treatment_duration': 14
    },
    'other': {
        'survival_5yr_relative': 0.68,
        'max_treatment_duration': 4320,
        'min_treatment_duration': 14
    }
}

def apply_conditions_mapping(df, conditions_column, conditions_map):
    """Expand per-category reference values into columns of their own.

    Each value of *conditions_column* is looked up in *conditions_map*, and
    the keys of the resulting record become new columns. The looked-up
    records are first stored on *df* as 'conditions_category_info'; the
    returned frame is a new object without that intermediate column.
    """
    looked_up = df[conditions_column].map(conditions_map)
    df['conditions_category_info'] = looked_up

    # One column per key of the looked-up records.
    expanded = df['conditions_category_info'].apply(pd.Series)
    combined = pd.concat([df, expanded], axis=1)

    return combined.drop(columns=['conditions_category_info'])


def clean_and_save_data(df, output_dir=None):
    """Select model columns, clean them and write train/test CSV files.

    Steps, in order:
      1. Rename three raw columns to 'phase', 'enroll_count', 'healthy_vol'.
      2. Keep only the columns listed in ``cols``.
      3. Drop rows with a numeric value more than 5 standard deviations from
         its column mean.
      4. Fill missing values with the column mode, except in
         'primary_max_days' and 'secondary_max_days', which keep their NaNs.
      5. One-hot encode the remaining object columns (the nctId column is
         left as-is) and tidy the resulting column names.
      6. Split 70/30 with a fixed seed and write 'cleaned_data_train.csv'
         and 'cleaned_data_test.csv'.

    Parameters:
        df: fully processed studies frame.
        output_dir: directory for the two CSV files. Defaults to the
            original hard-coded location when omitted. The directory is
            created if it does not exist.

    Returns None.
    """
    if output_dir is None:
        output_dir = "C:\\Users\\Spenc\\OneDrive\\Documents\\Masters Thesis\\"

    def drop_outliers(df, threshold=5):
        # Calculate the mean and standard deviation for each column
        means = df.mean()
        stds = df.std()
        # Identify outliers
        outliers = (np.abs((df - means) / stds) > threshold)
        # Masked frame: non-outlier cells become NaN, so rows that are not
        # entirely NaN are exactly the rows holding at least one outlier
        dropped_values = df[outliers]
        # Drop the rows with outliers
        df_cleaned = df.drop(index=dropped_values.dropna(how='all').index)
        return df_cleaned

    def fill_missing_with_mode(df, columns):
        for column in columns:
            modes = df[column].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            df[column] = df[column].fillna(modes.iloc[0])
        return df

    def remove_special_chars(col_name):
        # Replace special characters with underscores
        return col_name.replace(' ', '_').replace('/', '_').replace('-', '_')

    # Rename columns of interest
    df = df.rename(columns={
        'protocolSection_designModule_phases': 'phase',
        'protocolSection_designModule_enrollmentInfo_count': 'enroll_count',
        'protocolSection_eligibilityModule_healthyVolunteers': 'healthy_vol'
    })

    # Select columns of interest
    cols = [
        'protocolSection_identificationModule_nctId',
        'primary_study_duration_days',
        'study_duration_days',
        'primary_eq_bins',
        'study_eq_bins',
        'study_eq_labels',
        'primary_eq_labels',
        'number_of_conditions',
        'number_of_groups',
        'age_group',
        'num_locations',
        'location',
        'num_inclusion',
        'num_exclusion',
        'number_of_intervention_types',
        'sponsor_type',
        'intervention_model',
        'resp_party',
        'has_dmc',
        'phase',
        'allocation',
        'masking',
        'enroll_count',
        'healthy_vol',
        'treatment_purpose',
        'diagnostic_purpose',
        'prevention_purpose',
        'supportive_purpose',
        'procedure_intervention',
        'device_intervention',
        'behavioral_intervention',
        'drug_intervention',
        'radiation_intervention',
        'biological_intervention',
        'os_outcome_measure',
        'dor_outcome_measure',
        'ae_outcome_measure',
        'primary_max_days',
        'secondary_max_days',
        'max_treatment_duration',
        'min_treatment_duration',
        'survival_5yr_relative',
        'conditions_category_num'
    ]

    clean_df = df[cols].copy()

    # Remove outliers
    numeric_cols = clean_df.select_dtypes(include=['float16', 'float32', 'float64', 'int', 'int32', 'int64', 'bool']).columns
    temp = clean_df[numeric_cols]
    clean_temp = drop_outliers(temp)
    clean_df = clean_df.loc[clean_temp.index]

    # Print message about dropped outliers
    msg_outliers = f"The number of rows dropped due to outliers (greater than 5 standard deviations from the mean) is {len(df) - len(clean_df)}"
    print(msg_outliers)

    # Handle missing values by filling with mode. The two max-days columns
    # are excluded by filtering rather than list.remove(), which raised
    # ValueError whenever one of them happened to contain no NaN.
    keep_missing = ('primary_max_days', 'secondary_max_days')
    nan_cols = [
        col for col in clean_df.columns[clean_df.isna().any()]
        if col not in keep_missing
    ]
    clean_df = fill_missing_with_mode(clean_df, nan_cols)

    # One hot encode remaining object columns
    object_columns = clean_df.select_dtypes(include=['object']).columns
    object_columns = [col for col in object_columns if 'nctId' not in col]
    encoded_df = pd.get_dummies(clean_df, columns=object_columns)

    # Apply function to clean column names
    encoded_df.columns = encoded_df.columns.map(remove_special_chars)

    # Split the data into train and test sets
    train_df, test_df = train_test_split(encoded_df, test_size=0.3, random_state=42, shuffle=True)

    # Ensure the directory exists; create if it doesn't
    os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned data to CSV files in the specified directory
    train_df.to_csv(os.path.join(output_dir, "cleaned_data_train.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "cleaned_data_test.csv"), index=False)

# Run the pipeline on the loaded frame. Only the duration step is active;
# it adds its columns to `df` in place, so the return value is not needed.
process_study_duration(df)
# The two steps below are currently disabled.
#clean_and_report_missing(df)

#create_bins_and_count_criteria(df, n_intervals=5)

# Printed unconditionally once the statements above have run.
data_msg = "Data processing completed."
print(data_msg)