# Initialization

In [3]:
# Imports
from google.colab import drive
import pandas as pd
import numpy as np
import os
import json

from sklearn.impute import KNNImputer     # to fill in missing numerical data
from sklearn.impute import SimpleImputer  # to fill in missing categorical data

In [2]:
# Make Google Drive reachable from this runtime
drive.mount('/content/drive')

# Project root on the mounted drive; every file used below lives under it
PATH = "/content/drive/MyDrive/OncoLens"

# Parse clinical.json once; later cells iterate over `samples`
with open(f"{PATH}/clinical.json") as clinical_file:
    samples = json.load(clinical_file)

# Output folder for the generated CSV files (no error if it already exists)
os.makedirs(f"{PATH}/csv", exist_ok=True)

Mounted at /content/drive


## dt_and_ps.csv (disease_type and primary_site)

In [None]:
# One record per sample with its id, "disease_type" and "primary_site".
# A field that is absent from the sample is stored as an empty string.
fieldnames = ["case_id", "disease_type", "primary_site"]
data = [{name: sample.get(name, "") for name in fieldnames} for sample in samples]

# Build the DataFrame, then write it to the working directory and to the drive
df = pd.DataFrame(data, columns=fieldnames)
for destination in ("dt_and_ps.csv", PATH + "/csv/dt_and_ps.csv"):
    df.to_csv(destination, index=False)

## exposure.csv

In [None]:
# Collect every key that appears on any "exposures" entry.
# A sample may carry several exposure records, so all of them are scanned.
exposure_keys = set()
for sample in samples:
    for exposure in sample.get("exposures", []):
        exposure_keys.update(exposure.keys())

# Discard the keys that are not exported
discard_set = ["tobacco_smoking_onset_year", "exposure_duration_years", "alcohol_history",
               "type_of_tobacco_used", "created_datetime", "tobacco_smoking_quit_year",
               "type_of_smoke_exposure", "exposure_id", "state", "submitter_id",
               "secondhand_smoke_as_child", "updated_datetime"]
exposure_keys.difference_update(discard_set)
print(exposure_keys)

# Define DataFrame columns
fieldnames = ["case_id"] + sorted(exposure_keys)

# Smoking quantities fall back to 0 when the exposure record lacks the key
smoking_amount_keys = ("years_smoked", "cigarettes_per_day", "pack_years_smoked")

# Label for lifelong non-smokers without an exposure type.
# The literal "None" is deliberately avoided: pandas.read_csv treats it as a
# missing value by default, so the category would vanish (and later be imputed)
# as soon as exposures.csv is read back.
no_exposure_label = "No Exposure"

# Collect each sample's "exposure" data
data = []
for sample in samples:
    row = {"case_id": sample.get("case_id", "")}
    # With several exposure records, each record overwrites the values
    # written by the previous one (the last record wins).
    for exposure in sample.get("exposures", []):
        for key in exposure_keys:
            if key in smoking_amount_keys:
                row[key] = exposure.get(key, 0)
            elif key == "exposure_type":
                if exposure.get("exposure_type"):
                    # The record states its exposure type: keep it
                    row[key] = exposure.get("exposure_type")
                elif exposure.get("tobacco_smoking_status") != "Lifelong Non-Smoker":
                    # No type given and not a lifelong non-smoker
                    row[key] = "Unknown"
                else:
                    row[key] = no_exposure_label
            else:
                row[key] = exposure.get(key)
    data.append(row)

# Create DataFrame and save to CSV
df = pd.DataFrame(data, columns=fieldnames)
df.to_csv("exposures.csv", index=False)
df.to_csv(PATH + "/csv/exposures.csv", index=False)  # Save a copy to drive

{'alcohol_intensity', 'tobacco_smoking_status', 'exposure_type', 'pack_years_smoked', 'years_smoked', 'cigarettes_per_day'}


## demographic.csv

In [None]:
# Start from the first sample's "demographic" keys and keep only the keys
# that every other sample's "demographic" object also has
common_demographic_keys = set(samples[0]["demographic"].keys())
for sample in samples[1:]:
    common_demographic_keys &= set(sample["demographic"].keys())

# Discard unnecessary keys
discard_set = ["created_datetime", "demographic_id", "age_is_obfuscated",
               "year_of_birth", "state", "submitter_id", "updated_datetime",
               "ethnicity"]
common_demographic_keys.difference_update(discard_set)

# Collect each sample's "demographic" data
data = []
for sample in samples:
    row = {"case_id": sample.get("case_id", "")}
    # Default to an empty dict (not a list) so the .get() calls below stay valid
    demographic = sample.get("demographic", {})
    for key in common_demographic_keys:
        if key == "days_to_birth":
            # Store an age in whole years instead of the raw day count.
            # The day count is negated before the division; a null value
            # yields a missing age rather than a TypeError.
            days_to_birth = demographic.get(key)
            if days_to_birth is None:
                row["age"] = None
            else:
                row["age"] = (-1 * days_to_birth) // 365.25
        else:
            row[key] = demographic.get(key, "")
    data.append(row)

# Define DataFrame columns, replacing days_to_birth with age
common_demographic_keys.discard("days_to_birth")
common_demographic_keys.add("age")
fieldnames = ["case_id"] + sorted(common_demographic_keys)

# Create DataFrame and save to CSV
df = pd.DataFrame(data, columns=fieldnames)
df.to_csv("demographic.csv", index=False)  # local copy (file name typo fixed)
df.to_csv(PATH + "/csv/demographic.csv", index=False)  # Save a copy to drive

## diagnoses.csv

In [None]:
# Keys of a "diagnoses" entry that are never exported as-is. The nested
# "treatments" and "pathology_details" lists are summarised into derived
# columns instead.
keys = set()

discard_set = ["submitter_id", "treatment_id", "diagnosis_id", "created_datetime",
           "updated_datetime", "pathology_details", "treatments"]
for sample in samples:
    for diagnosis in sample.get("diagnoses", []):
        keys.update(key for key in diagnosis.keys() if key not in discard_set)

        # Derived columns standing in for the nested lists of dictionaries
        if "treatments" in diagnosis:
            keys.add("immunotherapy_treatments")
            keys.add("chemotherapy_treatments")
            keys.add("radiation_therapy_treatments")
            keys.add("treatment_outcome")
        if "pathology_details" in diagnosis:
            keys.add("lymph_nodes_positive")
            keys.add("tumor_largest_dimension_diameter")

file_headers = ["case_id"] + sorted(list(keys))
rows = []
for sample in samples:
    case_id = sample.get("case_id")

    # Patients can have more than one diagnosis: one output row per diagnosis
    for diagnosis in sample.get("diagnoses", []):
        row = {"case_id": case_id}

        # Regular fields (derived columns are None here and are filled below)
        for key in keys:
            row[key] = diagnosis.get(key, None)

        # Initialize treatment counters with 0
        row["immunotherapy_treatments"] = 0
        row["chemotherapy_treatments"] = 0
        row["radiation_therapy_treatments"] = 0
        row["treatment_outcome"] = -1  # initialized as unknown

        # Fill in "treatments". `or []` also covers a key that is present
        # but null, which would otherwise fail when iterated.
        for treatment in diagnosis.get("treatments") or []:
            # "treatments" can have 3 values:
            #    Immunotherapy (Including Vaccines)
            #    Chemotherapy
            #    Radiation Therapy, NOS
            # Store the frequency of each treatment received. Any value other
            # than the first two (including a missing one) is counted as
            # radiation therapy.
            treatment_type = treatment.get("treatment_type", "")
            if treatment_type == "Immunotherapy (Including Vaccines)":
                row["immunotherapy_treatments"] += 1
            elif treatment_type == "Chemotherapy":
                row["chemotherapy_treatments"] += 1
            else:
                row["radiation_therapy_treatments"] += 1

            # Store "treatment_outcome" corresponding to this treatment
            #    Complete Response = 1
            #    Persistent Disease = 0
            # NOTE: Some cases had multiple outcomes, but the updated dates are
            # all the same, so instead of recording the most recent treatment
            # outcome, '1' is recorded if the patient had any complete response.
            treatment_outcome = treatment.get("treatment_outcome", "")
            if treatment_outcome == "Complete Response":
                row["treatment_outcome"] = 1
            elif treatment_outcome == "Persistent Disease" and row["treatment_outcome"] == -1:
                row["treatment_outcome"] = 0

        # Fill in "pathology_details" from its first entry only. An empty or
        # null list leaves both columns as None instead of raising IndexError.
        pathology_details = diagnosis.get("pathology_details") or []
        if pathology_details:
            first_detail = pathology_details[0]
            row["lymph_nodes_positive"] = first_detail.get("lymph_nodes_positive")
            row["tumor_largest_dimension_diameter"] = first_detail.get("tumor_largest_dimension_diameter")

        # Append row
        rows.append(row)

# Create DataFrame and save to CSV
df = pd.DataFrame(rows, columns=file_headers)
df.to_csv("diagnoses.csv", index=False)
df.to_csv(PATH + "/csv/diagnoses.csv", index=False)  # Save a copy to drive

## follow_ups.csv

In [None]:
# Define important "follow_ups" object's keys
follow_ups_keys = ["karnofsky_performance_status", "ecog_performance_status",
                   "disease_response", "other_clinical_attributes"]
                   # other_clinical_attributes contains bmi

# Collect data
data = []
for sample in samples:
    row = {"case_id": sample.get("case_id", "")}

    # Initialize last reported dates for the relevant fields
    dr_last_reported = 0
    prog_last_reported = 0

    # Loop through each "follow_ups"
    for follow_up in sample.get("follow_ups", []):
        for key in follow_ups_keys:
            # Update time (to be used for getting last reported fields)
            time = follow_up.get("days_to_follow_up", "")
            try:
                time = int(time)
            except ValueError:
                time = -1

            match key:
                case "karnofsky_performance_status":
                    # Store minimum value (least healthy on scale of 0-100)
                    karnofsky = follow_up.get(key, "")
                    try:
                        karnofsky = float(karnofsky)
                        if row.get(key) is None or karnofsky < row[key]:
                            row[key] = karnofsky
                    except ValueError:
                        continue

                case "ecog_performance_status":
                    # Store highest value (least healthy on scale of 1-5)
                    ecog = follow_up.get(key, "")
                    try:
                        ecog = float(ecog)
                        if row.get(key) is None or ecog > row[key]:
                            row[key] = ecog
                    except ValueError:
                        continue

                case "disease_response":
                    # Store last reported response
                    if time > dr_last_reported:
                        row[key] = follow_up.get(key, "")
                        dr_last_reported = time
                        continue

                case "other_clinical_attributes":
                    # Extract and store bmi from other_clinical_attributes
                    for attr in follow_up.get(key, []):
                        bmi = attr.get("bmi", "")
                        if (bmi):
                            row["bmi"] = bmi
                            break

    # Append row to data
    data.append(row)

# Get full list of field names for DataFrame columns
fieldnames = follow_ups_keys + ["case_id"]
# fieldnames.insert(0, "case_id")
fieldnames.remove("other_clinical_attributes")
fieldnames.append("bmi")

# Crate DataFrame and save to CSV
df = pd.DataFrame(data, columns=fieldnames)
df.to_csv("follow_ups.csv", index=False)
df.to_csv(PATH + "/csv/follow_ups.csv", index=False) # Save a copy to drive

# MERGE

In [None]:
# Load the five per-topic CSV files written by the cells above
csv_dir = PATH + "/csv"
demo = pd.read_csv(f"{csv_dir}/demographic.csv")
diag = pd.read_csv(f"{csv_dir}/diagnoses.csv")
dtps = pd.read_csv(f"{csv_dir}/dt_and_ps.csv")
expo = pd.read_csv(f"{csv_dir}/exposures.csv")
foll = pd.read_csv(f"{csv_dir}/follow_ups.csv")

# Inner-join the tables one after another on case_id; a case_id missing from
# any of the tables is dropped from the result
merged_df_1 = demo.merge(diag, on="case_id", how="inner")
merged_df_2 = merged_df_1.merge(dtps, on="case_id", how="inner")
merged_df_3 = merged_df_2.merge(expo, on="case_id", how="inner")
merged_df_4 = merged_df_3.merge(foll, on="case_id", how="inner")

# Show the first rows, then write the result locally and to the drive
print(merged_df_4.head())
for destination in ("merged.csv", f"{csv_dir}/merged.csv"):
    merged_df_4.to_csv(destination, index=False)

                                case_id   age  gender   race vital_status  \
0  003f4f85-3244-4132-8c9d-c29f09382269  60.0  female  white        Alive   
1  01b5cff1-824f-4fd4-a149-7bcbacf8c4dc  54.0    male  asian        Alive   
2  01db2fd5-6443-4ed8-8318-3ef4f9012450  55.0  female  white        Alive   
3  024bde91-ea3e-4157-83c4-8482801b00dc  71.0  female  white        Alive   
4  028061ef-a99c-42dd-b011-344036e5357f  60.0    male  asian        Alive   

   age_at_diagnosis ajcc_clinical_m ajcc_pathologic_m ajcc_pathologic_n  \
0             22056              M0                M0                N0   
1             20054              M0                MX                N0   
2             20260              MX                M1                N0   
3             26003              MX                MX                N0   
4             22039              M0                MX                N0   

  ajcc_pathologic_stage  ...     alcohol_intensity cigarettes_per_day  \
0            

## KNNImputer and SimpleImputer to fill in missing data

In [None]:
# Load merged data
df = pd.read_csv(PATH + "/csv/merged.csv")

# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=["float64", "int64"]).columns
categorical_cols = df.select_dtypes(include=["object"]).columns

# Leave out columns without a single observed value. Both imputers discard
# such columns from their output, so assigning the result back to the full
# column list would fail with a shape mismatch. These columns stay all-missing.
numerical_cols = numerical_cols[df[numerical_cols].notna().any().to_numpy()]
categorical_cols = categorical_cols[df[categorical_cols].notna().any().to_numpy()]

# Impute missing values in numerical data with KNNImputer
# (the imputed columns come back as floats)
if len(numerical_cols):
    knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")
    df[numerical_cols] = knn_imputer.fit_transform(df[numerical_cols])

# Impute missing values in categorical data with the most frequent value
if len(categorical_cols):
    simp_imputer = SimpleImputer(strategy="most_frequent")
    df[categorical_cols] = simp_imputer.fit_transform(df[categorical_cols])

# Save the imputed DataFrame to CSV
df.to_csv("knn_imputed.csv", index=False)
df.to_csv(PATH + "/csv/knn_imputed.csv", index=False)  # Save a copy to drive