In [1]:
# make_csv.py
# Create csv file from the different json files

import json
import pandas as pd
import glob

# Sort the matches: glob.glob() returns paths in arbitrary, filesystem-dependent
# order, which would make the CSV row order differ between runs and machines.
files = sorted(glob.glob("clinical_data/*.json"))

# One parsed record per JSON file.
# NOTE(review): assumes each file holds a single JSON object (dict) - confirm.
rows = []

for f in files:
    # JSON is UTF-8 by specification, so do not rely on the platform default
    # encoding when reading it.
    with open(f, "r", encoding="utf-8") as fp:
        rows.append(json.load(fp))

# With dict records, the columns are the union of all keys; a key missing from
# a record shows up as an empty cell in the CSV.
df = pd.DataFrame(rows)
df.to_csv("clinical_data.csv", index=False)

In [21]:
# preprocess_clinical_data.py
# Preprocesses clinical_data.csv to ensure proper training

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

df = pd.read_csv('clinical_data.csv')

# "x" marks a missing value in the binary capsular_penetration column; a missing
# value carries no meaning there, so it is mapped to 0.
# When the "x" placeholder is present, read_csv loads the whole column as
# strings, so replacing "x" alone would leave a mix of str ("0"/"1") and int (0)
# values. Convert to a numeric dtype afterwards; to_numeric raises ValueError on
# any other non-numeric value instead of letting it slip through to training.
df["capsular_penetration"] = pd.to_numeric(
    df["capsular_penetration"].replace("x", 0)
)

# Fill NaNs with 0: a missing BCR_PSA is treated as undetectable (<0.1, i.e. 0),
# and a missing tertiary_gleason means no tertiary pattern was detected, encoded
# as 0 on the ordinal scale (1-5).
df[["BCR_PSA", "tertiary_gleason"]] = df[["BCR_PSA", "tertiary_gleason"]].fillna(0)

# One-hot encode the categorical columns.
categorical_cols = ["pT_stage", "positive_lymph_nodes"]

encoder = OneHotEncoder(
    sparse_output=False,     # dense array, so it can be wrapped in a DataFrame
    handle_unknown="ignore"  # categories unseen during fit encode as all zeros
)

encoded = encoder.fit_transform(df[categorical_cols])

# Reuse df's index so the encoded columns line up row-for-row in the concat.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index
)

# Swap the raw categorical columns for their one-hot encoded counterparts.
df = df.drop(columns=categorical_cols)
df = pd.concat([df, encoded_df], axis=1)