In [234]:
import numpy as np 
import pandas as pd 
import json

import os

# Collect the path of every JSON file found anywhere below the input folder.
# Paths are kept in os.walk order.
trial_dirs=[]
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Test the suffix rather than a substring, so names such as
        # "x.json.bak" or "x.jsonl" are not mistaken for JSON files.
        if filename.endswith(".json"):
            trial_dirs.append(os.path.join(dirname, filename))
print(len(trial_dirs))

93


## Loading into the lists

In [235]:
# Parse each trial file and keep only the first file seen for every NCT id.
# trials, trials_NCTid and noDuplic_trial_dirs stay index-aligned.
trials=[]
trials_NCTid = []
noDuplic_trial_dirs=[]
seen_nctids = set()  # constant-time duplicate check; the lists keep the order
for td in trial_dirs:
    # The context manager closes the handle as soon as the file is parsed,
    # instead of leaving one open handle per file.
    with open(td, "r", encoding="utf-8") as trial_file:
        json_f = json.load(trial_file)
    nctid = json_f["protocolSection"]["identificationModule"]["nctId"]
    if nctid in seen_nctids:
        continue  # a file for this trial was already loaded
    seen_nctids.add(nctid)
    trials_NCTid.append(nctid)
    trials.append(json_f)
    noDuplic_trial_dirs.append(td)
trial_dirs=noDuplic_trial_dirs
print(len(trials))

69


## Filtering lists

In [236]:
# Drop every trial whose "hasResults" flag equals False, removing the matching
# entries from the three parallel lists so they stay index-aligned.
keep_flags = [trial["hasResults"] != False for trial in trials]
removed_items = keep_flags.count(False)
# Slice assignment edits the existing list objects in place.
trials[:] = [trial for trial, keep in zip(trials, keep_flags) if keep]
trials_NCTid[:] = [nctid for nctid, keep in zip(trials_NCTid, keep_flags) if keep]
trial_dirs[:] = [path for path, keep in zip(trial_dirs, keep_flags) if keep]
print(len(trials))

60


## Group class

In [237]:
import math
class Group:
    """One participant-flow group of a trial, with optional dose and target labels.

    Attributes:
        NCTid: identifier passed in by the caller.
        trial: the full trial dictionary the group belongs to.
        index: position of the group inside
            trial["resultsSection"]["participantFlowModule"]["groups"].
        group_name / group_desc: the group's "title" and "description" entries.
        dose_unit: 'cells' or 'cells/kg' once a dose has been set, else None.
        dose_cells / dose_cells_kg: dose amount stored under the matching unit.
        target: value given to set_target, else None.
    """
    def __init__(self,nctid,trial, indx):
        """Read the group's title and description from `trial`.

        Raises KeyError / IndexError if the expected structure or `indx`
        is missing from `trial`.
        """
        self.NCTid=nctid
        self.trial = trial
        self.index = indx
        # Resolve the nested path once instead of once per attribute.
        flow_group = trial["resultsSection"]["participantFlowModule"]["groups"][indx]
        self.group_name = flow_group["title"]
        self.group_desc = flow_group["description"]
        self.dose_unit=None
        self.dose_cells=None
        self.dose_cells_kg=None
        self.target=None
    def set_dose(self,doseAmount, doseUnit):
        """Store a dose under the attribute that matches `doseUnit`.

        A missing amount (None, a blank string, or NaN) is ignored and leaves
        the group unchanged, whatever `doseUnit` is. A non-blank string is
        converted with float(), so text cells holding numbers are accepted.

        Raises:
            ValueError: if `doseUnit` is neither 'cells' nor 'cells/kg', or if
                a string amount cannot be converted to a number.
        """
        if doseAmount is None:
            return
        if isinstance(doseAmount, str):
            # math.isnan rejects strings with a TypeError, so normalise text
            # first: blank means "no value", anything else must be numeric.
            doseAmount = doseAmount.strip()
            if doseAmount == "":
                return
            doseAmount = float(doseAmount)
        if math.isnan(doseAmount):
            return
        if doseUnit == 'cells':
            self.dose_unit = doseUnit
            self.dose_cells = doseAmount
        elif doseUnit == 'cells/kg':
            self.dose_unit = doseUnit
            self.dose_cells_kg = doseAmount
        else:
            raise ValueError("Invalid dose unit. Valid units are 'cells' and 'cells/kg'.")
    def set_target(self,targ):
        """Record the target label for this group."""
        self.target=targ

In [238]:
# Flatten the trials into a single list holding one Group per participant-flow
# group, ordered by trial and then by the group's position within the trial.
groups = []
for nctid, trial in zip(trials_NCTid, trials):
    flow_groups = trial["resultsSection"]["participantFlowModule"]["groups"]
    groups.extend(Group(nctid, trial, pos) for pos in range(len(flow_groups)))

In [239]:
# Target names searched for in the group text. The order is kept as-is
# because the matching code walks this list front to back.
targets = [
    "CD80", "TNFRSF17", "CD22", "CD19", "NY-ESO-1",
    "CD66e", "CD27-L", "CLDN18", "CD137", "CD38",
    "DLL3", "EPHA2", "EGFR", "EGFR vIII", "EBV LMP1",
    "HER2", "GPRC5D", "GCPII", "GPC3", "PMEL",
    "CD34", "ICAM1", "IL3RA", "CD20", "TNFRSF8",
    "MAGEA1", "MAGEA4", "MSLN", "MUC1", "CD33",
    "CD371", "NCAM1", "L1CAM", "ROR1", "ROR2",
    "MME", "KLRK1", "PD-L1", "PROM1", "PSCA",
    "MET", "CS1", "SDC1", "CD7", "TRBC1",
    "CD4", "TRAIL-R2", "EPCAM", "KIT", "BCMA",
]

## Perform quick text mining to guess used targets
The guessed targets are then exported to an Excel file so they can be checked by hand and the doses looked up manually.

In [240]:
# Guess the target of every group by plain substring search, then export the
# guesses for manual review.
gr_id = [gr.NCTid for gr in groups]
gr_titles = [gr.group_name for gr in groups]
gr_descs = [gr.group_desc for gr in groups]
gr_used_targets = []

for gr in groups:
    # First choice: targets named in the group's own title or description.
    hits = [t for t in targets if t in gr.group_name or t in gr.group_desc]
    if not hits:
        # Fallback, used only when the group text names no target at all:
        # search the text of the whole trial record. Doing this after the
        # first pass keeps a fallback hit from masking, or being combined
        # with, a target that the group text itself names.
        trial_text = str(gr.trial)
        hits = [t for t in targets if t in trial_text]
    if not hits:
        used_target = ""
    elif len(hits) == 1:
        used_target = hits[0]
    else:
        used_target = "multi-target"
    gr_used_targets.append(used_target)
    # NOTE(review): matching is by substring, so a name that is contained in
    # another (e.g. "EGFR" inside "EGFR vIII") counts as a second hit.

df=pd.DataFrame({"NCTid": gr_id, "Group_Title": gr_titles, "Group_Description": gr_descs, "Targets": gr_used_targets})
df.to_excel("Groups_Summary.xlsx")

## After manual inspection
Remove rows that don't have info on dose or used target.

In [241]:
# Read the manually labelled sheet, then drop rows where both dose columns are
# empty and rows with no target, renumbering the index after each step.
labeled_df = (
    pd.read_excel("/kaggle/input/groups-summary-targets-doses/Groups_Summary_Targets_Doses.xlsx")
    .dropna(subset=["Cells/kg", "Cells"], how='all', ignore_index=True)
    .dropna(subset="Targets", ignore_index=True)
)
display(labeled_df)

Unnamed: 0,NCTid,Group_Title,Group_Description,Targets,Cells/kg,Cells
0,NCT02706392,Cohort A Dose Level 1,"Patients with ROR1+ CLL, MCL or ALL that are r...",ROR1,330000.0,
1,NCT02706392,Cohort A Dose Level 2,"Patients with ROR1+ CLL, MCL or ALL that are r...",ROR1,1000000.0,
2,NCT02706392,Cohort B Dose Level 1,Patients with ROR1+ NSCLC or TNBC who have fai...,ROR1,330000.0,
3,NCT02706392,Cohort B Dose Level 2,Patients with ROR1+ NSCLC or TNBC who have fai...,ROR1,1000000.0,
4,NCT02706392,Cohort B Dose Level 3,Patients with ROR1+ NSCLC or TNBC who have fai...,ROR1,3300000.0,
...,...,...,...,...,...,...
177,NCT03704298,Phase 1: Cohort 3: Axicabtagene Ciloleucel + U...,Participants received cyclophosphamide 500 mg/...,CD19,2000000.0,
178,NCT03704298,Phase 1: Cohort 4: Axicabtagene Ciloleucel + U...,Participants received cyclophosphamide 500 mg/...,CD19,2000000.0,
179,NCT04314843,Phase 1/Cohort 1,Participants received 500 mg/m\^2 cyclophospha...,CD19,2000000.0,
180,NCT04314843,Phase 1/Cohort 2,Participants received 500 mg/m\^2 cyclophospha...,CD19,2000000.0,


## Update groups list
Remove groups that were excluded in manual inspection and add dose and target info.

In [242]:
print(len(groups))

# Count the labelled rows available for each (NCTid, Group_Title) pair.
remaining_rows = {}
for row_key in zip(labeled_df["NCTid"], labeled_df["Group_Title"]):
    remaining_rows[row_key] = remaining_rows.get(row_key, 0) + 1

# Keep a group only while an unused labelled row with the same key is left.
# A plain membership test would keep every group sharing a key with a single
# surviving row, leaving more groups than labelled rows.
# NOTE(review): when several groups of one trial share a title, the first
# ones in list order are kept - confirm this matches the labelled sheet.
kept_groups = []
for gr in groups:
    gr_key = (gr.NCTid, gr.group_name)
    if remaining_rows.get(gr_key, 0) > 0:
        remaining_rows[gr_key] -= 1
        kept_groups.append(gr)

removed_groups = len(groups) - len(kept_groups)
groups[:] = kept_groups  # update the existing list in place

print(len(groups))

235
182


In [243]:
# Tally how many labelled groups there are for each target
display(labeled_df.loc[:, "Targets"].value_counts())

Targets
CD19      101
BCMA       32
EGFR       13
CD20       11
VEGFR      11
ROR1        6
SLAMF7      4
PSCA        3
PD-L1       1
Name: count, dtype: int64

In [244]:
targets_col=list(labeled_df["Targets"])
dose_col_kg=list(labeled_df["Cells/kg"])
dose_col_flat=list(labeled_df["Cells"])

# Queue the labelled rows under their (NCTid, Group_Title) key, so that each
# group receives the values of its own row. Pairing groups[i] with row i by
# position mislabels groups, or raises IndexError, as soon as the two
# sequences differ in length or order.
labeled_rows = {}
for nctid, title, target, dose_kg, dose_flat in zip(
        labeled_df["NCTid"], labeled_df["Group_Title"],
        targets_col, dose_col_kg, dose_col_flat):
    labeled_rows.setdefault((nctid, title), []).append((target, dose_kg, dose_flat))

for gr in groups:
    pending = labeled_rows.get((gr.NCTid, gr.group_name))
    if pending:
        # Rows that share a key are handed out in sheet order.
        target, dose_kg, dose_flat = pending.pop(0)
        gr.set_target(target)
        gr.set_dose(dose_kg,"cells/kg")
        gr.set_dose(dose_flat,"cells")
    # A group with no labelled row left keeps target/dose None (prints None).
    print(gr.dose_unit)

cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells
cells
cells
cells
cells
cells
cells
cells
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells
cells
cells
cells
cells
cells
cells
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells
cells
cells
cells
cells
cells
cells
cells/kg
cells/kg
cells/kg
cells/kg
cells
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells/kg
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
cells
c