In [None]:
import warnings
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)

# Suppress FutureWarning messages
warnings.simplefilter(action='ignore')

In [None]:
import plotly
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Exclusion criteria for the analysis

The dataset contains studies conducted all over the world. Given the cultural context of the United States, we consider only those studies that were performed in the US only. Further, because the analysis needs information on the race of the participants, we apply an additional filter and keep only those studies where both Num. White participants and Num. Non-white participants were reported (neither field was blank).

The number of studies with the successive filters can be given as follows,
- Total number of studies: 278
- Total number of studies performed in USA only: 187
- Total number of studies that contain information on race: 116

In [None]:
# Load the raw study table. Column headers are used verbatim in later cells,
# including their trailing spaces (e.g. 'Study ID ', 'Trial Type ', '#Black ').
df = pd.read_csv("all_studies.csv")
# Display the column index and the (rows, columns) shape as a tuple.
df.columns, df.shape

(Index(['Study Title - Link to Page here', 'Study ID ', 'Study Start Date',
        'APC Date', 'Cancer Site', 'Trial Type ', 'Trial Phase', 'Tumor Type',
        'Modalities', 'Area Offered', 'Trial Status', 'Total Included',
        'Median Age', 'Mean Age', 'Min Age', 'Max Age', '# Female', '# Male',
        '# White', '#Hispanic (ethnicity)', '# Non White', '# Asian',
        '#American Indian', '#Native Hawaiian or Pacifi Islande', '#Black ',
        '#Not Reported/Other', 'Unnamed: 26', 'full text link ', 'notes',
        'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'contact library?'],
       dtype='object'),
 (278, 33))

In [None]:
# Tally studies per 'Area Offered' label. Labels are free text, so spelling and
# case variants (e.g. "worldwide" vs "Worldwide", "United Stts") are counted separately.
df['Area Offered'].value_counts()

United States                                                          187
Europe                                                                  20
United States + Europe                                                  10
Asia                                                                     9
United States + Europe + Asia                                            5
Canada                                                                   5
worldwide                                                                5
North America                                                            4
Worldwide                                                                3
North America + Europe                                                   3
United States + Canada                                                   2
United Stts                                                              1
North America + South America + Europe + Africa                          1
North America + Europe + 

In [None]:
# Count studies per 'Area Offered' label and give both columns explicit names.
# rename_axis(...) names the label column and reset_index(name=...) names the
# count column, so the result does not depend on the default column names of
# value_counts().reset_index(), which differ between pandas 1.x ("index" /
# "Area Offered") and pandas 2.x ("Area Offered" / "count").
df_studies_per_area = (
    df['Area Offered']
    .value_counts()
    .rename_axis("Countries")
    .reset_index(name="Counts")
)

# Bar chart: one bar per area label, bar height = number of studies.
px.bar(
    df_studies_per_area,
    x="Countries",
    y="Counts"
).update_layout(
      xaxis=dict(title="place where the study happened"),
      yaxis=dict(title="how many studies per place"),
)


In [None]:
# Filter for USA studies only
df_usa = df[df['Area Offered'] == "United States"]

# Filter for prescence of race information
df_final = df_usa[~(df_usa["# White"].isna() | df_usa["# Non White"].isna())]

print(f"Num. studies in USA: {df_usa.shape[0]}")
print(f"Num. studies in USA AND contains race information: {df_final.shape[0]}")

Num. studies in USA: 187
Num. studies in USA AND contains race information: 116


In [None]:
# Scratch cell: f-string interpolation of an int, a str and a tuple.
a, b, c = 3, "meow", (1, 4, 8)
print(f"{a}hello{b}is{c}")

3hellomeowis(1, 4, 8)


## Success metric and its distribution

This analysis defines the "success metric" of a study as the percentage of Non-white participants in the given study.

Considering the above success metric, we arrive at the following statistics on the metric:
- Avg. success percentage: 14.80%
- Median success percentage: 11.55%
- 20th percentile success percentage (low success): 4.76%
- 80th percentile success percentage (high success): 21.93%



In [None]:
# Success metric: non-white participants as a percentage of the participants
# counted in the two race columns (white + non-white).
non_white_count = df_final["# Non White"]
race_reported_count = df_final["# White"] + non_white_count
df_final["success_metric"] = non_white_count / race_reported_count * 100.0

# Summary statistics of the metric across the filtered studies.
mean_pct = df_final.success_metric.mean()
median_pct = np.quantile(df_final.success_metric, 0.5)
low_success_pct = np.quantile(df_final.success_metric, 0.2)
high_success_pct = np.quantile(df_final.success_metric, 0.8)

print(f"Avg. success percentage: {mean_pct} %")
print(f"Median success percentage: {median_pct} %")
print(f"20th percentile success percentage (low success): {low_success_pct} %")
print(f"80th percentile success percentage (high success): {high_success_pct} %")

Avg. success percentage: 14.802489980274535 %
Median success percentage: 11.555555555555555 %
20th percentile success percentage (low success): 4.761904761904762 %
80th percentile success percentage (high success): 21.935483870967744 %


In [None]:
# Cumulative distribution function of the success metric
# Cumulative distribution function of the success metric
hist, bins = np.histogram(df_final["success_metric"], bins=100)
cdf = np.cumsum(hist)
cdf = cdf/cdf[-1]  # normalise the running counts so the curve ends at 1.0

# cdf[i] is the fraction of studies whose metric is <= the RIGHT edge of bin i,
# so it is plotted against bins[1:]. Using the left edges (bins[:-1]) would
# shift the whole curve one bin width to the left.
(
    px.line(x=bins[1:], y=cdf)
    .update_xaxes(title="Success Metric: %age Non-White particpants")
    .update_yaxes(title="Fraction of studies")
    .update_layout(width = 800)
)

In [None]:
# Cut-offs at the 80th and 20th percentiles of the success metric.
success_metric_80th_perc = np.quantile(df_final.success_metric, 0.8)
success_metric_20th_perc = np.quantile(df_final.success_metric, 0.2)

# Both comparisons are inclusive, so studies exactly at a cut-off are kept.
is_top_20 = df_final["success_metric"].ge(success_metric_80th_perc)
is_bottom_20 = df_final["success_metric"].le(success_metric_20th_perc)

df_top_20 = df_final.loc[is_top_20]
df_bottom_20 = df_final.loc[is_bottom_20]

In [None]:
# Display the studies at or above the 80th-percentile success metric.
df_top_20

In [None]:
# Display the studies at or below the 20th-percentile success metric.
df_bottom_20

In [None]:
# Persist both groups as CSV files, using to_csv's default options.
for studies, csv_path in (
  (df_top_20, "top_20_studies.csv"),
  (df_bottom_20, "bottom_20_studies.csv"),
):
  studies.to_csv(csv_path)

# Distribution by success categories

Considering the success metric above, we can define two groups of the studies being considered:
- Top20: Top 20% of the studies by success metric
- Bottom20: Bottom 20% of the studies by success metric

In [None]:
# Category cut-offs: 80th percentile (Top20) and 20th percentile (Bottom20) of the metric.
success_metric_values = df_final["success_metric"]
top_20_success_metric_threshold = np.quantile(success_metric_values, 0.8)
bottom_20_success_metric_threshold = np.quantile(success_metric_values, 0.2)

def get_category_label(x):
  """Map a success-metric value to its category label.

  Returns "Top20" when x is at or above top_20_success_metric_threshold,
  "Bottom20" when x is at or below bottom_20_success_metric_threshold, and
  "Neither" otherwise. The top check runs first, so it wins if both hold.
  Both thresholds are read from module-level variables.
  """
  if x >= top_20_success_metric_threshold:
    return "Top20"
  if x <= bottom_20_success_metric_threshold:
    return "Bottom20"
  return "Neither"

# Label every study with its success category; the function is passed directly
# to apply, which calls it once per metric value.
df_final["success_category"] = df_final["success_metric"].apply(get_category_label)

In [None]:
categories = ["Top20", "Bottom20", "Neither"]

def compare_field_by_category(df, field, height=900, width=1200):
  """Show, per success category, a bar chart of study counts for each value of `field`.

  Draws one subplot row per entry of the module-level `categories` list
  (stacked vertically with a shared x-axis) and displays the figure.

  Args:
    df: DataFrame with a "success_category" column and a column named `field`.
    field: Name of the column whose values are counted.
    height: Figure height passed to update_layout.
    width: Figure width passed to update_layout.

  Returns:
    None; the figure is rendered through .show().
  """
  # Row count follows `categories` so the grid always matches the titles.
  fig = make_subplots(rows=len(categories), subplot_titles=categories, vertical_spacing=0.1, shared_xaxes=True)
  for i, category in enumerate(categories):
    # rename_axis + reset_index(name=...) names both output columns explicitly.
    # Renaming the default names of value_counts().reset_index() only works on
    # pandas 1.x; on pandas 2.x it leaves no column called `field` (KeyError).
    df_category = (
        df.loc[df["success_category"] == category, field]
        .value_counts()
        .rename_axis(field)
        .reset_index(name="Num. Studies")
    )
    fig.add_trace(
        go.Bar(
            x=df_category[field],
            y=df_category["Num. Studies"],
            name=category
        ),
        row=i+1,  # subplot rows are 1-based
        col=1
    )

  fig.update_layout(height=height, width=width).show()


In [None]:
# Column to break down by success category. Alternatives that can be swapped in:
# "Modalities", "Cancer Site", "Trial Phase", "Tumor Type".
# The trailing space in "Trial Type " is part of the CSV header, not a typo here.
field = "Trial Type "

compare_field_by_category(df_final, field)

In [None]:
# Columns shown for review: the study descriptors, participant counts and the
# computed success metric.
display_columns = [
    'Study Title - Link to Page here', 'Study ID ', 'Study Start Date', 'APC Date',
    'Cancer Site', 'Trial Type ', 'Trial Phase', 'Tumor Type', 'Modalities',
    'Trial Status', 'Total Included',
    'Median Age', 'Mean Age', 'Min Age', 'Max Age',
    '# Female', '# Male',
    '# White', '#Hispanic (ethnicity)', '# Non White', '# Asian',
    '#American Indian', '#Native Hawaiian or Pacifi Islande', '#Black ',
    '#Not Reported/Other',
    'notes', 'contact library?',
    'success_metric',
]

df_final[display_columns]

Unnamed: 0,Study Title - Link to Page here,Study ID,Study Start Date,APC Date,Cancer Site,Trial Type,Trial Phase,Tumor Type,Modalities,Trial Status,Total Included,Median Age,Mean Age,Min Age,Max Age,# Female,# Male,# White,#Hispanic (ethnicity),# Non White,# Asian,#American Indian,#Native Hawaiian or Pacifi Islande,#Black,#Not Reported/Other,notes,contact library?,success_metric
3,Chemotherapy and Radiation Therapy With or Wit...,NCT00047008,Jul-02,Jun-10,"oral cavity, oropharynx, hypopharynx, larynx",Primary/Recurrent,3,SCC,Radiation + Drug,C,721.0,56.0,,26.0,82.0,124.0,597.0,589.0,,132.0,,,,,,,N,18.307906
5,High-Dose Radiation Therapy Plus Chemotherapy ...,NCT00052429,Sep-02,Nov-10,nasopharynx,Primary,12,not specifed,Procedure + Radiation + Drug,C,25.0,,,,,8.0,17.0,8.0,,17.0,11.0,,,,6.0,,N,68.0
6,A Study of a New Combination and Schedule of C...,NCT00148122,Nov-02,Aug-08,head and neck,Recurrent/Metastatic,2,"SCC, adenocarcinoma",Drug,C,38.0,59.5,,39.9,75.1,6.0,32.0,36.0,0.0,2.0,0.0,,,0.0,2.0,,N,5.263158
7,Trial of Induction Chemotherapy With Carboplat...,NCT01185171,Jan-03,Jul-18,nasopharynx,Primary,2,"SCC,poorly differentiated carcinoma, lymphoepi...",Drug,C,69.0,55.0,,49.0,64.0,14.0,55.0,58.0,1.0,11.0,0.0,0.0,0.0,10.0,0.0,,N,15.942029
8,Radiation Therapy With or Without Chemotherapy...,NCT00057785,Feb-03,Feb-07,nasopharynx,Palliative,2,SCC,Radiation + Drug,C,68.0,48.5,,,,17.0,51.0,37.0,,31.0,23.0,0.0,1.0,5.0,2.0,,N,45.588235
12,Cox-2 Inhibition in Radiation-induced Oral Muc...,NCT00698204,Jul-03,Sep-12,head and neck,Palliative,2,not specifed,Drug,C,40.0,,54.58,34.0,71.0,8.0,32.0,39.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,,N,2.5
17,Fruit and Vegetable Extracts in Treating Patie...,NCT00064298,Jan-04,Oct-08,"oral cavity, oropharynx, hypopharynx, larynx",Primary/Recurrent/Metastatic (cured),2,SCC,Dietary Supplement,C,134.0,58.5,,30.0,82.0,21.0,113.0,111.0,9.0,23.0,0.0,,,11.0,3.0,,N,17.164179
25,"Cetuximab, Chemotherapy, and Radiation Therapy...",NCT00089297,Dec-04,Feb-10,"oral cavity, oropharynx, hypopharynx, larynx",Primary,2,SCC,Biological + Drug+ Radiation,C,63.0,57.0,,31.0,76.0,14.0,49.0,56.0,7.0,0.0,0.0,,,7.0,0.0,,N,0.0
26,Evaluation of Cetuximab (ERBITUX) and Concurre...,NCT00343083,Dec-04,Jun-11,"oral cavity, oropharynx, hypopharynx, larynx, ...",Primary,2,SCC,Radiation + Drug,C,43.0,58.0,,42.0,75.0,6.0,37.0,33.0,,10.0,,,,10.0,0.0,,N,23.255814
27,"S0329, Gemcitabine and Paclitaxel in Treating ...",NCT00100789,Jan-05,Feb-08,head and neck,Recurrent/Metastatic,2,SCC,Drug,C,63.0,63.1,,40.7,82.9,13.0,50.0,56.0,,7.0,0.0,,,11.0,2.0,,N,11.111111


In [None]:
# USA studies in which both the white and the non-white counts are reported.
df_race_reported = df_usa[df_usa["# White"].notna() & df_usa["# Non White"].notna()]

In [None]:
# Studies that report exactly zero non-white participants.
df_race_reported.loc[df_race_reported["# Non White"].eq(0)]

(12, 33)

The dataset contains 12 US studies with reported race information in which the number of Non-white participants is 0.

In [None]:
# A row is flagged valid when both race counts are reported and "# White" is not 0.
race_counts_reported = df["# White"].notna() & df["# Non White"].notna()
df["has_valid_participants"] = race_counts_reported & (df["# White"] != 0)
df_filtered = df.loc[df["has_valid_participants"]]

In [None]:

# Re-display the per-area study tally (same expression as in the earlier cell).
df['Area Offered'].value_counts()

United States                                                          187
Europe                                                                  20
United States + Europe                                                  10
Asia                                                                     9
United States + Europe + Asia                                            5
Canada                                                                   5
worldwide                                                                5
North America                                                            4
Worldwide                                                                3
North America + Europe                                                   3
United States + Canada                                                   2
United Stts                                                              1
North America + South America + Europe + Africa                          1
North America + Europe + 

In [None]:
# Inspect the tally as a DataFrame to see the column names reset_index() produces
# (in the saved output below: "index" and "Area Offered").
df['Area Offered'].value_counts().reset_index()


Unnamed: 0,index,Area Offered
0,United States,187
1,Europe,20
2,United States + Europe,10
3,Asia,9
4,United States + Europe + Asia,5
5,Canada,5
6,worldwide,5
7,North America,4
8,Worldwide,3
9,North America + Europe,3


In [None]:
# Scratch cell: dict(...) with keyword arguments builds a plain dict with those keys.
dict(title="Area the trial was offered", cat='Meow is sweet')

{'title': 'Area the trial was offered', 'cat': 'Meow is sweet'}

In [None]:
# Bar chart of the number of trials per area. df_studies_per_area has the
# columns "Countries" and "Counts"; the previous x="area_offered" /
# y="num_studies" named columns that do not exist, which raised ValueError.
px.bar(
    df_studies_per_area,
    x="Countries",
    y="Counts"
).update_layout(
    xaxis=dict(title="Area the trial was offered"),
    yaxis=dict(title="Num. trials"),
)



ValueError: ignored