# Check for confounding variables

This notebook uses chi-squared tests to look for clinical variables that are associated with having a chromosome event or not.

- Get clinical tables
- Get event tables
- Bin or group clinical columns as needed so that each category has enough samples
- For each selected categorical column in the clinical table, make a contingency table of that column against each event column
- Run chi squared test and save results

In [1]:
import pandas as pd
import numpy as np
import os
import cptac
import cptac.utils as ut
import altair as alt
import scipy.stats
import statsmodels.stats.multitest

In [2]:
# Show every column and the full width of each cell when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [3]:
# Maps each cancer-type key to the cptac dataset class that load_tables() instantiates.
# Commented-out entries are datasets that are currently not analyzed in this notebook.
dss = {
    "brca": cptac.Brca,
#     "ccrcc": cptac.Ccrcc,
    "colon": cptac.Colon,
#     "endometrial": cptac.Endometrial,
#     "gbm": cptac.Gbm,
    "hnscc": cptac.Hnscc,
    "lscc": cptac.Lscc,
    "luad": cptac.Luad,
    "ovarian": cptac.Ovarian
}

In [4]:
def load_tables(cancer_type):
    """Return the clinical table of one cancer type joined with its chr8 event table.

    Parameters
    ----------
    cancer_type : str
        Key into ``dss``; also the prefix of the ``{cancer_type}_has_event.tsv``
        file read from the current working directory.

    Returns
    -------
    pandas.DataFrame
        Inner join (on the index) of the clinical table and the event table,
        with the event columns renamed to ``8q_gain`` and ``8p_loss``.
    """
    # Instantiate the dataset and pull its clinical table
    dataset = dss[cancer_type]()
    clinical = dataset.get_clinical()

    # Read the per-sample event flags; the first column becomes the index
    events = pd.read_csv(f"{cancer_type}_has_event.tsv", sep="\t", index_col=0)
    events = events.rename(columns={"gain_event": "8q_gain", "loss_event": "8p_loss"})

    # Keep only the samples present in both tables
    return clinical.join(events, how="inner")

In [5]:
def test_cnv_association(cancer_type, df, test_cols, cnv_col):
    
    pvals = []

    for col in test_cols:
        
        # Create contingency table
        contingency_table = pd.crosstab(df[cnv_col], df[col])
        
        # Run test
        chi2, p, dof, exp_freq = scipy.stats.chi2_contingency(contingency_table)
        
        # Check assumptions: No group has expected value < 1, and no more than
        # 20% of groups have expected frequency < 5.
        exp_freq = pd.DataFrame(exp_freq)
        
        if (exp_freq < 1).any().any():
            pvals.append("Not all expected frequencies were > 1.")
        elif (exp_freq < 5).sum().sum() > 0.2 * exp_freq.shape[0] * exp_freq.shape[1]:
            pvals.append("More than 20% of groups had expected frequency < 5.")
        else:
            pvals.append(p)
        
    pvals = pd.DataFrame({
        "cancer_type": cancer_type,
        "cnv_event": cnv_col,
        "variable": test_cols,
        "pval": pvals
    })
    
    return pvals

In [6]:
all_results = pd.DataFrame()

## BRCA

In [7]:
brca = load_tables("brca")

                                         

### Simplify the age column
For the age column, we will create 15-year groups and combine everyone aged 75 or older into a single group.

In [8]:
# Convert age from months to whole years, bin into 15-year groups, and fold
# every group at or above 75 into the 75 group (missing ages stay missing).
brca = brca.assign(
    Age=lambda d: d["Age.in.Month"] // 12,
    Age_group=lambda d: (d["Age"] // 15 * 15).mask(lambda grp: grp >= 75, 75),
)

In [9]:
brca["Age_group"].value_counts(dropna=False).sort_index()

30.0    12
45.0    36
60.0    38
75.0    19
NaN     17
Name: Age_group, dtype: int64

### Simplify the stage column

We will also simplify the "Stage" column.

In [10]:
brca["Stage"].value_counts(dropna=False).sort_index()

Stage IA       4
Stage IIA     50
Stage IIB     20
Stage III      4
Stage IIIA    22
Stage IIIB     3
Stage IIIC     4
NaN           15
Name: Stage, dtype: int64

Because there are only 4 Stage I samples, we will group them with Stage II.

In [11]:
def simplify_stage_brca(row):
    """Collapse one BRCA ``Stage`` label into a coarse stage group.

    Missing values are returned unchanged. Labels starting with "Stage III"
    become "III", "Stage IV" becomes "IV", and any other label starting with
    "Stage I" (i.e. Stage I and Stage II sub-stages) becomes "I or II".
    Any other label is returned as-is.
    """
    if pd.isna(row):
        return row
    # Most specific prefixes first: "Stage I" is also a prefix of
    # "Stage II", "Stage III" and "Stage IV".
    elif row.startswith("Stage III"):
        return "III"
    elif row.startswith("Stage IV"):
        # Without this branch "Stage IV" would match the "Stage I" prefix below
        # and be silently counted as early-stage disease.
        return "IV"
    elif row.startswith("Stage I"):
        return "I or II"
    else:
        return row
    
# Replace the Stage column with its simplified grouping.
brca = brca.assign(Stage=lambda d: d["Stage"].apply(simplify_stage_brca))

In [12]:
brca["Stage"].value_counts(dropna=False).sort_index()

I or II    74
III        33
NaN        15
Name: Stage, dtype: int64

### Race column

There aren't enough people in the hispanic.or.latino group to satisfy the requirements of the chi-squared test, so we have to drop the category.

In [13]:
brca["Race"].value_counts(dropna=False)

white                        78
asian                        19
black.or.african.american    14
NaN                           7
hispanic.or.latino            4
Name: Race, dtype: int64

In [14]:
# Treat the too-small hispanic.or.latino category as missing so it is left out of the test.
brca["Race"] = brca["Race"].replace({"hispanic.or.latino": np.nan})

In [15]:
brca["Race"].value_counts(dropna=False)

white                        78
asian                        19
black.or.african.american    14
NaN                          11
Name: Race, dtype: int64

### Run chi-squared tests
Now we will run chi-squared tests to look for association between each variable and CNV events.

In [16]:
# Clinical columns of the BRCA table that are tested against each CNV event.
brca_cols = [
    "Age_group",
    "Race",
    "Stage",
    "PAM50",
    "NMF.v2.1",
]
# Don't use gender because all female

In [17]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="brca",
        df=brca,
        test_cols=brca_cols,
        cnv_col="8p_loss"
    ),
])

In [18]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="brca",
        df=brca,
        test_cols=brca_cols,
        cnv_col="8q_gain"
    ),
])

## Colon

In [19]:
colon = load_tables("colon")

                                          

### Simplify the Age column

In [20]:
# Age is floor-divided by 12 to get whole years, then binned into 15-year groups.
colon = colon.assign(
    Age_years=lambda d: d["Age"] // 12,
    Age_group=lambda d: d["Age_years"] // 15 * 15,
)

In [21]:
colon["Age_group"].value_counts(dropna=False).sort_index()

30.0     4
45.0    31
60.0    48
75.0    19
90.0     1
NaN      2
Name: Age_group, dtype: int64

In [22]:
# Merge the sparse ends of the age distribution: groups of 75 and above become 75,
# groups of 45 and below become 30. Missing ages stay missing.
colon = colon.assign(
    Age_group=lambda d: (
        d["Age_group"]
        .mask(d["Age_group"] >= 75, 75)
        .mask(d["Age_group"] <= 45, 30)
    )
)

In [23]:
colon["Age_group"].value_counts(dropna=False).sort_index()

30.0    35
60.0    48
75.0    20
NaN      2
Name: Age_group, dtype: int64

### Simplify the Stage column

In [24]:
colon["Stage"].value_counts(dropna=False).sort_index()

Stage I      12
Stage II     42
Stage III    44
Stage IV      7
Name: Stage, dtype: int64

In [25]:
# Pool Stage III and Stage IV into a single category.
colon = colon.assign(
    Stage=lambda d: d["Stage"].replace({
        "Stage III": "Stage III or IV",
        "Stage IV": "Stage III or IV",
    })
)

In [26]:
colon["Stage"].value_counts(dropna=False).sort_index()

Stage I            12
Stage II           42
Stage III or IV    51
Name: Stage, dtype: int64

### Run chi-squared tests

In [27]:
# Clinical columns of the colon table that are tested against each CNV event.
colon_cols = [
    "Age_group",
    "Gender",
    "Stage",
    "Mucinous"
]

In [28]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="colon",
        df=colon,
        test_cols=colon_cols,
        cnv_col="8p_loss"
    ),
])

In [29]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="colon",
        df=colon,
        test_cols=colon_cols,
        cnv_col="8q_gain"
    ),
])

## HNSCC

In [30]:
hnscc = load_tables("hnscc")

                                          

### Group ages

In [31]:
# Bin age into decades (e.g. 63 -> 60).
hnscc = hnscc.assign(Age_group=lambda d: d["age"].floordiv(10).mul(10))

In [32]:
hnscc["Age_group"].value_counts(dropna=False).sort_index()

20.0     1
40.0     5
50.0    37
60.0    48
70.0    14
80.0     3
NaN      1
Name: Age_group, dtype: int64

In [33]:
# Merge the sparse ends of the age distribution: decades of 70 and above become 70,
# decades of 50 and below become 50. Missing ages stay missing.
hnscc = hnscc.assign(
    Age_group=lambda d: (
        d["Age_group"]
        .mask(d["Age_group"] >= 70, 70)
        .mask(d["Age_group"] <= 50, 50)
    )
)

In [34]:
hnscc["Age_group"].value_counts(dropna=False).sort_index()

50.0    43
60.0    48
70.0    17
NaN      1
Name: Age_group, dtype: int64

### Simplify alcohol consumption column

We are going to combine the past-drinker group with the group of current drinkers who consume less (2 drinks or fewer per day for men, 1 drink or fewer per day for women).

Also replace the history not available group with NaN.

In [35]:
hnscc["alcohol_consum"].value_counts(dropna=False)

Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women    44
Alcohol consumption history not available                                                                   23
Lifelong non-drinker                                                                                        21
Alcohol consumption more than 2 drinks per day for men and more than 1 drink per day for women              11
NaN                                                                                                          7
Consumed alcohol in the past, but currently a non-drinker                                                    3
Name: alcohol_consum, dtype: int64

In [36]:
# Fold past drinkers into the lower-consumption category and treat
# "history not available" as missing.
hnscc["alcohol_consum"] = hnscc["alcohol_consum"].replace({
    "Consumed alcohol in the past, but currently a non-drinker":
        "Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women",
    "Alcohol consumption history not available": np.nan,
})

In [37]:
hnscc["alcohol_consum"].value_counts(dropna=False)

Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women    47
NaN                                                                                                         30
Lifelong non-drinker                                                                                        21
Alcohol consumption more than 2 drinks per day for men and more than 1 drink per day for women              11
Name: alcohol_consum, dtype: int64

### Simplify smoking history column

Combine all the "current reformed" groups and set the "history not available" group to NaN.

In [38]:
hnscc["smoking_history"].value_counts(dropna=False)

Current smoker: Includes daily and non-daily smokers                38
Smoking history not available                                       21
Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime    21
Current reformed smoker within past 15 years                        14
Current reformed smoker, more than 15 years                         10
Current reformed smoker, years unknown                               4
NaN                                                                  1
Name: smoking_history, dtype: int64

In [39]:
# Treat "history not available" as missing and pool the three
# reformed-smoker categories into one.
hnscc["smoking_history"] = hnscc["smoking_history"].replace({
    "Smoking history not available": np.nan,
    "Current reformed smoker, years unknown": "Current reformed smoker",
    "Current reformed smoker within past 15 years": "Current reformed smoker",
    "Current reformed smoker, more than 15 years": "Current reformed smoker",
})

In [40]:
hnscc["smoking_history"].value_counts(dropna=False)

Current smoker: Includes daily and non-daily smokers                38
Current reformed smoker                                             28
NaN                                                                 22
Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime    21
Name: smoking_history, dtype: int64

### Simplify tumor site column

Combine the two pharynx categories, and put lip with Oral cavity.

In [41]:
hnscc["tumor_site_curated"].value_counts(dropna=False)

Oral cavity    49
Larynx         47
Oropharynx      6
Lip             4
Hypopharynx     2
NaN             1
Name: tumor_site_curated, dtype: int64

In [42]:
# Pool the two pharynx sites and count lip tumors as oral cavity.
hnscc["tumor_site_curated"] = hnscc["tumor_site_curated"].replace({
    "Oropharynx": "Pharynx",
    "Hypopharynx": "Pharynx",
    "Lip": "Oral cavity",
})

In [43]:
hnscc["tumor_site_curated"].value_counts(dropna=False)

Oral cavity    53
Larynx         47
Pharynx         8
NaN             1
Name: tumor_site_curated, dtype: int64

### Simplify stage column

Combine stage I and stage II groups

In [44]:
hnscc["patho_staging_curated"].value_counts(dropna=False)

Stage IV     45
Stage III    32
Stage II     24
Stage I       7
NaN           1
Name: patho_staging_curated, dtype: int64

In [45]:
# Pool Stage I and Stage II into a single category.
hnscc["patho_staging_curated"] = hnscc["patho_staging_curated"].replace({
    "Stage I": "Stage I/II",
    "Stage II": "Stage I/II",
})

In [46]:
hnscc["patho_staging_curated"].value_counts(dropna=False)

Stage IV      45
Stage III     32
Stage I/II    31
NaN            1
Name: patho_staging_curated, dtype: int64

### Run chi-squared tests

In [47]:
# Clinical columns of the HNSCC table that are tested against each CNV event.
# Commented-out columns are excluded for the reason given next to each one.
hnscc_cols = [
    "Age_group",
    "alcohol_consum",
#     "gender", # There are only 14 women and 94 men. Chi square assumption not met: More than 20% of groups had expected frequency < 5.
    "histologic_grade",
#     "histologic_type", # 97 out of 104 are all "Squamous cell carcinoma, conventional"
    "patho_staging_curated",
    "smoking_history",
    "tumor_site_curated"
]

In [48]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="hnscc",
        df=hnscc,
        test_cols=hnscc_cols,
        cnv_col="8p_loss"
    ),
])

In [49]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="hnscc",
        df=hnscc,
        test_cols=hnscc_cols,
        cnv_col="8q_gain"
    ),
])

## LSCC

In [50]:
lscc = load_tables("lscc")

                                         



In [51]:
# Cast Age to int so the floor-division binning in the next section is numeric.
# NOTE(review): presumably the column is not loaded as a numeric dtype — confirm against the cptac table.
lscc["Age"] = lscc["Age"].astype(int)

### Group ages

In [52]:
# Bin age into decades (e.g. 63 -> 60).
lscc = lscc.assign(Age_group=lambda d: d["Age"].floordiv(10).mul(10))

In [53]:
lscc["Age_group"].value_counts(dropna=False).sort_index()

40     5
50    18
60    46
70    37
80     2
Name: Age_group, dtype: int64

In [54]:
# Merge the sparse ends of the age distribution: decades of 70 and above become 70,
# decades of 50 and below become 50.
lscc = lscc.assign(
    Age_group=lambda d: (
        d["Age_group"]
        .mask(d["Age_group"] >= 70, 70)
        .mask(d["Age_group"] <= 50, 50)
    )
)

In [55]:
lscc["Age_group"].value_counts(dropna=False).sort_index()

50    23
60    46
70    39
Name: Age_group, dtype: int64

### Simplify the Stage column

In [56]:
lscc["Stage"].value_counts(dropna=False).sort_index()

I        3
IA      12
IB      22
II       2
IIA     22
IIB     20
III      2
IIIA    17
IIIB     2
IV       1
NaN      5
Name: Stage, dtype: int64

In [57]:
def simplify_stage_lscc(row):
    """Collapse one LSCC ``Stage`` label into "I", "II" or "III or IV".

    Missing values and labels that match none of the known prefixes are
    returned unchanged.
    """
    if pd.isna(row):
        return row

    # Checked in order, most specific prefix first, because "I" is also a
    # prefix of "II", "III" and "IV".
    prefix_to_group = (
        ("IV", "III or IV"),
        ("III", "III or IV"),
        ("II", "II"),
        ("I", "I"),
    )
    for prefix, group in prefix_to_group:
        if row.startswith(prefix):
            return group

    return row
    
# Replace the Stage column with its simplified grouping.
lscc = lscc.assign(Stage=lambda d: d["Stage"].apply(simplify_stage_lscc))

In [58]:
lscc["Stage"].value_counts(dropna=False).sort_index()

I            37
II           44
III or IV    22
NaN           5
Name: Stage, dtype: int64

### Country.of.Origin column

Combine by region: North America, Asia, or Eurasia. We use this column instead of the Ethnicity column because the Ethnicity column only had enough information to test two groups: white and Asian.

In [59]:
lscc["Country.of.Origin"].value_counts(dropna=False).sort_index()

bulgaria    25
china       17
other        2
poland      14
russia       2
ukraine     10
usa         32
vietnam      6
Name: Country.of.Origin, dtype: int64

In [60]:
# Group countries into regions; "other" cannot be placed and is treated as missing.
lscc["Country.of.Origin"] = lscc["Country.of.Origin"].replace({
    "usa": "North America",
    "bulgaria": "Eurasia",
    "poland": "Eurasia",
    "russia": "Eurasia",
    "ukraine": "Eurasia",
    "china": "Asia",
    "vietnam": "Asia",
    "other": np.nan,
})

In [61]:
lscc["Country.of.Origin"].value_counts(dropna=False).sort_index()

Asia             23
Eurasia          51
North America    32
NaN               2
Name: Country.of.Origin, dtype: int64

### Pack.Years.Smoked column

In [62]:
# Convert pack-years to float, bin into steps of 20, then merge the ends:
# bins of 60 and above become 60, bins of 20 and below become 20.
# Missing values stay missing.
lscc = lscc.assign(**{
    "Pack.Years.Smoked": lambda d: (
        d["Pack.Years.Smoked"]
        .astype(float)
        .floordiv(20)
        .mul(20)
        .pipe(lambda binned: binned.mask(binned >= 60, 60).mask(binned <= 20, 20))
    )
})

In [63]:
lscc["Pack.Years.Smoked"].value_counts(dropna=False).sort_index()

20.0    23
40.0    20
60.0    14
NaN     51
Name: Pack.Years.Smoked, dtype: int64

### Run chi squared tests

In [64]:
# Clinical columns of the LSCC table that are tested against each CNV event.
lscc_cols = [
    "Age_group",
    "Gender",
    "Stage",
    "Country.of.Origin",
    "Pack.Years.Smoked"
]

In [65]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="lscc",
        df=lscc,
        test_cols=lscc_cols,
        cnv_col="8p_loss"
    ),
])

In [66]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="lscc",
        df=lscc,
        test_cols=lscc_cols,
        cnv_col="8q_gain"
    ),
])

## LUAD

In [67]:
luad = load_tables("luad")

                                         

### Group ages

In [68]:
# Bin age into decades (e.g. 63 -> 60).
luad = luad.assign(Age_group=lambda d: d["Age"].floordiv(10).mul(10))

In [69]:
luad["Age_group"].value_counts(dropna=False).sort_index()

30.0     2
40.0    10
50.0    32
60.0    39
70.0    23
80.0     3
Name: Age_group, dtype: int64

In [70]:
# Merge the sparse ends of the age distribution: decades of 70 and above become 70,
# decades of 40 and below become 40. Missing ages stay missing.
luad = luad.assign(
    Age_group=lambda d: (
        d["Age_group"]
        .mask(d["Age_group"] >= 70, 70)
        .mask(d["Age_group"] <= 40, 40)
    )
)

In [71]:
luad["Age_group"].value_counts(dropna=False).sort_index()

40.0    12
50.0    32
60.0    39
70.0    26
Name: Age_group, dtype: int64

### Simplify BMI column

Based on the ranges from the CDC: https://www.cdc.gov/obesity/adult/defining.html

In [72]:
# Bin BMI into the three ranges used below (cut-offs at 18.5 and 25).
# The labels are written into an object-dtype column with Series.mask so that a
# missing BMI stays a real NaN. Mixing string labels with the float NaN
# placeholder in np.where turns the placeholder into the literal text "nan",
# which would then be counted as its own category in the chi-squared test.
luad = luad.assign(BMI_range=np.nan)
luad["BMI_range"] = luad["BMI_range"].astype(object)
luad["BMI_range"] = luad["BMI_range"].mask(luad["BMI"] < 18.5, "underweight")
luad["BMI_range"] = luad["BMI_range"].mask((luad["BMI"] < 25) & (luad["BMI"] >= 18.5), "healthyweight")
luad["BMI_range"] = luad["BMI_range"].mask(luad["BMI"] >= 25, "overweight or obese")

In [73]:
luad["BMI_range"].value_counts(dropna=False).sort_index()

healthyweight          52
overweight or obese    40
underweight            17
Name: BMI_range, dtype: int64

### Simplify the Stage column

We had to combine the Stage 2 and 3 categories to get high enough counts to meet the assumptions of the chi squared test.

In [74]:
luad["Stage"].value_counts(dropna=False).sort_index()

1       1
1A     15
1B     29
2A      8
2B      9
3       1
3A     12
NaN    34
Name: Stage, dtype: int64

In [75]:
def simplify_stage_luad(row):
    """Collapse one LUAD ``Stage`` label into "1" or "2 or 3".

    Missing values and labels that start with none of "1", "2" or "3" are
    returned unchanged.
    """
    if pd.isna(row):
        return row
    # Stage 2 and stage 3 sub-stages are pooled into one group
    if row.startswith(("3", "2")):
        return "2 or 3"
    if row.startswith("1"):
        return "1"
    return row
    
# Replace the Stage column with its simplified grouping.
luad = luad.assign(Stage=lambda d: d["Stage"].apply(simplify_stage_luad))

In [76]:
luad["Stage"].value_counts(dropna=False).sort_index()

1         45
2 or 3    30
NaN       34
Name: Stage, dtype: int64

### Pack.Years.Smoked column

In [77]:
# Convert pack-years to float and bin into steps of 20.
luad = luad.assign(**{
    "Pack.Years.Smoked": lambda d: d["Pack.Years.Smoked"].astype(float).floordiv(20).mul(20)
})

In [78]:
# Pool every bin of 20 and above into the 20 bin, leaving two groups (0 and 20).
# Missing values stay missing.
luad = luad.assign(**{
    "Pack.Years.Smoked": lambda d: d["Pack.Years.Smoked"].mask(d["Pack.Years.Smoked"] >= 20, 20)
})

In [79]:
luad["Pack.Years.Smoked"].value_counts(dropna=False).sort_index()

0.0     23
20.0    26
NaN     60
Name: Pack.Years.Smoked, dtype: int64

### Run chi squared tests

In [80]:
# Clinical columns of the LUAD table that are tested against each CNV event.
luad_cols = [
    "Age_group",
    "Gender",
    "BMI_range",
    "Stage",
    "Region.of.Origin",
    "Pack.Years.Smoked"
]

In [81]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="luad",
        df=luad,
        test_cols=luad_cols,
        cnv_col="8p_loss"
    ),
])

In [82]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="luad",
        df=luad,
        test_cols=luad_cols,
        cnv_col="8q_gain"
    ),
])

In [83]:
# Display-only: runs the LUAD 8q_gain test again to show its table;
# the result is not stored or added to all_results.
test_cnv_association(
    cancer_type="luad",
    df=luad,
    test_cols=luad_cols,
    cnv_col="8q_gain"
)

Unnamed: 0,cancer_type,cnv_event,variable,pval
0,luad,8q_gain,Age_group,0.01565
1,luad,8q_gain,Gender,0.755291
2,luad,8q_gain,BMI_range,0.005345
3,luad,8q_gain,Stage,0.865882
4,luad,8q_gain,Region.of.Origin,0.001375
5,luad,8q_gain,Pack.Years.Smoked,0.817302


## Ovarian

In [84]:
ovarian = load_tables("ovarian")

                                            

In [85]:
ovarian

Unnamed: 0,Sample_Tumor_Normal,Participant_Procurement_Age,Participant_Gender,Participant_Race,Participant_Ethnicity,Participant_Jewish_Heritage,Participant_History_Malignancy,Participant_History_Chemotherapy,Participant_History_Neo-adjuvant_Treatment,Participant_History_Radiation_Therapy,Participant_History_Hormonal_Therapy,Aliquots_Plasma,Blood_Collection_Time,Blood_Collection_Method,Anesthesia_Time,Tumor_Surgical_Devascularized_Time,Tumor_Sample_Number,Tumor_Sample_1_Weight,Tumor_Sample_1_LN2_Time,Tumor_Sample_1_Ischemia_Time,Tumor_Sample_2_Weight,Tumor_Sample_2_LN2_Time,Tumor_Sample_2_Ischemia_Time,Tumor_Sample_3_Weight,Tumor_Sample_3_LN2_Time,Tumor_Sample_3_Ischemia_Time,Tumor_Sample_4_Weight,Tumor_Sample_4_LN2_Time,Tumor_Sample_4_Ischemia_Time,Tumor_Sample_5_Weight,Tumor_Sample_5_LN2_Time,Tumor_Sample_5_Ischemia_Time,Normal_Sample_Number,Normal_Sample_1_Surgical_Devascularized_Time,Normal_Sample_1_Weight,Normal_Sample_1_LN2_Time,Normal_Sample_1_Ischemia_Time,Normal_Sample_2_Surgical_Devascularized_Time,Normal_Sample_2_Weight,Normal_Sample_2_LN2_Time,Normal_Sample_2_Ischemia_Time,Normal_Sample_3_Surgical_Devascularized_Time,Normal_Sample_3_Weight,Normal_Sample_3_LN2_Time,Normal_Sample_3_Ischemia_Time,Normal_Sample_4_Surgical_Devascularized_Time,Normal_Sample_4_Weight,Normal_Sample_4_LN2_Time,Normal_Sample_4_Ischemia_Time,Normal_Sample_5_Surgical_Devascularized_Time,Normal_Sample_5_Weight,Normal_Sample_5_LN2_Time,Normal_Sample_5_Ischemia_Time,Origin_Site_Disease,Anatomic_Site_Tumor,Anatomic_Lateral_Position_Tumor,Histological_Subtype,Method_of_Pathologic_Diagnosis,Tumor_Stage_Ovary_FIGO,Tumor_Grade,Tumor_Residual_Disease_Max_Diameter,Days_Between_Collection_And_Last_Contact,Vital_Status,Days_Between_Collection_And_Death,Tumor_Status,Review_Of_Initial_Pathological_Findings,Pathology_Review_Consistent_With_Diagnosis,Adjuvant_Radiation_Therapy,Adjuvant_Pharmaceutical_Therapy,Adjuvant_Immunotherapy,Adjuvant_Hormone_Therapy,Adjuvant_Targeted_Molecular_Therapy,Respons
e_After_Surgery_And_Adjuvant_Therapies,New_Tumor_Event_After_Initial_Treatment,New_Tumor_Event_Type,New_Tumor_Event_Site,Other_New_Tumor_Event_Site,Days_Between_Collection_And_New_Tumor_Event,New_Tumor_Event_Diagnosis,New_Tumor_Event_Surgery,Days_Between_Collection_And_New_Tumor_Event_Surgery,New_Tumor_Event_Chemotherapy,New_Tumor_Event_Immunotherapy,New_Tumor_Event_Hormone_Therapy,New_Tumor_Event_Targeted_Molecular_Therapy,8q_gain,8p_loss
01OV002,Tumor,540.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,2.0,1240,Venipuncture (Vacutainer Apparatus),1418.0,1450.0,3.0,432.0,1455.0,5.0,462.0,1455.0,5.0,362.0,1455.0,5.0,,,,,,,3.0,1500.0,96.0,1505.0,5.0,1500.0,62.0,1505.0,5.0,1500.0,94.0,1505.0,5.0,,,,,,,,,Ovary,Omentum,Not applicable,Serous Adenocarcinoma,Tumor resection,IIIC,G3,1-10 mm,414.0,Living,,Tumor free,Yes,Yes,No,Yes,No,No,No,Complete Response,No,,,,,,,,,,,,True,False
01OV007,Tumor,820.0,Female,White,Not Hispanic or Latino,Unknown,No,No,No,No,No,2.0,1330,Venipuncture (Syringe),732.0,820.0,3.0,317.0,823.0,3.0,164.0,823.0,3.0,378.0,823.0,3.0,,,,,,,3.0,835.0,387.0,837.0,2.0,835.0,415.0,837.0,2.0,835.0,364.0,837.0,2.0,,,,,,,,,Peritoneum,Omentum,Not applicable,Serous Adenocarcinoma,Tumor resection,IV,G3,,368.0,Living,,Tumor free,Yes,Yes,No,Yes,No,No,No,Complete Response,No,,,,,,,,,,,,False,True
01OV008,Tumor,665.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,2.0,630,Venipuncture (Syringe),730.0,815.0,3.0,253.0,822.0,7.0,369.0,822.0,7.0,387.0,822.0,7.0,,,,,,,3.0,835.0,638.0,837.0,2.0,835.0,534.0,837.0,2.0,835.0,513.0,837.0,2.0,,,,,,,,,Fallopian tube,Ovary,Right,Serous Adenocarcinoma,Tumor resection,IIIC,G3,,367.0,Living,,Tumor free,Yes,Yes,No,Yes,No,No,No,Complete Response,No,,,,,,,,,,,,False,False
01OV010,Tumor,625.0,Female,Black or African American,Unknown,Unknown,No,No,No,No,No,2.0,1430,Venipuncture (Syringe),2036.0,2120.0,3.0,463.0,2127.0,7.0,332.0,2127.0,7.0,322.0,2127.0,7.0,,,,,,,3.0,2145.0,325.0,2147.0,2.0,2145.0,155.0,2147.0,2.0,2145.0,167.0,2147.0,2.0,,,,,,,,,Fallopian tube,Ovary,Right,Serous Adenocarcinoma,Tumor resection,IIIC,G3,1-10 mm,20.0,Living,,Not Reported/Unknown,Yes,Yes,No,No,No,No,No,Not Reported/Unknown,No,,,,,,,,,,,,True,False
01OV013,Tumor,828.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,2.0,1230,Venipuncture (Syringe),1515.0,1630.0,3.0,195.0,1635.0,5.0,250.0,1635.0,5.0,307.0,1635.0,5.0,,,,,,,3.0,1650.0,155.0,1653.0,3.0,1650.0,114.0,1653.0,3.0,1650.0,270.0,1653.0,3.0,,,,,,,,,Ovary,Ovary,Right,Serous Adenocarcinoma,Tumor resection,IIIC,G3,,410.0,Living,,Tumor free,Yes,Yes,No,Yes,No,No,No,Complete Response,No,,,,,,,,,,,,False,True
01OV017,Tumor,672.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,2.0,630,Venipuncture (Syringe),725.0,825.0,3.0,679.0,830.0,5.0,775.0,830.0,5.0,438.0,830.0,5.0,,,,,,,3.0,850.0,137.0,855.0,5.0,850.0,121.0,855.0,5.0,850.0,134.0,855.0,5.0,,,,,,,,,Ovary,Ovary,Right,Serous Adenocarcinoma,Tumor resection,IIIC,G3,,395.0,Living,,Tumor free,Yes,Yes,No,Yes,No,No,No,Complete Response,No,,,,,,,,,,,,False,True
01OV018,Tumor,535.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,2.0,1200,Venipuncture (Syringe),1301.0,1349.0,3.0,180.0,1354.0,5.0,310.0,1354.0,5.0,468.0,1354.0,5.0,,,,,,,3.0,1411.0,64.0,1416.0,5.0,1411.0,35.0,1416.0,5.0,1411.0,156.0,1416.0,5.0,,,,,,,,,Fallopian tube,Omentum,Not applicable,Serous Adenocarcinoma,Tumor resection,IIIC,G3,,390.0,Living,,With tumor,Yes,Yes,No,Yes,No,No,No,Complete Response,Yes,Metastatic,Other (specify),"Vaginal cuff, right hepatorenal recess, liver, and spleen",325.0,Convincing imaging,No,,Yes,No,No,No,False,True
01OV019,Tumor,694.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,3.0,1000,Venipuncture (Syringe),1140.0,1228.0,3.0,350.0,1232.0,4.0,165.0,1232.0,4.0,366.0,1232.0,4.0,,,,,,,3.0,1228.0,134.0,1234.0,6.0,1228.0,98.0,1234.0,6.0,1228.0,60.0,1234.0,6.0,,,,,,,,,Ovary,Ovary,Right,Serous Adenocarcinoma,Tumor resection,IIIC,G3,,354.0,Living,,Tumor free,Yes,Yes,No,Yes,No,No,No,Complete Response,No,,,,,,,,,,,,False,False
01OV023,Tumor,707.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,2.0,1100,Venipuncture (Syringe),1158.0,1309.0,3.0,256.0,1313.0,4.0,295.0,1313.0,4.0,189.0,1313.0,4.0,,,,,,,2.0,1341.0,28.0,1345.0,4.0,1341.0,112.0,1345.0,4.0,,,,,,,,,,,,,Peritoneum,Ovary,Left,Serous Adenocarcinoma,Tumor resection,IIIC,G3,,375.0,Living,,Not Reported/Unknown,Yes,Yes,No,Yes,No,No,No,Partial Response,No,,,,,,,,,,,,True,False
01OV026,Tumor,932.0,Female,White,Not Hispanic or Latino,Not Jewish,No,No,No,No,No,2.0,930,Venipuncture (Syringe),1224.0,1355.0,3.0,337.0,1358.0,3.0,333.0,1358.0,3.0,207.0,1358.0,3.0,,,,,,,2.0,1348.0,30.0,1355.0,7.0,1348.0,30.0,1355.0,7.0,,,,,,,,,,,,,Peritoneum,Pelvic mass,Not applicable,Serous Adenocarcinoma,Tumor resection,IIIC,G3,,,Deceased,112.0,Not Reported/Unknown,Yes,Yes,No,Yes,No,No,No,Complete Response,No,,,,,,,,,,,,True,False


### Group ages

In [86]:
# Procurement age is floor-divided by 12 to get whole years, then binned into decades.
ovarian = ovarian.assign(
    Age=lambda d: d["Participant_Procurement_Age"] // 12,
    Age_group=lambda d: d["Age"] // 10 * 10,
)

In [87]:
ovarian["Age_group"].value_counts(dropna=False).sort_index()

30.0     3
40.0    14
50.0    38
60.0    29
70.0    15
80.0     1
Name: Age_group, dtype: int64

In [88]:
# Merge the sparse ends of the age distribution: decades of 70 and above become 70,
# decades of 40 and below become 40. Missing ages stay missing.
ovarian = ovarian.assign(
    Age_group=lambda d: (
        d["Age_group"]
        .mask(d["Age_group"] >= 70, 70)
        .mask(d["Age_group"] <= 40, 40)
    )
)

In [89]:
ovarian["Age_group"].value_counts(dropna=False).sort_index()

40.0    17
50.0    38
60.0    29
70.0    16
Name: Age_group, dtype: int64

### Run chi squared tests

In [90]:
# Clinical columns of the ovarian table that are tested against each CNV event.
# Commented-out columns are excluded for the reason given next to each one.
ovarian_cols = [
    "Age_group",
#     "Participant_Race", # Not enough patients in the different categories to pass the assumptions of the test.
#     "Histological_Subtype", # All are "Serous Adenocarcinoma"
#     "Tumor_Stage_Ovary_FIGO", # Not enough of different stages to pass assumptions of test
#     "Tumor_Grade" # Not enough of different grades to pass assumptions of test
]

In [91]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="ovarian",
        df=ovarian,
        test_cols=ovarian_cols,
        cnv_col="8p_loss"
    ),
])

In [92]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, each frame keeping its own index).
all_results = pd.concat([
    all_results,
    test_cnv_association(
        cancer_type="ovarian",
        df=ovarian,
        test_cols=ovarian_cols,
        cnv_col="8q_gain"
    ),
])

## Multiple testing correction

In [93]:
all_results

Unnamed: 0,cancer_type,cnv_event,variable,pval
0,brca,8p_loss,Age_group,0.279619
1,brca,8p_loss,Race,0.383808
2,brca,8p_loss,Stage,0.973413
3,brca,8p_loss,PAM50,0.904833
4,brca,8p_loss,NMF.v2.1,0.943659
0,brca,8q_gain,Age_group,0.353898
1,brca,8q_gain,Race,0.288137
2,brca,8q_gain,Stage,0.606505
3,brca,8q_gain,PAM50,0.000375
4,brca,8q_gain,NMF.v2.1,0.000138


In [94]:
# Benjamini-Hochberg FDR correction across every test that was actually run.
# test_cnv_association stores a message string in "pval" when a test could not
# be run or its assumptions were not met; those rows are excluded from the
# correction and get NaN as their adjusted p-value.
raw_pvals = pd.to_numeric(all_results["pval"], errors="coerce").to_numpy(dtype=float)
was_tested = ~np.isnan(raw_pvals)

pvals_corrected = np.full(raw_pvals.shape, np.nan)
if was_tested.any():
    reject, pvals_corrected[was_tested], alphacSidak, alphacBonf = statsmodels.stats.multitest.multipletests(
        pvals=raw_pvals[was_tested],
        alpha=0.05,
        method="fdr_bh"
    )

# Assigned by position (plain array), so the repeated index labels in all_results are not an issue
all_results = all_results.assign(adj_p=pvals_corrected)

In [95]:
all_results[all_results["adj_p"] <= 0.05]

Unnamed: 0,cancer_type,cnv_event,variable,pval,adj_p
3,brca,8q_gain,PAM50,0.000375,0.010138
4,brca,8q_gain,NMF.v2.1,0.000138,0.007445
4,luad,8q_gain,Region.of.Origin,0.001375,0.024744


In [96]:
all_results

Unnamed: 0,cancer_type,cnv_event,variable,pval,adj_p
0,brca,8p_loss,Age_group,0.279619,0.622375
1,brca,8p_loss,Race,0.383808,0.740202
2,brca,8p_loss,Stage,0.973413,0.991779
3,brca,8p_loss,PAM50,0.904833,0.991779
4,brca,8p_loss,NMF.v2.1,0.943659,0.991779
0,brca,8q_gain,Age_group,0.353898,0.707796
1,brca,8q_gain,Race,0.288137,0.622375
2,brca,8q_gain,Stage,0.606505,0.942532
3,brca,8q_gain,PAM50,0.000375,0.010138
4,brca,8q_gain,NMF.v2.1,0.000138,0.007445


In [97]:
# Histogram of the adjusted p-values in bins of width 0.05.
alt.Chart(all_results).mark_bar().encode(
    x=alt.X("adj_p", bin=alt.Bin(step=0.05)),
    y="count()",
)

In [98]:
all_results

Unnamed: 0,cancer_type,cnv_event,variable,pval,adj_p
0,brca,8p_loss,Age_group,0.279619,0.622375
1,brca,8p_loss,Race,0.383808,0.740202
2,brca,8p_loss,Stage,0.973413,0.991779
3,brca,8p_loss,PAM50,0.904833,0.991779
4,brca,8p_loss,NMF.v2.1,0.943659,0.991779
0,brca,8q_gain,Age_group,0.353898,0.707796
1,brca,8q_gain,Race,0.288137,0.622375
2,brca,8q_gain,Stage,0.606505,0.942532
3,brca,8q_gain,PAM50,0.000375,0.010138
4,brca,8q_gain,NMF.v2.1,0.000138,0.007445


In [134]:
def pval_plot(df, title, group_col, val_col, color_col, y=True, sig=0.05):
    """Point plot of -log10 p-values per group with a significance cutoff line.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``group_col``, ``val_col``, ``color_col`` and a
        ``variable`` column (used for the point labels).
    title : str
        Title of the returned chart.
    group_col : str
        Column encoded on the x axis.
    val_col : str
        Column of p-values; plotted as ``-log10`` on the y axis.
    color_col : str
        Column encoded as point color.
    y : bool, default True
        If True, draw the y axis with the title "-log(p)" and label the cutoff
        line. If False, hide the y-axis labels, ticks and title and omit the
        cutoff label.
    sig : float, default 0.05
        Significance level; the cutoff line is drawn at ``-log10(sig)`` and
        points at or above it are labeled.

    Returns
    -------
    A layered Altair chart.
    """
    neg_log_name = "neg_log_p"
    threshold = -np.log10(sig)
    plot_df = df.assign(**{neg_log_name: -np.log10(df[val_col])})

    # Full y axis for the left-hand panel, a bare one otherwise
    y_encoding = (
        alt.Y(neg_log_name, title="-log(p)")
        if y
        else alt.Y(neg_log_name, axis=alt.Axis(labels=False, ticks=False, title=None))
    )

    points = alt.Chart(plot_df).mark_point().encode(
        x=group_col,
        y=y_encoding,
        color=color_col
    )

    # Name the points that reach the cutoff
    point_labels = (
        points
        .transform_filter(alt.datum.neg_log_p >= threshold)
        .mark_text(align='left', baseline='middle', dx=7)
        .encode(text='variable')
    )

    # Horizontal rule at the cutoff
    cutoff_df = pd.DataFrame({
        'y': [threshold],
        "label": [f"-log({sig})"]
    })
    cutoff_rule = alt.Chart(cutoff_df).mark_rule(color="crimson").encode(y="y")

    layered = points + point_labels + cutoff_rule
    if y:
        # Only the panel that shows the y axis also gets the cutoff label
        cutoff_label = cutoff_rule.mark_text(align="right", dx=-175).encode(text="label")
        layered = layered + cutoff_label

    return layered.properties(title=title)

# Side-by-side panels (8p loss, 8q gain) of adjusted p-values sharing one y scale;
# only the left panel draws the y axis.
alt.hconcat(
    pval_plot(
        df=all_results[all_results["cnv_event"] == "8p_loss"],
        title="8p loss",
        group_col="variable",
        val_col="adj_p",
        color_col="cancer_type",
    ),
    pval_plot(
        df=all_results[all_results["cnv_event"] == "8q_gain"],
        title="8q gain",
        group_col="variable",
        val_col="adj_p",
        color_col="cancer_type",
        y=False,
    ),
).resolve_scale(
    y="shared"
).configure_axisY(
    titleY=175,
    titleX=-35
).properties(
    title="Chi-squared results for correlation of chr8 CNV events with clinical attributes"
).configure_title(
    anchor="middle"
)