# Check for confounding variables

This notebook uses chi-squared tests to look for clinical variables that are associated with having a chromosome event or not.

- Get clinical tables
- Get event tables
- Simplify clinical columns into a small number of categories as needed
- For each categorical column in the clinical table, make a contingency table of that column and the event table
- Run chi squared test and save results

In [1]:
import pandas as pd
import numpy as np
import os
import cptac
import altair as alt
import scipy.stats

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [3]:
# Maps each cancer-type key to the cptac dataset class that load_tables()
# instantiates for it. The commented-out entries are not available to
# load_tables() (NOTE(review): the reason they are disabled is not recorded
# here -- confirm before re-enabling).
dss = {
    "brca": cptac.Brca,
#     "ccrcc": cptac.Ccrcc,
    "colon": cptac.Colon,
#     "endometrial": cptac.Endometrial,
#     "gbm": cptac.Gbm,
    "hnscc": cptac.Hnscc,
    "lscc": cptac.Lscc,
    "luad": cptac.Luad,
    "ovarian": cptac.Ovarian
}

In [4]:
def load_tables(cancer_type):
    """Return one cancer type's clinical table joined to its CNV event calls.

    Parameters
    ----------
    cancer_type : str
        Key into the module-level ``dss`` mapping. It is also the prefix of
        the ``{cancer_type}_has_event.tsv`` file, which is read relative to
        the current working directory.

    Returns
    -------
    pandas.DataFrame
        Inner join (on the index) of the dataset's clinical table with the
        event table, whose ``gain_event`` / ``loss_event`` columns are renamed
        to ``8q_gain`` / ``8p_loss``.
    """
    dataset = dss[cancer_type]()
    clinical = dataset.get_clinical()

    event_path = f"{cancer_type}_has_event.tsv"
    events = pd.read_csv(event_path, sep="\t", index_col=0)
    events = events.rename(columns={"gain_event": "8q_gain", "loss_event": "8p_loss"})

    # Inner join: only index labels present in both tables are kept.
    return clinical.join(events, how="inner")

In [5]:
def test_cnv_association(df, test_cols, cnv_col):
    
    pvals = {}
    efs = {}
    
    for col in test_cols:
        
        # Create contingency table
        contingency_table = pd.crosstab(df[cnv_col], df[col])
        
        # Run test
        chi2, p, dof, exp_freq = scipy.stats.chi2_contingency(contingency_table)
        
        # Check assumptions: No group has expected value < 1, and no more than
        # 20% of groups have expected frequency < 5.
        exp_freq = pd.DataFrame(exp_freq)
        
        if (exp_freq < 1).any().any():
            pvals[col] = "Not all expected frequencies were > 1."
        elif (exp_freq < 5).sum().sum() > 0.2 * exp_freq.shape[0] * exp_freq.shape[1]:
            pvals[col] = "More than 20% of groups had expected frequency < 5."
        else:
            pvals[col] = p
            
        efs[col] = exp_freq
        
    pvals = pd.DataFrame(pvals.values(), index=pvals.keys())
    pvals = pvals.rename(columns={0: "pval"})
    
    return pvals

## BRCA

In [6]:
brca = load_tables("brca")

                                         

### Simplify the age column
For the age column, we will create groups of 15 years, and combine all > 75 years.

In [7]:
# Months -> whole years -> 15-year bins; every bin at or above 75 is folded
# into the 75 bin, and rows with a missing age are left as NaN.
brca = (
    brca
    .assign(Age=lambda d: d["Age.in.Month"] // 12)
    .assign(Age_group=lambda d: (d["Age"] // 15) * 15)
    .assign(
        Age_group=lambda d: d["Age_group"].mask(
            ~(d["Age_group"].lt(75) | d["Age"].isna()),
            75,
        )
    )
)

In [8]:
brca["Age_group"].value_counts(dropna=False).sort_index()

30.0    12
45.0    36
60.0    38
75.0    19
NaN     17
Name: Age_group, dtype: int64

### Simplify the stage column

We will also simplify the "Stage" column.

In [9]:
brca["Stage"].value_counts(dropna=False).sort_index()

Stage IA       4
Stage IIA     50
Stage IIB     20
Stage III      4
Stage IIIA    22
Stage IIIB     3
Stage IIIC     4
NaN           15
Name: Stage, dtype: int64

Because there are only 4 Stage I samples, we will group them with Stage II.

In [10]:
def simplify_stage_brca(row):
    """Collapse a detailed BRCA stage label into a coarse stage group.

    Parameters
    ----------
    row : str or missing value
        A single value from the "Stage" column, e.g. ``"Stage IIA"``.

    Returns
    -------
    str or missing value
        ``"I or II"`` for any Stage I / Stage II label, ``"III"`` for any
        Stage III label, ``"IV"`` for any Stage IV label. Missing values and
        unrecognised labels are returned unchanged.
    """
    if pd.isna(row):
        return row

    # Check the more specific prefixes first: "Stage IV" and "Stage III..."
    # both start with "Stage I" and would otherwise be labelled "I or II".
    if row.startswith("Stage IV"):
        return "IV"
    if row.startswith("Stage III"):
        return "III"
    if row.startswith("Stage I"):
        # Covers Stage I* and Stage II* (III and IV were handled above).
        return "I or II"
    return row
    
brca = brca.assign(Stage=brca["Stage"].apply(simplify_stage_brca))

In [11]:
brca["Stage"].value_counts(dropna=False).sort_index()

I or II    74
III        33
NaN        15
Name: Stage, dtype: int64

### Race column

There aren't enough people in the hispanic.or.latino group to satisfy the requirements of the chi-squared test, so we have to drop the category.

In [12]:
brca["Race"].value_counts(dropna=False)

white                        78
asian                        19
black.or.african.american    14
NaN                           7
hispanic.or.latino            4
Name: Race, dtype: int64

In [13]:
# Blank out the hispanic.or.latino category so those rows are treated as missing.
brca = brca.assign(
    Race=brca["Race"].mask(brca["Race"].eq("hispanic.or.latino"))
)

In [14]:
brca["Race"].value_counts(dropna=False)

white                        78
asian                        19
black.or.african.american    14
NaN                          11
Name: Race, dtype: int64

### Run chi-squared tests
Now we will run chi-squared tests to look for association between each variable and CNV events.

In [15]:
brca_cols = [
    "Age_group",
    "Race",
    "Stage",
    "PAM50",
    "NMF.v2.1",
]
# Don't use gender because all female

In [16]:
test_cnv_association(
    df=brca,
    test_cols=brca_cols,
    cnv_col="8p_loss"
)

Unnamed: 0,pval
Age_group,0.245893
Race,0.664741
Stage,0.973413
PAM50,0.668204
NMF.v2.1,0.937665


In [17]:
test_cnv_association(
    df=brca,
    test_cols=brca_cols,
    cnv_col="8q_gain"
)

Unnamed: 0,pval
Age_group,0.837975
Race,0.223218
Stage,0.67463
PAM50,0.001121
NMF.v2.1,0.000461


## Colon

In [18]:
colon = load_tables("colon")

                                          

### Simplify the Age column

In [19]:
# "Age" is divided by 12 to get whole years (NOTE(review): presumably the
# column is in months -- confirm), then floored into 15-year bins.
colon = (
    colon
    .assign(Age_years=lambda d: d["Age"] // 12)
    .assign(Age_group=lambda d: (d["Age_years"] // 15) * 15)
)

In [20]:
colon["Age_group"].value_counts(dropna=False).sort_index()

30.0     4
45.0    31
60.0    48
75.0    19
90.0     1
NaN      2
Name: Age_group, dtype: int64

In [21]:
# Fold the sparse tails into their neighbours: bins at or above 75 become 75,
# then bins at or below 45 become 30. Rows with a missing age are left as-is.
colon = (
    colon
    .assign(
        Age_group=lambda d: d["Age_group"].mask(
            ~(d["Age_group"].lt(75) | d["Age"].isna()),
            75,
        )
    )
    .assign(
        Age_group=lambda d: d["Age_group"].mask(
            ~(d["Age_group"].gt(45) | d["Age"].isna()),
            30,
        )
    )
)

In [22]:
colon["Age_group"].value_counts(dropna=False).sort_index()

30.0    35
60.0    48
75.0    20
NaN      2
Name: Age_group, dtype: int64

### Simplify the Stage column

In [23]:
colon["Stage"].value_counts(dropna=False).sort_index()

Stage I      12
Stage II     42
Stage III    44
Stage IV      7
Name: Stage, dtype: int64

In [24]:
# Merge Stage III and Stage IV into a single category.
colon = colon.assign(
    Stage=colon["Stage"].replace(
        to_replace=["Stage III", "Stage IV"],
        value="Stage III or IV",
    )
)

In [25]:
colon["Stage"].value_counts(dropna=False).sort_index()

Stage I            12
Stage II           42
Stage III or IV    51
Name: Stage, dtype: int64

### Run chi-squared tests

In [26]:
colon_cols = [
    "Age_group",
    "Gender",
    "Stage",
    "Mucinous"
]

In [27]:
test_cnv_association(
    df=colon,
    test_cols=colon_cols,
    cnv_col="8p_loss"
)

Unnamed: 0,pval
Age_group,0.04483
Gender,0.299017
Stage,0.249238
Mucinous,0.230789


In [28]:
test_cnv_association(
    df=colon,
    test_cols=colon_cols,
    cnv_col="8q_gain"
)

Unnamed: 0,pval
Age_group,0.81527
Gender,0.765412
Stage,0.677828
Mucinous,0.626902


## HNSCC

In [29]:
hnscc = load_tables("hnscc")

                                          



### Group ages

In [30]:
hnscc = hnscc.assign(Age_group=(hnscc["age"] // 10) * 10)

In [31]:
hnscc["Age_group"].value_counts(dropna=False).sort_index()

20.0     1
40.0     5
50.0    37
60.0    48
70.0    14
80.0     3
NaN      1
Name: Age_group, dtype: int64

In [32]:
# Fold the sparse tails into their neighbours: bins at or above 70 become 70,
# then bins at or below 50 become 50. Rows with a missing age are left as-is.
hnscc = (
    hnscc
    .assign(
        Age_group=lambda d: d["Age_group"].mask(
            ~(d["Age_group"].lt(70) | d["age"].isna()),
            70,
        )
    )
    .assign(
        Age_group=lambda d: d["Age_group"].mask(
            ~(d["Age_group"].gt(50) | d["age"].isna()),
            50,
        )
    )
)

In [33]:
hnscc["Age_group"].value_counts(dropna=False).sort_index()

50.0    43
60.0    48
70.0    17
NaN      1
Name: Age_group, dtype: int64

### Simplify alcohol consumption column

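We are going to combine the past-drinker group ("Consumed alcohol in the past, but currently a non-drinker") with the lighter current-drinker group ("equal to or less than 2 drinks per day for men and 1 drink or less per day for women").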

Also replace the history not available group with NaN.

In [34]:
hnscc["alcohol_consum"].value_counts(dropna=False)

Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women    44
Alcohol consumption history not available                                                                   23
Lifelong non-drinker                                                                                        21
Alcohol consumption more than 2 drinks per day for men and more than 1 drink per day for women              11
NaN                                                                                                          7
Consumed alcohol in the past, but currently a non-drinker                                                    3
Name: alcohol_consum, dtype: int64

In [35]:
# "History not available" becomes missing; past drinkers are folded into the
# lighter current-drinker category.
hnscc = hnscc.assign(
    alcohol_consum=hnscc["alcohol_consum"]
    .mask(hnscc["alcohol_consum"].eq("Alcohol consumption history not available"))
    .replace(
        "Consumed alcohol in the past, but currently a non-drinker",
        "Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women",
    )
)

In [36]:
hnscc["alcohol_consum"].value_counts(dropna=False)

Alcohol consumption equal to or less than 2 drinks per day for men and 1 drink or less per day for women    47
NaN                                                                                                         30
Lifelong non-drinker                                                                                        21
Alcohol consumption more than 2 drinks per day for men and more than 1 drink per day for women              11
Name: alcohol_consum, dtype: int64

### Simplify smoking history column

Combine all the "current reformed" groups and set the "history not available" group to NaN.

In [37]:
hnscc["smoking_history"].value_counts(dropna=False)

Current smoker: Includes daily and non-daily smokers                38
Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime    21
Smoking history not available                                       21
Current reformed smoker within past 15 years                        14
Current reformed smoker, more than 15 years                         10
Current reformed smoker, years unknown                               4
NaN                                                                  1
Name: smoking_history, dtype: int64

In [38]:
# "History not available" becomes missing; the three "current reformed"
# variants collapse into one category.
hnscc = hnscc.assign(
    smoking_history=hnscc["smoking_history"]
    .mask(hnscc["smoking_history"].eq("Smoking history not available"))
    .replace({
        "Current reformed smoker, years unknown": "Current reformed smoker",
        "Current reformed smoker within past 15 years": "Current reformed smoker",
        "Current reformed smoker, more than 15 years": "Current reformed smoker",
    })
)

In [39]:
hnscc["smoking_history"].value_counts(dropna=False)

Current smoker: Includes daily and non-daily smokers                38
Current reformed smoker                                             28
NaN                                                                 22
Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime    21
Name: smoking_history, dtype: int64

### Simplify tumor site column

Combine the two pharynx categories, and put lip with Oral cavity.

In [40]:
hnscc["tumor_site_curated"].value_counts(dropna=False)

Oral cavity    49
Larynx         47
Oropharynx      6
Lip             4
Hypopharynx     2
NaN             1
Name: tumor_site_curated, dtype: int64

In [41]:
# Both pharynx sites become "Pharynx"; "Lip" is folded into "Oral cavity".
hnscc = hnscc.assign(
    tumor_site_curated=hnscc["tumor_site_curated"].replace({
        "Oropharynx": "Pharynx",
        "Hypopharynx": "Pharynx",
        "Lip": "Oral cavity",
    })
)

In [42]:
hnscc["tumor_site_curated"].value_counts(dropna=False)

Oral cavity    53
Larynx         47
Pharynx         8
NaN             1
Name: tumor_site_curated, dtype: int64

### Simplify stage column

Combine stage I and stage II groups

In [43]:
hnscc["patho_staging_curated"].value_counts(dropna=False)

Stage IV     45
Stage III    32
Stage II     24
Stage I       7
NaN           1
Name: patho_staging_curated, dtype: int64

In [44]:
# Stage I and Stage II are merged into a single "Stage I/II" category.
hnscc = hnscc.assign(
    patho_staging_curated=hnscc["patho_staging_curated"].replace({
        "Stage I": "Stage I/II",
        "Stage II": "Stage I/II",
    })
)

In [45]:
hnscc["patho_staging_curated"].value_counts(dropna=False)

Stage IV      45
Stage III     32
Stage I/II    31
NaN            1
Name: patho_staging_curated, dtype: int64

### Run chi-squared tests

In [46]:
hnscc_cols = [
    "Age_group",
    "alcohol_consum",
#     "gender", # There are only 14 women and 94 men. Chi square assumption not met: More than 20% of groups had expected frequency < 5.
    "histologic_grade",
#     "histologic_type", # 97 out of 104 are all "Squamous cell carcinoma, conventional"
    "patho_staging_curated",
    "smoking_history",
    "tumor_site_curated"
]

In [47]:
test_cnv_association(
    df=hnscc,
    test_cols=hnscc_cols,
    cnv_col="8p_loss"
)

Unnamed: 0,pval
Age_group,0.611143
alcohol_consum,0.565015
histologic_grade,0.320023
patho_staging_curated,0.053354
smoking_history,0.294374
tumor_site_curated,0.56374


In [48]:
test_cnv_association(
    df=hnscc,
    test_cols=hnscc_cols,
    cnv_col="8q_gain"
)

Unnamed: 0,pval
Age_group,0.104835
alcohol_consum,0.26648
histologic_grade,0.930855
patho_staging_curated,0.272547
smoking_history,0.078895
tumor_site_curated,0.739403


## LSCC

In [49]:
lscc = load_tables("lscc")

                                         



In [50]:
lscc["Age"] = lscc["Age"].astype(int)

### Group ages

In [51]:
lscc = lscc.assign(Age_group=(lscc["Age"] // 10) * 10)

In [52]:
lscc["Age_group"].value_counts(dropna=False).sort_index()

40     5
50    18
60    46
70    37
80     2
Name: Age_group, dtype: int64

In [53]:
# Fold the sparse tails into their neighbours: bins at or above 70 become 70,
# then bins at or below 50 become 50.
lscc = (
    lscc
    .assign(
        Age_group=lambda d: d["Age_group"].mask(
            ~(d["Age_group"].lt(70) | d["Age"].isna()),
            70,
        )
    )
    .assign(
        Age_group=lambda d: d["Age_group"].mask(
            ~(d["Age_group"].gt(50) | d["Age"].isna()),
            50,
        )
    )
)

In [54]:
lscc["Age_group"].value_counts(dropna=False).sort_index()

50    23
60    46
70    39
Name: Age_group, dtype: int64

### Simplify the Stage column

In [55]:
lscc["Stage"].value_counts(dropna=False).sort_index()

I        3
IA      12
IB      22
II       2
IIA     22
IIB     20
III      2
IIIA    17
IIIB     2
IV       1
NaN      5
Name: Stage, dtype: int64

In [56]:
def simplify_stage_lscc(row):
    """Collapse a detailed LSCC stage label into "I", "II" or "III or IV".

    Parameters
    ----------
    row : str or missing value
        A single value from the "Stage" column, e.g. ``"IIA"``.

    Returns
    -------
    str or missing value
        The coarse stage group. Missing values and labels that match none of
        the known prefixes are returned unchanged.
    """
    if pd.isna(row):
        return row

    # Ordered from most to least specific: "IV", "III..." and "II..." all
    # start with "I", so the bare "I" prefix must be tried last.
    prefix_to_group = (
        ("IV", "III or IV"),
        ("III", "III or IV"),
        ("II", "II"),
        ("I", "I"),
    )
    for prefix, group in prefix_to_group:
        if row.startswith(prefix):
            return group

    return row
    
lscc = lscc.assign(Stage=lscc["Stage"].apply(simplify_stage_lscc))

In [57]:
lscc["Stage"].value_counts(dropna=False).sort_index()

I            37
II           44
III or IV    22
NaN           5
Name: Stage, dtype: int64

### Ethnicity column

Mark the "not collected" group as NaN. Combine the Asian and Han groups, and "caucasian" and "white(caucasian)". We'll have to drop the other categories, because we don't have enough patients in them.

In [58]:
lscc["Ethnicity"].value_counts(dropna=False).sort_index()

asian                               6
black                               1
caucasian                          68
han                                17
slavic                              9
tssdidnotcollectthisinformation     1
white(caucasian)                    3
NaN                                 3
Name: Ethnicity, dtype: int64

In [59]:
# Categories that are uninformative or too small become missing; the rest
# are merged into "asian" and "white".
lscc = lscc.assign(
    Ethnicity=lscc["Ethnicity"]
    .mask(lscc["Ethnicity"].isin(["tssdidnotcollectthisinformation", "black", "slavic"]))
    .replace({
        "han": "asian",
        "caucasian": "white",
        "white(caucasian)": "white",
    })
)

In [60]:
lscc["Ethnicity"].value_counts(dropna=False).sort_index()

asian    23
white    71
NaN      14
Name: Ethnicity, dtype: int64

### Country.of.Origin column

Combine by region: North America, Asia, or Eurasia.

In [61]:
lscc["Country.of.Origin"].value_counts(dropna=False).sort_index()

bulgaria    25
china       17
other        2
poland      14
russia       2
ukraine     10
usa         32
vietnam      6
Name: Country.of.Origin, dtype: int64

In [62]:
# Recode countries to regions; "other" cannot be placed and becomes missing.
lscc = lscc.assign(**{
    "Country.of.Origin": lscc["Country.of.Origin"]
    .mask(lscc["Country.of.Origin"].eq("other"))
    .replace({
        "usa": "North America",
        "bulgaria": "Eurasia",
        "poland": "Eurasia",
        "russia": "Eurasia",
        "ukraine": "Eurasia",
        "china": "Asia",
        "vietnam": "Asia",
    })
})

In [63]:
lscc["Country.of.Origin"].value_counts(dropna=False).sort_index()

Asia             23
Eurasia          51
North America    32
NaN               2
Name: Country.of.Origin, dtype: int64

### Simplify Smoking.History column

Mark the "not available" and "reformed years unknown" groups as NaN.

In [64]:
lscc["Smoking.History"].value_counts(dropna=False).sort_index()

Current reformed smoker within past 15 years                        28
Current reformed smoker, more than 15 years                         17
Current reformed smoker, years unknown                               1
Current smoker: Includes daily and non-daily smokers                34
Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime    16
Smoking history not available                                        7
NaN                                                                  5
Name: Smoking.History, dtype: int64

In [65]:
# The "years unknown" and "not available" categories are treated as missing.
lscc = lscc.assign(**{
    "Smoking.History": lscc["Smoking.History"].mask(
        lscc["Smoking.History"].isin([
            "Current reformed smoker, years unknown",
            "Smoking history not available",
        ])
    )
})

In [66]:
lscc["Smoking.History"].value_counts(dropna=False).sort_index()

Current reformed smoker within past 15 years                        28
Current reformed smoker, more than 15 years                         17
Current smoker: Includes daily and non-daily smokers                34
Lifelong non-smoker: Less than 100 cigarettes smoked in lifetime    16
NaN                                                                 13
Name: Smoking.History, dtype: int64

### Cigarettes.per.Day column

In [67]:
lscc["Cigarettes.per.Day"].value_counts(dropna=False).sort_index()

0           1
1           1
10          4
15          1
20         37
25          1
3           1
30         11
35          1
40         13
5           2
50          1
Unknown     6
NaN        28
Name: Cigarettes.per.Day, dtype: int64

In [68]:
# "Unknown" becomes missing so the column can be cast to float, then values
# are floored into bins of 10 cigarettes per day.
lscc = lscc.assign(**{
    "Cigarettes.per.Day": lambda d: (
        d["Cigarettes.per.Day"].replace("Unknown", np.nan).astype(float) // 10
    ) * 10
})

In [69]:
lscc["Cigarettes.per.Day"].value_counts(dropna=False).sort_index()

0.0      5
10.0     5
20.0    38
30.0    12
40.0    13
50.0     1
NaN     34
Name: Cigarettes.per.Day, dtype: int64

In [70]:
# Fold the sparse tails into their neighbours: bins at or above 40 become 40,
# then bins at or below 20 become 20. Missing values stay missing.
lscc = (
    lscc
    .assign(**{
        "Cigarettes.per.Day": lambda d: d["Cigarettes.per.Day"].mask(
            ~(d["Cigarettes.per.Day"].lt(40) | d["Cigarettes.per.Day"].isna()),
            40,
        )
    })
    .assign(**{
        "Cigarettes.per.Day": lambda d: d["Cigarettes.per.Day"].mask(
            ~(d["Cigarettes.per.Day"].gt(20) | d["Cigarettes.per.Day"].isna()),
            20,
        )
    })
)

In [71]:
lscc["Cigarettes.per.Day"].value_counts(dropna=False).sort_index()

20.0    48
30.0    12
40.0    14
NaN     34
Name: Cigarettes.per.Day, dtype: int64

### Pack.Years.Smoked column

In [72]:
lscc["Pack.Years.Smoked"].value_counts(dropna=False).sort_index()

1.5      1
10       1
100      1
108      1
19       1
2.5      1
20       4
24       1
29       1
3        1
3.2      1
31       1
33       1
35       2
35.3     1
36       2
38       1
39       1
40       6
42       3
43.5     2
45       3
48.8     1
5        2
52.5     1
56       2
57       1
58.5     1
60       1
67.5     2
70       2
72       1
78       1
79.5     2
84       1
90       1
96.3     1
NaN     51
Name: Pack.Years.Smoked, dtype: int64

In [73]:
# Cast to float, floor into 20-pack-year bins, then fold the tails: bins at or
# above 60 become 60 and bins at or below 20 become 20. Missing stays missing.
lscc = (
    lscc
    .assign(**{
        "Pack.Years.Smoked": lambda d: (d["Pack.Years.Smoked"].astype(float) // 20) * 20
    })
    .assign(**{
        "Pack.Years.Smoked": lambda d: d["Pack.Years.Smoked"].mask(
            ~(d["Pack.Years.Smoked"].lt(60) | d["Pack.Years.Smoked"].isna()),
            60,
        )
    })
    .assign(**{
        "Pack.Years.Smoked": lambda d: d["Pack.Years.Smoked"].mask(
            ~(d["Pack.Years.Smoked"].gt(20) | d["Pack.Years.Smoked"].isna()),
            20,
        )
    })
)

In [74]:
lscc["Pack.Years.Smoked"].value_counts(dropna=False).sort_index()

20.0    23
40.0    20
60.0    14
NaN     51
Name: Pack.Years.Smoked, dtype: int64

### Run chi squared tests

In [75]:
lscc_cols = [
    "Age_group",
    "Gender",
    "Stage",
    "Ethnicity",
    "Country.of.Origin",
    "Smoking.History",  
    "Cigarettes.per.Day",
    "Pack.Years.Smoked",
    "Secondhand.Smoke"
    
]

In [76]:
test_cnv_association(
    df=lscc,
    test_cols=lscc_cols,
    cnv_col="8p_loss"
)

Unnamed: 0,pval
Age_group,0.562964
Gender,0.893751
Stage,0.194889
Ethnicity,0.556727
Country.of.Origin,0.395868
Smoking.History,0.022584
Cigarettes.per.Day,0.544314
Pack.Years.Smoked,0.808111
Secondhand.Smoke,0.892513


In [77]:
test_cnv_association(
    df=lscc,
    test_cols=lscc_cols,
    cnv_col="8q_gain"
)

Unnamed: 0,pval
Age_group,0.023878
Gender,0.463236
Stage,0.397277
Ethnicity,0.08509
Country.of.Origin,0.006063
Smoking.History,0.264843
Cigarettes.per.Day,0.557984
Pack.Years.Smoked,0.21961
Secondhand.Smoke,0.536243


## LUAD

In [None]:
luad = load_tables("luad")

## Ovarian

In [None]:
ovarian = load_tables("ovarian")