# TP53 Methods - Sweep of Clinical Data

### Following our protein analysis, we looked for any correlations within our clinical data that were worth exploring.

### Standard imports, including CTPAC package

In [5]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import math
import seaborn as sns
import CPTAC.Endometrial as CPTAC

## Set-up of Initial Dataframes and Variables

In [14]:
gene = 'TP53'
protein = 'TP53'
somatic = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
clinical_full = CPTAC.get_clinical() 

### Creating a dataframe of patient ID's for patients that have mutation(s) in the TP53 hotspot

In [19]:
# A list of mutations present in the hotspot according to our Hotspot3D output
hotspot_mutations = ['p.P250L', 'p.R248W', 'p.S241C', 'p.S241del', 'p.K164E', 'p.R249S', 'p.R273H', 'p.R282W',
                    'p.S127Y', 'p.G244D', 'p.A159P']
mutated_hotspot = somatic.loc[(somatic['Location'].isin(hotspot_mutations)) & (somatic['Gene'] == gene)]
hotspot_patients = mutated_hotspot['Patient_Id']

### Creating a modified clinical dataframe that designates mutation type

In [32]:
# Creating this dataframe to join with the clinical data to include mutation type
loc_compare = CPTAC.compare_mutations(proteomics, protein, gene)

# Joining the two dataframes and selecting only for patients with tumors
clinical_full = clinical_full.join(loc_compare)
clinical_full = clinical_full.loc[clinical_full['Sample_Status'] == 'Tumor']

# Changing the index to be the patient's universal CPTAC ID
clinical_full = clinical_full.set_index('Proteomics_Participant_ID')

# Designating the type of TP53 mutation. True = TP53 Hotspot Mutation. False = TP53 Non-Hotspot Mutation. WT = No TP53 Mutation
clinical_full['Within_Hotspot'] = False
clinical_full.loc[hotspot_patients, 'Within_Hotspot'] = True
clinical_full.loc[clinical_full['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'WT'

586      C3L-00006
1318     C3L-00098
1404     C3L-00137
4019     C3L-00356
4089     C3L-00358
4736     C3L-00449
21307    C3L-00905
44973    C3N-00339
45005    C3N-00340
46785    C3N-00836
49867    C3N-01346
49948    C3N-01349
Name: Patient_Id, dtype: object
idx
S001    C3L-00006
S002    C3L-00008
S003    C3L-00032
S005    C3L-00090
S006    C3L-00098
S007    C3L-00136
S008    C3L-00137
S009    C3L-00139
S010    C3L-00143
S011    C3L-00145
S012    C3L-00156
S014    C3L-00161
S016    C3L-00358
S017    C3L-00361
S018    C3L-00362
S019    C3L-00413
S020    C3L-00449
S021    C3L-00563
S022    C3L-00586
S023    C3L-00601
S024    C3L-00605
S025    C3L-00767
S026    C3L-00769
S027    C3L-00770
S028    C3L-00771
S029    C3L-00780
S030    C3L-00781
S031    C3L-00905
S032    C3L-00918
S033    C3L-00921
          ...    
S124    C3N-00333
S125    C3N-00383
S126    C3N-00729
S127    C3N-00858
S128    C3N-00866
S129    C3N-01346
S130    C3L-00563
S131    C3L-00605
S132    C3L-00770
S133    C3L-0077

KeyError: "['C3L-00356'] not in index"

## Clinical Data Chi-Square Analysis

### Bonferroni correction of p-value significance cuttoff for chi-square analysis

In [None]:
# Calculating the number of chi-square analyses to be run based on data type
Chi_Tests = 0
for feature in clinical_full:
    if str(clinical_full[feature].dtypes) == 'object':
        Chi_Tests += 1
pcutoff = 0.05/Chi_Tests
print ('p-value cutoff is ' + str(pcutoff))           

### Running the chi-square analysis

In [None]:
# Going through every column of our clinical dataframe
for feature in clinical_full:
    # Only performing the chi-square analysis on columns with the data type of 'object'
    if str(clinical_full[feature].dtypes) == 'object':
        # Creating the table to be used in the chi-square analysis
        chi_clinical = pd.crosstab(clinical_full[feature], clinical_full['Within_Hotspot'])
        chi_test = scipy.stats.chi2_contingency(observed = chi_clinical)
        if str(feature) != 'Within_Hotspot':
            # Printing significant values and the table used in the chi-square analysis
            if chi_test[1] <= pcutoff:
                print(feature)
                print ('p-value = ' + str(chi_test[1]))
                chi_test = pd.crosstab(clinical_full[feature], clinical_full['Within_Hotspot'])
                print (str(chi_test) + '\n')

# Clinical Data t-test Analysis

### Bonferroni correction of p-value significance cuttoff for t-test analysis

In [None]:
# Calculating the number of t-test analyses to be run based on data type
t_tests = 0
for feature in clinical_full:
    if str(clinical_full[feature].dtypes) == 'float64' or str(clinical_full[feature].dtypes) == 'int64':
        t_tests += 1
pcutoff = 0.05/t_tests
print ('p-value cutoff is ' + str(pcutoff))

### Running the t-test analysis

In [None]:
# Creating dataframes for the Tp53 hotspot mutations vs. wildtype TP53 comparison
TP53_Hotspot_Mutation = clinical_full.loc[clinical_full['Within_Hotspot'] == True]
Wildtype_TP53 = clinical_full.loc[clinical_full['Within_Hotspot'] != 'WT']

# TP53 Hotspot Mutations vs. Wildtype TP53 t-test
significant_results = 0
for feature in clinical_full:
    if str(clinical_full[feature].dtypes) == 'float64' or str(clinical_full[feature].dtypes) == 'int64':
        ttest = scipy.stats.ttest_ind(TP53_Hotspot_Mutation[feature].dropna(axis=0), Wildtype_TP53[feature].dropna(axis = 0))
        # Printing significant values
        if ttest[1] <= pcutoff:
            significant_results += 1
            print(feature)
            print('p-value = ' + str(ttest[1]))
            print('t-test statistic = ' + str(ttest[0]) + '\n')
            
# Printing the number of significant results
if significant_results == 0:
    print('**There are no significant results based on this comparison**')
else:
    print('**There are ' + str(significant_results) + ' significant results based on this comparison**')

In [None]:
# Creating dataframes for the Mutated TP53 vs Wildtype TP53 comparison        
Wildtype_TP53 = clinical_full.loc[clinical_full['Within_Hotspot'] == 'WT']
Mutated_TP53 = clinical_full.loc[clinical_full['Within_Hotspot'] != 'WT']

# Mutated TP53 vs. Wildtype TP53 t-test
significant_results = 0
for feature in clinical_full:
    if str(clinical_full[feature].dtypes) == 'float64' or str(clinical_full[feature].dtypes) == 'int64':
        ttest = scipy.stats.ttest_ind(Mutated_TP53[feature].dropna(axis=0), Wildtype_TP53[feature].dropna(axis = 0))
        # Printing significant values
        if ttest[1] <= pcutoff:
            significant_results += 1
            print(feature)
            print('p-value = ' + str(ttest[1]))
            print('t-test statistic = ' + str(ttest[0]) + '\n')

# Printing the number of significant results
if significant_results == 0:
    print('**There are no significant results based on this comparison**')
else:
    print('**There are ' + str(significant_results) + ' significant results based on this comparison**') 

## Plotting columns from the clinical data found to be significant after t-test results

In [None]:
# We compiled this list of 5 clinical data columns based on which data was able to be plotted
sig_list = ['Pathway_activity_JAK.STAT', 'Pathway_activity_p53', 'Mutation_signature_C>G', 'Estrogen_Receptor_%', 'Mutation_signature_C>T']

# Plotting each of the clinical data columns from our list
for feature in sig_list:
    Y_var = feature
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize = a4_dims)
    boxplt = sns.boxplot(data = clinical_full, x = 'Within_Hotspot', y = Y_var, ax = ax)
    boxplt = sns.stripplot(data = clinical_full, x = 'Within_Hotspot', y = Y_var, jitter = True, color = '.3')
    boxplt.set_xlabel('Mutation Within Hotspot', fontsize='20')
    boxplt.set_ylabel(Y_var, fontsize='20')
    plt.show()