This algorithm is primarily for finding two-sigma outliers in quantifiable data (e.g., protein, transcriptomics, etc.) in order to find trends.  These trends may include certain proteins having more outliers than others, outliers sharing a common thread across cancer subtypes, or outliers being randomly distributed.  We will drop missing values and create a new dataframe recording whether a certain protein concentration (or other quantifiable measurement) is considered an outlier for a particular sample.  We will then use that dataframe to make binary classifications of cancer samples based on user input.  For example, we may want to find out whether PTEN concentration is abnormally high in women over the age of 50 with a certain type of cancer.  We will then collect those binary classifications into a dataframe.  Ideally, this will then be mapped to a drug database so that people can know whether certain drugs may be more helpful to them based on those concentrations.

In [106]:
from __future__ import division
import pandas as pd
import numpy as np
import scipy.stats
import argparse
import datetime
import CPTAC.Endometrial as en
import CPTAC.Colon as co
import CPTAC.Ovarian as ov

#This is a test of one of Lili's functions
def convertToOutliers(df, gene_column_name, sample_names, NUM_IQRs, up_or_down):
    '''
    Flag, for each row/isoform, the samples whose value is an outlier
    relative to the other values in that row.

    The median and inter-quartile range (IQR) are calculated for each row.
    The IQR is the value difference between the 25th and 75th percentile.
    NaNs are ignored for each row, therefore a different number of values
    may be used for each row.

    A value is an 'up' outlier when it is greater than
    row median + NUM_IQRs * row IQR, and a 'down' outlier when it is less
    than row median - NUM_IQRs * row IQR.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `gene_column_name` and every column listed in
        `sample_names`.  The frame is read only; no columns are added to it.
    gene_column_name : column label
        Column copied unchanged into the result as the row identifier.
    sample_names : list of column labels
        Columns holding the per-sample values to test.
    NUM_IQRs : number
        How many IQRs away from the row median a value must lie to be
        flagged.
    up_or_down : {'up', 'down'}
        Direction of the outliers to flag.

    Returns
    -------
    pandas.DataFrame
        `gene_column_name` followed by one column per sample containing
        1 (outlier), 0 (not an outlier) or NaN (input value was missing).

    Raises
    ------
    ValueError
        If `up_or_down` is neither 'up' nor 'down'.
    '''
    if up_or_down not in ('up', 'down'):
        # Previously an unknown direction silently produced a frame with no
        # sample columns at all.
        raise ValueError(
            "up_or_down must be 'up' or 'down', got {!r}".format(up_or_down))

    values = df[sample_names]

    # Keep the per-row statistics local instead of writing helper columns
    # into the caller's frame.
    row_iqr = pd.Series(
        scipy.stats.iqr(values, axis=1, nan_policy='omit'), index=df.index)
    row_median = pd.Series(
        np.nanquantile(values, q=0.5, axis=1), index=df.index)

    # axis=0 lines each row's threshold up with that row of `values`.
    if up_or_down == 'up':
        flags = values.gt(row_median + NUM_IQRs * row_iqr, axis=0)
    else:
        flags = values.lt(row_median - NUM_IQRs * row_iqr, axis=0)

    # Comparisons against NaN are False (-> 0); put NaN back wherever the
    # input value was missing so "not an outlier" and "no data" stay distinct.
    flags = flags.astype(int).mask(values.isnull())

    return pd.concat([df[[gene_column_name]], flags], axis=1)

In [152]:
#convertToOutliers(transcriptomics_small, 'A1BG', 'S001', 2, 'up')

In [108]:
# Load each data table through the accessors of CPTAC.Endometrial (imported
# above as `en`).  Later cells call .replace / .iloc on these objects, so they
# are presumably pandas DataFrames -- TODO confirm against the CPTAC package.
transcriptomics = en.get_transcriptomics()
proteomics = en.get_proteomics()
clinical = en.get_clinical()
# Note: this variable holds the miRNA table, despite the shorter name.
RNA = en.get_miRNA()
CNA = en.get_CNA()
acetyl = en.get_acetylproteomics()
# Uses the *_gene accessor for phosphoproteomics.
phospho = en.get_phosphoproteomics_gene()

In [138]:
# Every loaded table, collected so they can be processed in one pass.
df_list = [
    transcriptomics,
    proteomics,
    clinical,
    RNA,
    CNA,
    acetyl,
    phospho,
]

def cleanDF(df):
    '''
    Return a copy of df in which textual missing-value markers
    ('na', 'NaN', 'Na', 'nan', 'NA', 'NAN', 'Nan') are replaced by np.nan.

    The input frame is left untouched; callers must use the returned frame.
    No numeric conversion of the remaining values is performed.
    '''
    missing_markers = ['na', 'NaN', 'Na', 'nan', 'NA', 'NAN', 'Nan']
    cleaned = df.replace(to_replace=missing_markers, value=np.nan)
    return cleaned

# cleanDF returns a new frame rather than editing its argument, so the result
# has to be kept; calling it and discarding the return value cleans nothing.
df_list = [cleanDF(df) for df in df_list]

# Point the individual table names at the cleaned frames as well, because the
# following cells use those names rather than df_list.  The order here must
# match the order in which df_list was built.
(transcriptomics, proteomics,
 clinical, RNA, CNA, acetyl, phospho) = df_list

In [153]:
# Work on a narrow slice of each table: all rows, first five columns only.
transcriptomics_small = transcriptomics.iloc[:, :5]
proteomics_small = proteomics.iloc[:, :5]
RNA_small = RNA.iloc[:, :5]
acetyl_small = acetyl.iloc[:, :5]
phospho_small = phospho.iloc[:, :5]

# The narrow tables, in the order they are reported further down.
small_dfs = [
    transcriptomics_small,
    proteomics_small,
    RNA_small,
    acetyl_small,
    phospho_small,
]

In [13]:
# Toy inputs for trying out the t-test: 0..9 and the even numbers 2..20.
x = np.arange(10)
y = np.arange(2, 21, 2)

In [14]:
# Try scipy's independent two-sample t-test on the toy arrays x and y.
scipy.stats.ttest_ind(x,y)

Ttest_indResult(statistic=-3.036145882229939, pvalue=0.0071038221708211325)

In [15]:
# Independent two-sample t-test between the ZWILCH and A3GALT2 columns of the
# transcriptomics table.
# NOTE(review): both columns presumably hold values for the same samples, so a
# paired test may be the better fit -- confirm.
scipy.stats.ttest_ind(transcriptomics['ZWILCH'], transcriptomics['A3GALT2'])

Ttest_indResult(statistic=97.08885345695495, pvalue=3.705492560353925e-180)

In [18]:
def dataFrameTtest(df, gene1, gene2):
    '''
    Run an independent two-sample t-test between two columns of df.

    Missing values are dropped from each column separately before testing;
    otherwise a single NaN would turn the whole statistic into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both columns.
    gene1, gene2 : column labels
        The two columns to compare.

    Returns
    -------
    The object returned by scipy.stats.ttest_ind (it has `statistic` and
    `pvalue` attributes).  The result is also printed, as before, so
    existing notebook cells keep showing it.
    '''
    list1 = np.array(df[gene1].dropna())
    list2 = np.array(df[gene2].dropna())
    result = scipy.stats.ttest_ind(list1, list2)
    print(result)
    return result
    
# Compare the A1BG and A2M columns of the transcriptomics table.
dataFrameTtest(df=transcriptomics, gene1='A1BG', gene2='A2M')

Ttest_indResult(statistic=-64.36577028831042, pvalue=6.499477468665902e-143)


In [150]:
def findOutliers(df):
    '''
    Print a two-sigma outlier report for every column of df.

    For each column the mean and standard deviation are computed with
    np.mean / np.std, and every value more than two standard deviations
    above or below the mean is listed.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per gene (or other measured quantity).

    Returns
    -------
    None
        The report is written to standard output only.
    '''
    for col in df:
        values = df[col]
        mean = np.mean(values)
        sd = np.std(values)
        high_threshold = mean + 2 * sd
        low_threshold = mean - 2 * sd

        # Boolean mask keeps the outliers in their original order.
        is_outlier = (values > high_threshold) | (values < low_threshold)
        outliers = values[is_outlier]

        # str(col) so that non-string column labels do not break the report.
        print("Gene: " + str(col))
        print("Mean: " + str(mean))
        print("SD: " + str(sd))
        print("Outliers: ")
        print(np.array(outliers))
        print("\n")
    # No return inside the loop: every column must be reported, not just the
    # first one.
        
def outlierTable(df):
    '''
    Build a 0/1 table marking the high outliers of every column of df.

    For each column, a value is flagged when it is greater than the column
    mean plus two standard deviations (np.mean / np.std).  Only the high
    side is flagged; values far below the mean are not marked.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per gene (or other measured quantity), one row per sample.

    Returns
    -------
    pandas.DataFrame
        Same column labels as df, holding 1 where the value is a high
        outlier and 0 otherwise.  Missing values compare as False and
        therefore come out as 0.
    '''
    new_df = pd.DataFrame()
    for col in df:
        mean = np.mean(df[col])
        sd = np.std(df[col])
        high_threshold = mean + 2 * sd

        # 1 = outlier, 0 = not an outlier.  The comparison is already
        # vectorised over the whole column, so it is done once per column
        # rather than once per row.
        new_df[col] = (df[col] > high_threshold).astype(int)

    # TODO: report the number of enriched proteins and, for each flagged
    # value, the protein name and sample.
    return new_df

In [154]:
# Report the outlier table of each narrow dataset, numbered from 1.
# count is pre-set so it still exists (as 0) if small_dfs is empty.
count = 0
for count, df in enumerate(small_dfs, start=1):
    new_table = outlierTable(df)
    print('Table Number {}\n'.format(count))
    # Per-column totals first, then the full 0/1 table.
    print(new_table.sum())
    print('\n')
    print(new_table)

Table Number 1

A1BG        1
A1BG-AS1    1
A1CF        5
A2M         2
A2M-AS1     3
dtype: int64


           A1BG  A1BG-AS1  A1CF  A2M  A2M-AS1
Sample_ID                                    
S001          0         0     0    0        0
S002          0         0     0    0        0
S003          0         0     0    0        0
S005          0         0     0    0        0
S006          1         1     0    0        0
S007          0         0     0    0        0
S008          0         0     0    0        0
S009          0         0     1    0        0
S010          0         0     1    0        0
S011          0         0     0    0        0
S012          0         0     0    0        0
S014          0         0     0    0        0
S016          0         0     0    0        0
S017          0         0     0    0        0
S018          0         0     0    0        0
S019          0         0     0    0        0
S020          0         0     0    0        0
S021          0         0

Table Number 4

A2M-K1168    0
A2M-K1176    4
A2M-K135     0
A2M-K145     1
A2M-K516     1
dtype: int64


           A2M-K1168  A2M-K1176  A2M-K135  A2M-K145  A2M-K516
Sample_ID                                                    
S001               0          0         0         0         0
S002               0          0         0         0         0
S003               0          0         0         0         0
S005               0          0         0         0         0
S006               0          0         0         1         0
S007               0          0         0         0         0
S008               0          0         0         0         0
S009               0          0         0         0         0
S010               0          0         0         0         0
S011               0          0         0         0         0
S012               0          0         0         0         0
S014               0          0         0         0         0
S016               0      

In [139]:
#pd.value_counts(new_table.values, sort=False) Trying to count the Trues


AAAS     2
AACS     1
AAED1    1
AAGAB    1
AAK1     5
AAMDC    6
AARS     6
AASDH    2
AATF     3
ABCA1    2
dtype: int64