This algorithm is primarily for finding two-sigma outliers in quantitative data (e.g. proteomics, transcriptomics, etc.) in order to find trends.  These trends may include certain proteins having more outliers than others, outliers sharing a common thread between cancer subtypes, or outliers being randomly distributed.  We will drop missing values and create a new dataframe that records whether a given protein concentration (or other quantitative measurement) is considered an outlier for a particular sample.  We will then use that dataframe to make binary classifications of the cancer samples based on user input.  For example, we may want to find out whether PTEN concentration is abnormally high in women over the age of 50 with a certain type of cancer.  We will then collect those binary classifications into a dataframe.  Ideally, this will then be mapped to a drug database so that people can learn whether certain drugs may be more helpful to them based on those concentrations.

In [23]:
from __future__ import division
import pandas as pd
import numpy as np
import scipy.stats
import argparse
import datetime
import CPTAC.Endometrial as en

#This is a test of one of Lili's functions
def convertToOutliers(df, gene_column_name, sample_names, NUM_IQRs, up_or_down):
    '''
    Flag values lying more than NUM_IQRs inter-quartile ranges from their row median.

    The median and inter-quartile range are calculated for each row/isoform
    over the sample columns. Inter-quartile range is defined as the value
    difference between the 25th and 75th percentile. NaNs are ignored for
    each row, therefore a different number of values may be used for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain gene_column_name and every column in sample_names.
        It is read only; no helper columns are added to it.
    gene_column_name : column label
        Column identifying each row; copied unchanged into the result.
    sample_names : list of column labels
        Numeric columns to test for outliers.
    NUM_IQRs : number
        How many inter-quartile ranges away from the row median a value
        must be before it is flagged.
    up_or_down : {'up', 'down'}
        'up' flags values above median + NUM_IQRs * IQR,
        'down' flags values below median - NUM_IQRs * IQR.

    Returns
    -------
    pandas.DataFrame
        gene_column_name followed by one column per sample holding 1.0 for
        an outlier, 0.0 for a non-outlier and NaN where the input was missing.

    Raises
    ------
    ValueError
        If up_or_down is neither 'up' nor 'down'.
    '''
    if up_or_down not in ('up', 'down'):
        raise ValueError(
            "up_or_down must be 'up' or 'down', got {!r}".format(up_or_down))

    values = df[sample_names]
    numeric = np.asarray(values, dtype=float)

    # Row statistics are kept in local arrays so the caller's frame is untouched.
    row_iqr = np.asarray(
        scipy.stats.iqr(numeric, axis=1, nan_policy='omit'), dtype=float)
    row_median = np.nanquantile(numeric, q=0.5, axis=1)
    offset = NUM_IQRs * row_iqr

    if up_or_down == 'up':
        threshold = pd.Series(row_median + offset, index=df.index)
        is_outlier = values.gt(threshold, axis=0)
    else:
        threshold = pd.Series(row_median - offset, index=df.index)
        is_outlier = values.lt(threshold, axis=0)

    # A comparison against NaN is False, so missing inputs are restored to NaN
    # explicitly instead of being reported as "not an outlier".
    flags = is_outlier.astype(float).mask(values.isnull())

    return pd.concat([df[[gene_column_name]], flags], axis=1)

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [28]:
# List the data frames available in the endometrial package, with their dimensions.
en.list_data()

Below are the available endometrial data frames contained in this package:
	 clinical
	 	 Dimensions: (144, 26)
	 derived_molecular
	 	 Dimensions: (144, 118)
	 experimental_setup
	 	 Dimensions: (144, 26)
	 acetylproteomics
	 	 Dimensions: (144, 10862)
	 proteomics
	 	 Dimensions: (144, 10999)
	 transcriptomics
	 	 Dimensions: (109, 28057)
	 circular_RNA
	 	 Dimensions: (109, 4945)
	 miRNA
	 	 Dimensions: (99, 2337)
	 CNA
	 	 Dimensions: (95, 28057)
	 phosphoproteomics
	 	 Dimensions: (144, 73212)
	 phosphoproteomics_gene
	 	 Dimensions: (144, 8466)
	 somatic_mutation_binary
	 	 Dimensions: (95, 51559)
	 somatic_mutation
	 	 Dimensions: (52560, 5)


In [50]:
# Keep a handle on the transcriptomics frame and display it.
# In the displayed output the rows are samples (S001, S002, ...) and the columns are genes.
transcriptomics = en.transcriptomics
transcriptomics

idx,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
S001,4.02,2.16,3.27,13.39,5.88,6.79,1.55,0.97,10.34,1.96,...,11.06,10.73,8.40,9.78,10.88,5.93,11.52,10.23,11.50,11.47
S002,4.81,2.21,4.86,13.24,5.93,6.33,0.93,0.00,10.83,0.00,...,10.87,11.43,8.39,9.14,10.38,7.25,11.64,10.64,11.26,11.57
S003,6.24,6.43,3.68,14.32,6.53,9.42,2.79,0.00,10.98,2.13,...,10.06,10.13,8.35,9.27,10.46,6.85,11.60,10.21,11.51,11.09
S005,5.31,4.87,5.59,13.77,6.35,4.22,2.97,0.00,8.68,1.98,...,10.29,10.41,9.10,9.59,10.15,7.89,11.90,10.21,11.34,11.51
S006,9.84,8.83,7.00,13.12,6.49,6.83,1.80,0.00,11.42,3.28,...,10.36,11.24,8.60,9.44,11.80,9.32,11.97,9.77,11.37,12.35
S007,5.03,5.59,4.82,14.13,7.65,7.75,1.59,1.01,9.61,1.59,...,9.23,9.22,9.07,9.51,11.06,7.29,11.60,10.19,11.57,11.32
S008,3.17,3.56,3.98,13.49,6.48,5.88,2.23,1.16,9.71,1.16,...,10.56,10.65,8.77,9.83,11.36,7.43,11.72,10.87,11.81,11.92
S009,6.03,5.46,7.21,13.67,6.95,6.69,2.93,0.00,8.78,1.53,...,10.76,10.85,9.14,9.83,11.60,9.41,11.78,10.65,11.94,11.57
S010,6.02,5.90,7.73,14.30,7.76,8.47,3.47,1.18,9.98,2.26,...,9.46,7.82,9.32,9.97,11.48,8.07,11.93,10.86,12.20,11.84
S011,5.71,5.43,4.20,14.97,6.87,9.96,2.83,0.00,10.22,3.20,...,9.47,9.61,8.54,9.35,11.08,7.15,11.57,11.24,11.77,11.16


In [47]:
# Toy inputs for trying out the t-test: the integers 0..9 and the even numbers 2..20.
x = np.arange(10)
y = np.arange(2, 21, 2)

In [49]:
# Sanity check: independent two-sample t-test on the two toy arrays.
scipy.stats.ttest_ind(x,y)

Ttest_indResult(statistic=-3.036145882229939, pvalue=0.0071038221708211325)

In [51]:
# Independent two-sample t-test between two rows (samples S001 and S012),
# each row being taken across all columns of the transcriptomics frame.
scipy.stats.ttest_ind(transcriptomics.loc['S001'], transcriptomics.loc['S012'])

Ttest_indResult(statistic=-3.476946277095606, pvalue=0.0005075430552923089)

In [62]:
def dataFrameTtest(df, gene1, gene2):
    """Run an independent two-sample t-test between two columns of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    gene1, gene2 : column labels
        The two columns whose values are compared.

    Returns
    -------
    The object returned by scipy.stats.ttest_ind, exposing ``statistic`` and
    ``pvalue``. It is also printed, so existing notebook usage that relies on
    the printed line keeps working.
    """
    first = np.asarray(df[gene1])
    second = np.asarray(df[gene2])

    result = scipy.stats.ttest_ind(first, second)
    print(result)

    # Hand the result back so callers can use the statistic and p-value
    # instead of only reading them off the printed output.
    return result
    
# Compare the A1BG and A2M columns of the transcriptomics frame.
dataFrameTtest(transcriptomics, 'A1BG', 'A2M')

In [None]:
def findOutliers(df, num_sd=2):
    """Print, for every column of df, its mean, SD and the values outside mean +/- num_sd * SD.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; each column is analysed independently.
    num_sd : number, optional
        Half-width of the accepted band in standard deviations. Defaults to
        2, i.e. two-sigma outliers.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    for col in df:
        column = df[col]

        # Compute the statistics once per column and report the real values.
        # ddof=0 is the population standard deviation, which is what np.std
        # uses by default.
        mean = column.mean()
        sd = column.std(ddof=0)
        high_threshold = mean + num_sd * sd
        low_threshold = mean - num_sd * sd

        outside_band = (column > high_threshold) | (column < low_threshold)
        outliers = column[outside_band].values

        # str() keeps the report working for non-string column labels.
        print("Gene: " + str(col))
        print("Mean: " + str(mean))
        print("SD: " + str(sd))
        print("Outliers: ")
        print(outliers)
        print("\n")

    # TODO: build a binary outlier / non-outlier frame from the same
    # thresholds; a per-value df.replace() loop was tried and was too slow.
# Print the per-column outlier report for every column of the transcriptomics frame.
findOutliers(transcriptomics)