In [1]:
import sys
from statistics import mean, stdev, variance
import numpy as np
import pandas as pd
from re import sub
import math
import operator

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Master switch for the exploratory cells below: when True they load FILE with
# readin() and draw the histograms; when False those cells do nothing.
READ_IN_AND_DISPLAY=False
# Path of the tab-separated input table handed to readin(). readin() treats
# the first line as column headings and the first field of each row as its key.
FILE = "proteinGroups_Slavov_Set1_simplified.txt"

In [3]:
# NOTE(review): neither constant is referenced in the cells visible here;
# presumably they are consumed further down the notebook - confirm.
# range(1,11) yields the integers 1 through 10 (upper bound is exclusive).
COLS_TO_USE = range(1,11)
IGNORE_ZEROS = True

In [4]:
def outName(x):
    """Derive an output file name by inserting ``_output`` before the extension.

    ``"table.txt"`` becomes ``"table_output.txt"``. A name without an
    extension simply gets ``_output`` appended.

    The previous regex-based version used an unescaped ``.`` (which matches
    any character) and replaced every match, so names such as ``"abc"`` or
    ``"my-file.txt"`` were corrupted (``"a_output.c"``,
    ``"my_output.file.txt"``). Splitting on the real extension avoids that.

    Parameters
    ----------
    x : str
        Input file name or path.

    Returns
    -------
    str
        The same path with ``_output`` inserted before the final extension.
    """
    # Local import keeps this fix self-contained; the notebook's import cell
    # does not bring in ``os``.
    import os

    stem, extension = os.path.splitext(x)
    return stem + "_output" + extension

def skipZero(alist):
    """Return a new list holding the entries of ``alist`` that are not equal to 0.

    The input is only iterated, never modified; order is preserved.
    """
    kept = []
    for value in alist:
        if value != 0:
            kept.append(value)
    return kept

def skipZeroMean(l):
    """Return the arithmetic mean of the non-zero entries of ``l``.

    Zeros are discarded with ``skipZero`` before averaging. When nothing is
    left after filtering, 0 is returned instead of calling ``mean`` on an
    empty list.
    """
    nonzero = skipZero(l)
    # skipZero always returns a list, so plain truthiness is an emptiness test.
    return mean(nonzero) if nonzero else 0

def skipZeroStDev(alist):
    """Return the sample standard deviation of the non-zero entries of ``alist``.

    Zeros are discarded with ``skipZero`` first. ``statistics.stdev`` needs
    at least two data points and raises ``StatisticsError`` otherwise, so 0 is
    returned whenever fewer than two non-zero values remain. The original
    code only guarded the empty case and crashed on a single non-zero value.

    Parameters
    ----------
    alist : iterable of numbers

    Returns
    -------
    number
        ``stdev`` of the non-zero values, or 0 when there are fewer than two.
    """
    nonzero = skipZero(alist)
    if len(nonzero) < 2:
        return 0
    return stdev(nonzero)

In [5]:
def _value_at_top_fraction(descending, fraction):
    """Return the entry ``ceil(len * fraction)`` places into a descending list.

    The index is clamped to the last element so very short lists do not
    overrun; an empty list yields NaN because there is no value to report.
    """
    if not descending:
        return float('nan')
    position = math.ceil(float(len(descending)) * fraction)
    position = min(position, len(descending) - 1)
    return float(descending[position])


def get_thresholds(alist):
    """Print the 95% and 99% thresholds of ``alist``, with and without zeros.

    The values are sorted from largest to smallest; the "95% threshold" is
    the entry ``ceil(n * 0.05)`` positions from the top, and the "99%
    threshold" the entry ``ceil(n * 0.01)`` positions from the top. Both are
    printed for the full data and again after dropping zeros.

    Unlike the original, empty input, all-zero input and single-element input
    no longer raise ``IndexError``: the index is clamped to the list, and NaN
    is reported when there is no data.

    Parameters
    ----------
    alist : iterable of numbers

    Returns
    -------
    float
        The 95% threshold computed with zeros ignored (NaN if no non-zero
        values exist).
    """
    ranked = sorted(alist, reverse=True)
    print("Including Zeros: ")
    print("95% threshold: ", _value_at_top_fraction(ranked, .05))
    print("99% threshold: ", _value_at_top_fraction(ranked, .01))

    # Filtering a sorted list keeps it sorted, so no second sort is needed.
    nonzero = [x for x in ranked if (x != 0)]
    print("\nIgnoring Zeros: ")
    ninety_five = _value_at_top_fraction(nonzero, .05)
    print("95% threshold: ", ninety_five)
    print("99% threshold: ", _value_at_top_fraction(nonzero, .01))
    return ninety_five

In [6]:
def compare(selfseries, otherseries):
    """Return the mean absolute difference between two series.

    Entries are paired by indexing both arguments with 0 .. len(selfseries)-1
    and converted to ``float`` before subtracting. The sum of absolute
    differences is divided by ``len(selfseries)``.
    """
    n_points = len(selfseries)
    total_gap = 0.0  # running sum of |self - other|
    for position in range(n_points):
        own_value = float(selfseries[position])
        other_value = float(otherseries[position])
        total_gap += abs(own_value - other_value)
    return total_gap / float(n_points)

In [7]:
def get_lower(series, to_show=5):
    """Print the ``to_show`` smallest entries of ``series`` in ascending order.

    The series is sorted into a copy (the argument is not modified) and the
    leading rows are printed, followed by blank lines. Nothing is returned.

    A positional slice is used instead of ``iloc[range(...)]`` so that a
    series with fewer than ``to_show`` rows prints what it has rather than
    raising ``IndexError``.

    Parameters
    ----------
    series : pandas.Series
    to_show : int, default 5
        Maximum number of rows to print.
    """
    ordered = series.sort_values(inplace=False)
    # max(..., 0) keeps the original "print nothing" result for to_show <= 0;
    # a bare negative slice bound would instead drop rows from the end.
    print (ordered.iloc[:max(to_show, 0)], '\n\n')

In [8]:
def readin(fileName):
    """Load a tab-separated table into a float DataFrame keyed by the first column.

    The first line supplies the headings; its first field labels the key
    column and is discarded, the rest become the DataFrame columns. Every
    following line contributes one row: the first field is the row label and
    the remaining fields are parsed as floats. If a row label occurs more than
    once, the last occurrence wins.

    Fixes over the original: the file is closed via ``with``; line endings
    are stripped so the last column heading no longer carries a trailing
    newline; blank lines are skipped instead of producing a bogus row; and
    the values are really converted to ``float`` (the old conversion loop
    discarded its result).

    Parameters
    ----------
    fileName : str
        Path of the tab-separated text file.

    Returns
    -------
    pandas.DataFrame
        One row per key, one float column per heading after the first.

    Raises
    ------
    ValueError
        If a data field cannot be parsed as a float.
    """
    data = {}
    with open(fileName, 'r') as inFile:
        headings = inFile.readline().rstrip('\r\n').split('\t')
        for line in inFile:
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing one at end of file).
                continue
            row = line.split('\t')
            data[row[0]] = [float(field) for field in row[1:]]

    df = pd.DataFrame.from_dict(data, dtype = float, orient='index',columns = headings[1:])
    return df

def readin_log(fileName):
    """Load the table with ``readin`` and return its natural logarithm.

    Zero entries, whose logarithm is ``-inf``, are mapped back to 0 in the
    result. Note that an original value of exactly 1 also yields 0, so the two
    cannot be told apart afterwards. Negative inputs become NaN.

    The logarithm is taken inside ``np.errstate(divide='ignore')`` because
    the zeros are expected and handled; without it NumPy emits a
    divide-by-zero ``RuntimeWarning``. Warnings for invalid (negative) input
    are deliberately left enabled.

    Parameters
    ----------
    fileName : str
        Path passed straight to ``readin``.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``readin(fileName)``, log-transformed.
    """
    df = readin(fileName)
    with np.errstate(divide='ignore'):
        logged = np.log(df)
    dfl = logged.replace(-np.inf, 0)
    return dfl

In [9]:
# Load the raw table only when the display switch is on. The plotting cells
# that follow read the resulting global `ms3data` and are guarded by the
# same flag.
if READ_IN_AND_DISPLAY:
    ms3data = readin(FILE)
    

In [10]:
# Overview of every value in the table: histogram plus rug plot of the
# non-zero entries, a count of the zeros that were left out, and the
# 95%/99% thresholds printed by get_thresholds().
if READ_IN_AND_DISPLAY:
    #Graph data
    plt.figure(figsize=(15,7))
    # Both plots are drawn from the flattened table with zeros removed.
    plt.hist(skipZero(ms3data.values.flatten()), alpha=.5, bins = 100)
    sns.rugplot(skipZero(ms3data.values.flatten()), color="black")
    plt.title("All Non-Zero Data Points")
    plt.show()
    # Report how many zeros were excluded relative to the total cell count.
    print ("Note that %i zeros are not shown, out of a dataset of %i." % 
           (len([z for z in ms3data.values.flatten() if (z==0)]), len(ms3data.values.flatten())))
    # The returned 95% threshold is not stored here; only the printout is used.
    get_thresholds(ms3data.values.flatten())

In [11]:
if READ_IN_AND_DISPLAY:
    means_of_nonZero_by_col = ms3data.apply(skipZeroMean, axis='index')
    plt.hist(means_of_nonZero_by_col, bins=40)
    plt.title("Means of Columns (Log-Normalized)")
    plt.show()
    print(means_of_nonZero_by_col)

In [12]:
# Per-row mean of the non-zero values, shown as a histogram, followed by a
# count of rows whose mean is 0 (skipZeroMean returns 0 for all-zero rows).
if READ_IN_AND_DISPLAY:
    # axis='columns' hands each row to skipZeroMean, giving one mean per row.
    means_of_nonZero_by_row = ms3data.apply(skipZeroMean, axis='columns')
    plt.figure(figsize=(10,7))
    # Rows whose mean is 0 are dropped from the histogram.
    plt.hist(skipZero(means_of_nonZero_by_row), bins=500)
    # NOTE(review): the title says "Log-Normalized", but ms3data is loaded with
    # readin(), not readin_log() - confirm the input file is already log-scaled.
    plt.title("Means by Protein (Log-Normalized)")
    plt.show()


    print ("Note that %i zeros are not shown, out of a dataset of %i." % 
           (len([z for z in means_of_nonZero_by_row if (z==0)]), len(means_of_nonZero_by_row)))