In [6]:
import sys
from statistics import mean, stdev, variance
import numpy as np
import pandas as pd
from re import sub
import math
import operator

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def readin(fileName):
    """Load a tab-separated intensity table into a float DataFrame.

    The first line of the file is the header: its first cell labels the
    row-identifier column and is discarded, the remaining cells become the
    column names. On every following non-blank line the first cell becomes
    the row label and the remaining cells are parsed as floats.

    Parameters
    ----------
    fileName : str or path-like
        Path of the tab-separated text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line (indexed by that line's first cell) and one
        float column per header cell after the first.

    Raises
    ------
    ValueError
        If a data cell cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the handle even if parsing fails part-way.
    with open(fileName, 'r') as inFile:
        # Trim only the right-hand side: an empty first header cell (a leading
        # tab) has to stay in place so the column names line up with the data.
        headings = inFile.readline().rstrip().split('\t')

        for line in inFile:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing newline at end of file) carry no data.
                continue
            row = line.split('\t')
            # A repeated row label overwrites the earlier line with the same label.
            data[row[0]] = [float(cell) for cell in row[1:]]

    df = pd.DataFrame.from_dict(data, dtype = float, orient='index',columns = headings[1:])
    return df

def readin_log(fileName):
    """Read the table at ``fileName`` with ``readin`` and return its natural log.

    Entries whose logarithm is ``-inf`` (zeros in the input) are set to 0.
    """
    intensities = readin(fileName)
    logged = np.log(intensities)
    return logged.replace(-np.inf, 0)

In [3]:
def by_sample(ms3data, technical_replicates):
    """Split a table into one float DataFrame per sample.

    ``technical_replicates`` maps each sample label to the positional column
    indices of ``ms3data`` that make up that sample. The result maps the same
    labels to a DataFrame holding just those columns, keyed by column name.
    """
    def _replicate_frame(positions):
        # Gather the requested columns by position, keyed by their own names.
        selected = (ms3data.iloc[:, position] for position in positions)
        by_name = {column.name: column for column in selected}
        return pd.DataFrame.from_dict(by_name, dtype = float)

    return {
        label: _replicate_frame(technical_replicates[label])
        for label in technical_replicates
    }

## Graphed types

In [None]:
def hist_vs_neg(data, channels,title="Neg Control vs Samples"):
    """Overlay one semi-transparent histogram per channel on a log-scaled x-axis.

    data:     table indexable by the keys of ``channels``
    channels: mapping of column key -> legend label
    title:    figure title

    Draws through the pyplot state machine and shows the figure.
    """
    plt.xscale('log')
    plt.title(title)

    # Log-spaced bin edges from 10**0 to 10**8, shared by every channel.
    log_bins = np.logspace(0, 8)
    for key in channels:
        intensities = np.sort(data[key].values)
        plt.hist(intensities, alpha = .5, bins=log_bins, label=channels[key])

    plt.xlabel("Intensity Value")
    plt.ylabel("Number of Proteins")
    plt.legend(loc='upper right')

    plt.show()

### ROC graphs

<img src="./figures/ex_curve.jpg" width="500" alt="Exaggerated example ROC-like curve">

In [4]:
def ROC_plot(msdata, neg_col_name, technical_replicates, rep_name, as_fraction=False):
    """Compute the points of an ROC-like curve for one sample vs. the negative control.

    Every distinct value found in the control column or in the sample is used
    as a threshold, from highest to lowest. For each threshold the function
    counts how many control values (x) and how many sample values (y) are
    strictly greater than it.

    Parameters
    ----------
    msdata : pandas.DataFrame
        Table holding the control column and the sample columns.
    neg_col_name :
        Label of the negative-control column in ``msdata``.
    technical_replicates : mapping
        Sample label -> positional column indices, as taken by ``by_sample``.
    rep_name :
        Key of ``technical_replicates`` selecting the sample to evaluate.
    as_fraction : bool
        False: x and y are absolute counts.
        True:  x is divided by the number of control entries and y by
               ``len(skipZero(sample))``.
               NOTE(review): ``skipZero`` is defined outside this function;
               presumably it drops zero entries - confirm.

    Returns
    -------
    dict
        Maps y to x. Thresholds that yield the same y overwrite one another,
        so each y keeps the x of the lowest such threshold. Plot with
        ``plt.plot(points.values(), points.keys())``.
    """
    samples = by_sample(msdata, technical_replicates)
    neg_cont = np.array(msdata.loc[:,neg_col_name])
    sample = np.array(samples[rep_name].values.flatten())

    # np.unique already returns the distinct values sorted ascending;
    # reversing walks the thresholds from highest to lowest.
    thresholds = np.unique(np.concatenate((neg_cont, sample)))[::-1]

    if as_fraction:
        # The denominators do not depend on the threshold: compute them once.
        neg_total = len(neg_cont)
        sample_total = len(skipZero(sample))

    points = {}
    for t in thresholds:
        # Vectorised "how many values lie strictly above t".
        x = np.count_nonzero(neg_cont > t)
        y = np.count_nonzero(sample > t)
        if as_fraction:
            x = x / neg_total
            y = y / sample_total
        points[y] = x

    return points

In [7]:
def ROC_all(data, neg_col, cols=list(range(0,10)), boost=None, as_fraction=False, labels=None):
    """Draw the ROC-like curve of every requested column against the negative control.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the control column and the sample columns.
    neg_col :
        Label of the negative-control column; it is never plotted itself.
    cols : iterable of int
        Positional indices of the columns to plot.
    boost : optional
        Label of a column to draw before all others, so that it is the first
        line added to the axes. It is not drawn a second time if its position
        also appears in ``cols``.
    as_fraction : bool
        True:  curves are scaled to fractions (passed on to ``ROC_plot``).
        False: curves are in absolute numbers of proteins.
    labels : mapping, optional
        Column label -> legend text. When given, every curve is labelled and
        a legend is drawn.
    """
    plt.xlabel("Control Proteins")
    plt.ylabel("Sample Proteins")

    def _draw(position, name):
        # Plot the curve of the column at `position`; `name` is its legend key.
        p = ROC_plot(data, neg_col, {'a':[position]}, 'a', as_fraction=as_fraction)
        if labels:
            plt.plot(p.values(), p.keys(), label=labels[name])
        else:
            plt.plot(p.values(), p.keys())

    boost_index = None if boost is None else data.columns.get_loc(boost)
    if boost is not None:
        _draw(boost_index, boost)

    # Position of the control column, looked up once instead of per iteration.
    neg_index = data.columns.get_loc(neg_col)
    for i in cols:
        if i != neg_index and i != boost_index:
            _draw(i, (data.columns.values)[i])

    if labels: plt.legend(loc='lower right')