In [1]:
import pandas as pd
import altair as alt

In [2]:
import data_loader

In [3]:
def set_cutoff(cutoff, df, prog_type):
    #set cutoff
    if prog_type=="msgf":
        df = df[df.QValue <= cutoff]
    if prog_type=="spectro":
        df = df[df["PEP.QValue"] <= cutoff]

    return df

In [4]:
def print_stats(df, prog_type):
    if prog_type == "msgf":
        number_pep = df['new_peptide'].nunique()
        number_spectra = df["SpecID"].nunique()
  
    if prog_type == "spectro":
        number_pep = df['PEP.StrippedSequence'].nunique()
        number_spectra = df['PSM.MS2ScanNumber'].nunique()
       
    print("Number of unique peptides:", number_pep, "   Number of unique spectra:", number_spectra)
    return([number_pep, number_spectra])
    

In [5]:
def return_stats(df, prog_type):
    if prog_type == "msgf":
        number_pep = df['new_peptide'].nunique()
        number_spectra = df["SpecID"].nunique()
  
    if prog_type == "spectro":
        number_pep = df['PEP.StrippedSequence'].nunique()
        number_spectra = df['PSM.MS2ScanNumber'].nunique()
       
    return([number_pep, number_spectra])

In [6]:
df = data_loader.parse_msgfplus("singleCell_1", .01)
print_stats(df, "msgf")

df_cut = set_cutoff(.01, df, "msgf")
print_stats(df_cut, "msgf")

Number of unique peptides: 2068    Number of unique spectra: 2290
Number of unique peptides: 2068    Number of unique spectra: 2290


[2068, 2290]

In [7]:
df = data_loader.parse_spectromine("singleCell_1", .01)
print_stats(df, "spectro")

df_cut = set_cutoff(.01, df, "spectro")
print_stats(df_cut, "spectro")

Number of unique peptides: 2727    Number of unique spectra: 3252
Number of unique peptides: 2727    Number of unique spectra: 3252


[2727, 3252]

In [8]:
#let's make some graphs or something. I want to be able to compare the peptide and spectra before and after cutoff. 
#Maybe we should also do one for beofre any modifications. 

In [24]:
#look through all the files and get pep and scan #s
files = ["singleCell_1", "singleCell_2", "singleCell_3", "singleCell_4", 
         "50ng_1", "50ng_2", "50ng_3", "50ng_4", "50ng_5"]

file_names = []
pep_no_cutoff = []
scan_no_cutoff = []
pep_with_cutoff =[]
scan_with_cutoff = []
prog_type = []

for file in files:
    print(file)
    file_names.append(file)
    prog_type.append("spectro")
    df = data_loader.parse_spectromine_remove_decoy(file)
    pep_no_cutoff.append(df['PEP.StrippedSequence'].nunique())
    scan_no_cutoff.append(df['PSM.MS2ScanNumber'].nunique())
    
    df = data_loader.parse_spectromine(file, .01)
    pep_with_cutoff.append(df['PEP.StrippedSequence'].nunique())
    scan_with_cutoff.append(df['PSM.MS2ScanNumber'].nunique())
  
    file_names.append(file)
    prog_type.append("msgf")
    df = data_loader.parse_msgfplus_remove_decoy(file)
    pep_no_cutoff.append(df['new_peptide'].nunique())
    scan_no_cutoff.append(df["SpecID"].nunique())

    df = data_loader.parse_msgfplus(file, .01)
    pep_with_cutoff.append(df['new_peptide'].nunique())
    scan_with_cutoff.append(df["SpecID"].nunique())

singleCell_1
singleCell_2
singleCell_3
singleCell_4
50ng_1
50ng_2
50ng_3
50ng_4
50ng_5


In [12]:
source = pd.DataFrame({
    'file': file_names,
    'prog_type': prog_type,
    
    "pep_no_cutoff":pep_no_cutoff,
    "scan_no_cutoff":scan_no_cutoff,
    "pep_with_cutoff":pep_with_cutoff,
    "scan_with_cutoff":scan_with_cutoff
    
})

In [13]:
alt.Chart(source).mark_bar().encode(
    x='prog_type',
    y='pep_with_cutoff',
    column="file"
)


In [25]:
alt.Chart(source).mark_bar().encode(
    x='prog_type',
    y='pep_no_cutoff',
    column="file"
)



ValueError: file encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.

alt.Chart(...)

In [None]:
stop

In [14]:
source

Unnamed: 0,file,prog_type,pep_no_cutoff,scan_no_cutoff,pep_with_cutoff,scan_with_cutoff
0,singleCell_1,spectro,6230,5809,2727,3252
1,singleCell_1,msgf,5540,5759,2068,2290
2,singleCell_2,spectro,6174,5635,2515,2909
3,singleCell_2,msgf,5391,5503,1641,1759
4,singleCell_3,spectro,5837,5331,2354,2696
5,singleCell_3,msgf,5343,5413,1446,1547
6,singleCell_4,spectro,4694,4414,2254,2578
7,singleCell_4,msgf,4827,5104,1472,1571
8,50ng_1,spectro,24844,15186,8541,10713
9,50ng_1,msgf,13102,13894,8755,9577


In [15]:
#look through all the files and get pep and scan #s
files = ["singleCell_1", "singleCell_2", "singleCell_3", "singleCell_4", 
         "50ng_1", "50ng_2", "50ng_3", "50ng_4", "50ng_5"]

file_names = []
scan_type = []
pep_type = []
pep_count = []
scan_count = []
prog_type = []

for file in files:
    print(file)
    file_names.append(file)
    prog_type.append("spectro")
    scan_type.append("no_cutoff")
    pep_type.append("no_cutoff")
    df = data_loader.parse_spectromine_remove_decoy(file)
    pep_count.append(df['PEP.StrippedSequence'].nunique())
    scan_count.append(df['PSM.MS2ScanNumber'].nunique())
    
    file_names.append(file)
    prog_type.append("spectro")
    scan_type.append("with_cutoff")
    pep_type.append("with_cutoff")
    df = data_loader.parse_spectromine(file, .01)
    pep_count.append(df['PEP.StrippedSequence'].nunique())
    scan_count.append(df['PSM.MS2ScanNumber'].nunique())


    file_names.append(file)
    prog_type.append("msgf")
    scan_type.append("no_cutoff")
    pep_type.append("no_cutoff")
    df = data_loader.parse_msgfplus_remove_decoy(file)
    pep_count.append(df['new_peptide'].nunique())
    scan_count.append(df["SpecID"].nunique())

    file_names.append(file)
    prog_type.append("msgf")
    scan_type.append("with_cutoff")
    pep_type.append("with_cutoff")
    df = data_loader.parse_msgfplus(file, .01)
    pep_count.append(df['new_peptide'].nunique())
    scan_count.append(df["SpecID"].nunique())

singleCell_1
singleCell_2
singleCell_3
singleCell_4
50ng_1
50ng_2
50ng_3
50ng_4
50ng_5


In [16]:
source = pd.DataFrame({
    'file': file_names,
    'prog_type': prog_type,
    
    "scan_type":scan_type,
    "pep_type":pep_type,

    "pep_count":pep_count,
    "scan_count":scan_count
})

In [17]:
single_cell_1 = source[source["file"]=="singleCell_1"]
single_cell_1

Unnamed: 0,file,prog_type,scan_type,pep_type,pep_count,scan_count
0,singleCell_1,spectro,no_cutoff,no_cutoff,6230,5809
1,singleCell_1,spectro,with_cutoff,with_cutoff,2727,3252
2,singleCell_1,msgf,no_cutoff,no_cutoff,5540,5759
3,singleCell_1,msgf,with_cutoff,with_cutoff,2068,2290


In [18]:
alt.Chart(source).mark_bar().encode(
    x='prog_type:O',
    y='pep_count',
    column='pep_type',
)

In [19]:
source = pd.DataFrame({
    'file': file_names,
    'prog_type': prog_type,
    
    "scan_type":scan_type,
    "pep_type":pep_type,

    "pep_count":pep_count,
    "scan_count":scan_count
})

In [20]:
alt.Chart(source).mark_bar().encode(
    x='a',
    y='b'
)

ValueError: a encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.

alt.Chart(...)

In [21]:
import altair as alt
from vega_datasets import data

source = data.barley()

alt.Chart(source).mark_bar().encode(
    x='year:O',
    y='sum(yield):Q',
    color='year:N',
    column='site:N'
)

In [22]:
source

Unnamed: 0,yield,variety,year,site
0,27.00000,Manchuria,1931,University Farm
1,48.86667,Manchuria,1931,Waseca
2,27.43334,Manchuria,1931,Morris
3,39.93333,Manchuria,1931,Crookston
4,32.96667,Manchuria,1931,Grand Rapids
...,...,...,...,...
115,58.16667,Wisconsin No. 38,1932,Waseca
116,47.16667,Wisconsin No. 38,1932,Morris
117,35.90000,Wisconsin No. 38,1932,Crookston
118,20.66667,Wisconsin No. 38,1932,Grand Rapids
