In [1]:
import pandas as pd
import data_loader as dl
import altair as alt

In [2]:
def filter_data(df, cutoff, msfragger=False, prob_column='probability'):
    """Keep the best-scoring non-decoy row per scan that passes the cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "decoy", "scan" and `prob_column`.
    cutoff : float
        Score threshold. A row passes when ``score <= cutoff``; when
        `msfragger` is true it passes when ``score >= 1 - cutoff``.
    msfragger : bool, default False
        Set to True when larger values of `prob_column` are better.
    prob_column : str, default 'probability'
        Name of the score column used for sorting and filtering.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by ascending `prob_column`.
    """
    # drop decoys
    targets = df[df["decoy"] == False]
    # sort ascending by the score column
    targets = targets.sort_values(prob_column)
    # Keep one row per scan. The frame is in ascending order, so the best row
    # is the first one when lower is better and the last one when higher is
    # better (msfragger). Using keep="first" for both would retain the
    # worst-scoring MSFragger row of every duplicated scan.
    best_row = "last" if msfragger else "first"
    targets = targets.drop_duplicates(subset=["scan"], keep=best_row)
    # filter on the score
    if msfragger:
        return targets[targets[prob_column] >= 1 - cutoff]
    return targets[targets[prob_column] <= cutoff]


In [3]:
def decoy_cutoff(df, cutoff, msfragger=False):
    """Drop decoys, keep the best row per scan, and apply the score cutoff.

    Near-duplicate of filter_data with the score column fixed to
    'probability'; consider consolidating the two.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "decoy", "scan" and "probability".
    cutoff : float
        A row passes when ``probability <= cutoff``; when `msfragger` is
        true it passes when ``probability >= 1 - cutoff``.
    msfragger : bool, default False
        Set to True when a larger 'probability' is better.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by ascending 'probability'.
    """
    # drop decoys
    non_decoy = df[df["decoy"] == False]
    # sort ascending by score
    ordered = non_decoy.sort_values('probability')
    if msfragger:
        # Higher is better: in ascending order the best row of a scan is the
        # last one, so keep="first" would retain the worst duplicate.
        deduped = ordered.drop_duplicates(subset=["scan"], keep="last")
        passing = deduped['probability'] >= 1 - cutoff
    else:
        # Lower is better: the first row of a scan is its best one.
        deduped = ordered.drop_duplicates(subset=["scan"], keep="first")
        passing = deduped['probability'] <= cutoff
    return deduped[passing]

In [4]:
def count_decoy_helper(row, total):
    """Return 0 when the row's 'decoy' value equals False, otherwise 1.

    `total` is accepted for call compatibility but is not used.
    """
    is_target = row['decoy'] == False
    return 0 if is_target else 1

In [5]:
def count_decoy(df, msfragger=False):
    """Add a 'new_prob' column: running decoy count divided by the row count.

    The frame is ranked by 'probability' (best first), reduced to one row per
    scan (the best one), and every row then gets the number of decoys seen at
    or above its rank divided by the total number of remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "decoy", "scan" and "probability".
        The input frame is not modified.
    msfragger : bool, default False
        Set to True when a larger 'probability' is better, so that rows are
        ranked in descending order. The default ranks ascending.

    Returns
    -------
    pandas.DataFrame
        De-duplicated, ranked copy of `df` with the extra column 'new_prob'.
        An empty input yields an empty frame that still has 'new_prob'.
    """
    # rank best-first so keep="first" retains the best row of each scan
    ranked = df.sort_values('probability', ascending=not msfragger)
    # drop duplicate spectra, keeping the highest-scoring row
    ranked = ranked.drop_duplicates(subset=["scan"], keep="first")
    total_rows = len(ranked.index)
    # Anything whose 'decoy' value is not False counts as a decoy. Computed
    # column-wise: a row-wise apply is slow and returns a DataFrame (not a
    # Series) on an empty frame, which cannot be assigned to one column.
    decoys_so_far = ranked['decoy'].ne(False).astype(int).cumsum()
    # NOTE(review): the denominator is the total number of de-duplicated rows,
    # not the number of rows (or targets) seen so far -- confirm this is the
    # intended estimate before relying on 'new_prob' as an FDR.
    return ranked.assign(new_prob=decoys_so_far / total_rows)

# TODO: look at identical spectra and check whether the assigned peptides are similar.
# TODO: also, a peptide with and without modifications needs to be counted as 2 separate peptides.

In [14]:
msf = dl.clean_msfragger("2ng")
msf_filter = filter_data(msf, cutoff=.01, msfragger=True)
# TODO: why are there only 10 decoys?
msf_filter

Unnamed: 0,decoy,scan,peptide,probability
2641,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.17615.17615.2,FIMESGAK,0.99
13173,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.40365.40365.2,VFLENVIR,0.99
7305,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.25429.25429.2,LVVLATPQVSDSMR,0.99
9596,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.29090.29090.2,QVEDDIQQLLK,0.99
2946,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.18190.18190.2,CLTQSGIAGGYK,0.99
...,...,...,...,...
6754,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.24567.24567.2,DIISDTSGDFR,1.00
6755,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.24568.24568.2,GPVEGYEENEEFLR,1.00
6756,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.24569.24569.2,LVLLGESAVGK,1.00
6750,False,Ex_Auto_DrM3_30umT4_2ngQC_60m_half.24561.24561.2,THSDQFLVAFK,1.00


In [15]:
meta = dl.clean_metamorph("2ng")
meta_filter = filter_data(meta, cutoff=.01)
# TODO: what does Y|N mean?
meta_filter

Unnamed: 0,decoy,scan,peptide,probability
0,False,38422,AEGSDVANAVLDGADCIMLSGETAK,0.000000
1376,False,38502,VTIAQGGVLPNIQAVLLPK,0.000000
1374,False,21218,KPTDGASSSNCVTDISHLVR,0.000000
5873,False,32203,LVSSPCCIVTSTYGWTANMER,0.000000
5876,False,15587,NM+15.995M+15.995AACDPR,0.000000
...,...,...,...,...
17040,False,24809,SIDFPLTK,0.009843
14587,False,40505,DPTAVIFGEDVAFGGVFR,0.009887
14075,False,30563,EQQHVM+15.995EELFQSSFR,0.009910
18845,False,13999,EGGGGKRK,0.009977


In [16]:
mq = dl.clean_maxquant("2ng")
mq_filtered = filter_data(mq, cutoff=.01)
# Observation: no duplicate scans found.
mq_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['temp_peptide'] = df.apply(lambda row: format_oxidation(row, "Modified sequence", "(Oxidation (M))"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decoy"] = df.apply(lambda row: make_decoy_col_maxquant(row), axis=1)


Unnamed: 0,decoy,scan,peptide,probability
33067,False,18289,_PGNQNTQVTEAWNK_,1.771913e-113
53134,False,40678,_EILFSVDSSINQSIGGVILFHETLYQK_,2.478126e-82
31499,False,16565,_TNQELQEINR_,4.134555e-76
37796,False,23490,_VDATEESDLAQQYGVR_,9.773177e-67
35548,False,21018,_INEELESQYQQSMDSK_,1.043751e-66
...,...,...,...,...
34912,False,20318,_FGISSVPTK_,9.964967e-03
47473,False,34134,_DLEALMFDR_,9.964967e-03
42420,False,28576,_DIFDQLAK_,9.996017e-03
30613,False,15590,_VEEAFNCR_,9.996017e-03


In [9]:
# For every input file, count the unique peptides and scans that each search
# program reports after filter_data at a 0.01 cutoff.
files = [".2ng", "2ng"]

file_names = []
prog_type = []
pep = []
scan = []

for file in files:
    print(file)

    # Load and filter each program's results; the loaders run in the order
    # msgf, spectro, msfragger, metamorph, maxq.
    msgf = dl.clean_msgfplus(file)
    msgf_filter = filter_data(msgf, .01)

    spec = dl.clean_spectromine(file)
    spec_filter = filter_data(spec, .01)

    msf = dl.clean_msfragger(file)
    # msfragger=True: filter_data keeps rows with probability >= 1 - cutoff
    msf_filter = filter_data(msf, .01, msfragger=True)

    meta = dl.clean_metamorph(file)
    meta_filter = filter_data(meta, .01)

    mq = dl.clean_maxquant(file)
    mq_filter = filter_data(mq, .01)

    # One record per program, appended in the same order as the loads above.
    for prog_label, prog_hits in [
        ("msgf", msgf_filter),
        ("spectro", spec_filter),
        ("msfragger", msf_filter),
        ("metamorph", meta_filter),
        ("maxq", mq_filter),
    ]:
        file_names.append(file)
        prog_type.append(prog_label)
        pep.append(prog_hits["peptide"].nunique())
        scan.append(prog_hits["scan"].nunique())

source = pd.DataFrame({
    'file': file_names,
    'prog_type': prog_type,
    "pep": pep,
    "scan": scan,
})

# Unique scans per program, one facet per input file
(
    alt.Chart(source)
    .mark_bar()
    .encode(x='prog_type', y='scan', column="file")
)

.2ng


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['temp_peptide'] = df.apply(lambda row: format_oxidation(row, "Modified sequence", "(Oxidation (M))"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decoy"] = df.apply(lambda row: make_decoy_col_maxquant(row), axis=1)


2ng


In [10]:
# Unique peptides per program, one facet per input file
(
    alt.Chart(source)
    .mark_bar()
    .encode(x='prog_type', y='pep', column="file")
)

In [11]:
# Same counts as the cutoff on 'probability', but filtering on the 'new_prob'
# column that count_decoy derives from the running decoy count.
files = [".2ng", "2ng"]

file_names = []
prog_type = []
pep = []
scan = []

for file in files:
    print(file)

    # Load, annotate with 'new_prob', then filter; the loaders run in the
    # order msgf, spectro, msfragger, metamorph, maxq.
    msgf = dl.clean_msgfplus(file)
    msgf = count_decoy(msgf)
    msgf_filter = filter_data(msgf, .01, prob_column="new_prob")

    spec = dl.clean_spectromine(file)
    spec = count_decoy(spec)
    spec_filter = filter_data(spec, .01, prob_column="new_prob")

    msf = dl.clean_msfragger(file)
    # NOTE(review): count_decoy ranks rows by ascending 'probability', while
    # the MSFragger score is treated as higher-is-better elsewhere in this
    # notebook (filter_data's msfragger=True branch) -- confirm that the
    # running decoy count is computed in the intended direction here.
    msf = count_decoy(msf)
    msf_filter = filter_data(msf, .01, prob_column="new_prob")

    meta = dl.clean_metamorph(file)
    meta = count_decoy(meta)
    meta_filter = filter_data(meta, .01, prob_column="new_prob")

    mq = dl.clean_maxquant(file)
    mq = count_decoy(mq)
    mq_filter = filter_data(mq, .01, prob_column="new_prob")

    # One record per program, appended in the same order as the loads above.
    for prog_label, prog_hits in [
        ("msgf", msgf_filter),
        ("spectro", spec_filter),
        ("msfragger", msf_filter),
        ("metamorph", meta_filter),
        ("maxq", mq_filter),
    ]:
        file_names.append(file)
        prog_type.append(prog_label)
        pep.append(prog_hits["peptide"].nunique())
        scan.append(prog_hits["scan"].nunique())

source2 = pd.DataFrame({
    'file': file_names,
    'prog_type': prog_type,
    "pep": pep,
    "scan": scan,
})

# Unique scans per program, one facet per input file
(
    alt.Chart(source2)
    .mark_bar()
    .encode(x='prog_type', y='scan', column="file")
)

.2ng
2ng


In [12]:
# Unique peptides per program, one facet per input file
(
    alt.Chart(source2)
    .mark_bar()
    .encode(x='prog_type', y='pep', column="file")
)