In [1]:
import pandas as pd
import data_loader as dl
import altair as alt

In [2]:
def filter_data(df, cutoff, msfragger=False):
    """Return the non-decoy rows of ``df`` that pass the score cutoff, one per scan.

    The steps run in this order:
      1. keep rows whose ``decoy`` value equals ``False``;
      2. keep only the first row seen for each ``scan`` value
         (presumably the input is already sorted best-score-first, so the
         first row is the highest-scoring one -- confirm against the loader);
      3. threshold on the score held in the fourth column (``df.columns[3]``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``decoy`` and ``scan`` columns and have its score in the
        fourth column position.
    cutoff : float
        Threshold applied to the score column.
    msfragger : bool, default False
        When False, rows with ``score <= cutoff`` are kept. Otherwise rows
        with ``score >= 1 - cutoff`` are kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the caller's frame is not modified.
    """
    # Step 1: non-decoy rows only.
    targets = df.loc[df["decoy"] == False]
    # Step 2: first occurrence of every scan number.
    one_per_scan = targets.drop_duplicates(subset=["scan"], keep="first")
    # Step 3: the score is located by position, not by name.
    score = one_per_scan[one_per_scan.columns[3]]
    passing = score <= cutoff if msfragger == False else score >= 1 - cutoff
    return one_per_scan[passing]


In [3]:
# Load the "2ng" table produced by dl.clean_msgfplus, then keep its non-decoy,
# one-row-per-scan entries whose fourth-column score is <= 0.01.
msgf = dl.clean_msgfplus("2ng")
msgf_filter = filter_data(msgf, .01)


In [4]:
# Load the "2ng" table produced by dl.clean_spectromine, then keep its non-decoy,
# one-row-per-scan entries whose fourth-column score is <= 0.01.
spec = dl.clean_spectromine("2ng")
spec_filter = filter_data(spec, .01)

In [5]:
# Load the "2ng" table produced by dl.clean_msfragger and filter it.
# msfragger=True makes filter_data keep rows with score >= 1 - cutoff instead
# of score <= cutoff; the per-file summary loop later in this notebook filters
# this tool the same way, so the two results now agree.
msf = dl.clean_msfragger("2ng")
msf_filter = filter_data(msf, .01, msfragger=True)

In [6]:
# Load the "2ng" table produced by dl.clean_metamorph, then keep its non-decoy,
# one-row-per-scan entries whose fourth-column score is <= 0.01.
meta = dl.clean_metamorph("2ng")
meta_filter = filter_data(meta, .01)

In [7]:
# Load the "2ng" table produced by dl.clean_maxquant and display it as-is
# (no filtering applied) so its columns can be inspected.
mq = dl.clean_maxquant("2ng")
mq

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decoy"] = df.apply(lambda row: make_decoy_col_maxquant(row), axis=1)


Unnamed: 0,decoy,scan,peptide,PEP
19243,True,5,DNLCSCCSGM,0.289592
19244,True,6,CSMALWQDVCLQHFLHMCPR,0.296596
19245,False,7,NEDEDSPNK,1.773420
19246,True,8,VAHMYLASLFSAEHTRDSINTSEYEWGK,0.112992
19247,False,9,VLLPAIKK,0.175843
...,...,...,...,...
59385,True,49333,HRFLLIQIR,0.098582
59386,False,49335,,
59387,True,49342,GUGGAMPPPSPR,0.281515
59388,False,49347,,


In [8]:
# For every input file and every search tool, count the unique peptides and
# unique scans that survive filter_data (non-decoy, one row per scan, score
# cutoff), then chart the scan counts.
files = [".2ng", "2ng"]
SCORE_CUTOFF = .01

# (chart label, data_loader cleaner, extra keyword arguments for filter_data).
# Only the msfragger table is filtered with msfragger=True, i.e. keeping rows
# whose score is >= 1 - cutoff rather than <= cutoff.
search_tools = [
    ("msgf", dl.clean_msgfplus, {}),
    ("spectro", dl.clean_spectromine, {}),
    ("msfragger", dl.clean_msfragger, {"msfragger": True}),
    ("metamorph", dl.clean_metamorph, {}),
    ("maxq", dl.clean_maxquant, {}),
]

# One record per (file, tool). Building the frame once from records avoids
# keeping four parallel lists in lock-step across copy-pasted stanzas.
records = []
for file in files:
    print(file)
    for tool, load, filter_kwargs in search_tools:
        filtered = filter_data(load(file), SCORE_CUTOFF, **filter_kwargs)
        records.append({
            "file": file,
            "prog_type": tool,
            "pep": filtered["peptide"].nunique(),
            "scan": filtered["scan"].nunique(),
        })

# Explicit column list keeps the frame's shape stable even if `files` is empty.
source = pd.DataFrame(records, columns=["file", "prog_type", "pep", "scan"])

# Completely filtered: unique scans per tool, one facet per input file.
alt.Chart(source).mark_bar().encode(
    x='prog_type',
    y='scan',
    column="file"
).properties(
    title="Unique scans per search tool (decoys and duplicate scans removed, score cutoff applied)"
)

.2ng


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["decoy"] = df.apply(lambda row: make_decoy_col_maxquant(row), axis=1)


2ng


In [9]:
def filter_data_no_cutoff(df):
    """Return the non-decoy rows of ``df``, one per scan, with no score threshold.

    Rows are kept when their ``decoy`` value equals ``False``; after that only
    the first row seen for each ``scan`` value is retained (presumably the
    input is sorted best-score-first -- confirm against the loader).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``decoy`` and ``scan`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the caller's frame is not modified.
    """
    is_target = df["decoy"] == False
    return df[is_target].drop_duplicates(subset=["scan"], keep="first")

In [10]:
# For every input file and every search tool, count the unique peptides and
# unique scans that remain after filter_data_no_cutoff (non-decoy, one row per
# scan, NO score cutoff), then chart the scan counts.
files = [".2ng", "2ng"]

# (chart label, data_loader cleaner) for each search tool being compared.
search_tools = [
    ("msgf", dl.clean_msgfplus),
    ("spectro", dl.clean_spectromine),
    ("msfragger", dl.clean_msfragger),
    ("metamorph", dl.clean_metamorph),
    ("maxq", dl.clean_maxquant),
]

# One record per (file, tool). Building the frame once from records avoids
# keeping four parallel lists in lock-step across copy-pasted stanzas.
records = []
for file in files:
    print(file)
    for tool, load in search_tools:
        filtered = filter_data_no_cutoff(load(file))
        records.append({
            "file": file,
            "prog_type": tool,
            "pep": filtered["peptide"].nunique(),
            "scan": filtered["scan"].nunique(),
        })

# Explicit column list keeps the frame's shape stable even if `files` is empty.
source = pd.DataFrame(records, columns=["file", "prog_type", "pep", "scan"])

# Decoys and duplicate scans removed only -- no score cutoff is applied here.
alt.Chart(source).mark_bar().encode(
    x='prog_type',
    y='scan',
    column="file"
).properties(
    title="Unique scans per search tool (decoys and duplicate scans removed, no score cutoff)"
)

.2ng
2ng
