In [1]:
import os
import mokapot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
sys.path
import data_loader as dl


This function cleans up the original 'before' data so that duplicate scans are not counted more than once. Decoys are deliberately kept, because MokaPot needs them.

In [2]:
def filter_data(df, prob_column):
    """Keep one row per scan and expose the scan number as an integer ``ScanNr``.

    Rows are ordered by ``prob_column`` in ascending order and, for every
    distinct value of ``scan``, only the first row in that order is kept.
    Decoys are intentionally NOT removed because MokaPot needs them.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``scan`` column and the column named by ``prob_column``.
    prob_column : str
        Column used to rank the rows before duplicate scans are removed.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``scan`` has been renamed to ``ScanNr`` and cast
        to ``int``; the input frame is left untouched.
    """
    ranked = df.sort_values(prob_column)
    one_per_scan = ranked.drop_duplicates(subset=["scan"], keep="first")
    #ScanNr has to be an int so the frame can be joined with the other tools' tables
    return one_per_scan.rename(columns={"scan": "ScanNr"}).astype({"ScanNr": int})

Each tool shows modifications in a different way. We have to format them uniformly so that we can compare the peptides.

In [3]:
def format_oxidation(row, column, to_replace):
    """Rewrite a tool-specific oxidation tag as the mass shift ``+15.995``.

    Parameters
    ----------
    row : mapping
        Row (e.g. a pandas Series) that holds the peptide under ``column``.
    column : str
        Name of the field containing the peptide string.
    to_replace : str
        The tool's own notation for the oxidation modification.

    Returns
    -------
    The peptide with every occurrence of ``to_replace`` substituted by
    ``+15.995``. Missing values and peptides without the tag are returned
    unchanged.
    """
    sequence = row[column]
    #missing peptides and peptides without the tag pass straight through
    if pd.isna(sequence) or to_replace not in sequence:
        return sequence
    return sequence.replace(to_replace, "+15.995")

def format_carbamidomethyl(row, column, to_replace):
    """Remove a tool-specific carbamidomethyl tag from a peptide string.

    Parameters
    ----------
    row : mapping
        Row (e.g. a pandas Series) that holds the peptide under ``column``.
    column : str
        Name of the field containing the peptide string.
    to_replace : str
        The tool's own notation for the carbamidomethyl modification.

    Returns
    -------
    The peptide with every occurrence of ``to_replace`` deleted. Missing
    values and peptides without the tag are returned unchanged.
    """
    sequence = row[column]
    #missing peptides and peptides without the tag pass straight through
    if pd.isna(sequence) or to_replace not in sequence:
        return sequence
    return sequence.replace(to_replace, "")

MetaMorpheus' peptide column has to be reformatted so that we can check whether the other tools' peptides match it.

In [4]:
def extractPeptide(row):
    """Strip the flanking fields from a ``X.SEQUENCE.Y`` peptide string.

    The sequence is taken between the FIRST and the LAST '.', not between the
    first and second. Modification masses written into the sequence, such as
    the "+15.995" produced by format_oxidation, contain a '.' themselves, and
    cutting at the second '.' would truncate the peptide at the first
    modification.

    Parameters
    ----------
    row : str
        One value of the peptide column (despite the name, a single cell,
        not a whole row).

    Returns
    -------
    The text between the first and last '.'. Values that are not strings
    (e.g. NaN) and strings with fewer than two '.' are returned unchanged.
    NOTE(review): strings with fewer than two dots are assumed to carry no
    flanking fields - confirm against the pin files.
    """
    #missing values pass through, matching the format_* helpers
    if not isinstance(row, str):
        return row

    first = row.find('.')
    last = row.rfind('.')
    #fewer than two delimiters: nothing to strip
    if first == -1 or first == last:
        return row

    return row[first + 1:last]

The next functions read in the dataframes that hold the 'before' data. This is the data that has not been run through MokaPot. We will use this to compare whether MokaPot was able to improve the number of scans below a certain cutoff.

In [5]:
#Reading in and formatting the original MetaMorpheus data
def get_mm_pin_data(file):
    """Load the MetaMorpheus pin table for ``file`` and add a comparable ``peptide`` column.

    Parameters
    ----------
    file : str
        Sample name handed to ``dl.get_pin_file`` (e.g. "2ng_rep1").

    Returns
    -------
    pandas.DataFrame
        The pin table with two extra columns, ``temp_peptide`` (oxidation tag
        rewritten) and ``peptide`` (both tags normalised, flanking fields
        stripped), ``ScanNr`` cast to int, and one row per ``ScanNr``.
    """
    #use the requested sample rather than a hard-coded one
    df = dl.get_pin_file(file)
    #the first row is skipped, as before; .copy() makes the column assignments
    #below write to an independent frame instead of a slice of the loader's frame
    df = df.iloc[1:].copy()

    #reformatting the peptide column to be comparable with the other tools
    df['temp_peptide'] = df.apply(lambda row: format_oxidation(row, "Peptide", "[Common Variable:Oxidation on M]"), axis=1)
    df["peptide"] = df.apply(lambda row: format_carbamidomethyl(row, "temp_peptide", "[Common Fixed:Carbamidomethyl on C]"), axis=1)

    #strip the flanking fields with the peptide helper (the scan-number helper was used here by mistake)
    df['peptide'] = df['peptide'].apply(extractPeptide)

    #setting column to int to be able to join with the other dfs
    df = df.astype({'ScanNr': int})

    #keep the first row encountered for every scan
    df = df.drop_duplicates(subset=["ScanNr"], keep="first")

    return df

In [6]:
#Reading in and formatting the original MsFragger data
def set_probablility(row):
    """Return ``1 - p`` where ``p`` is the row's "PeptideProphet Probability" value."""
    return 1 - row["PeptideProphet Probability"]

#pulling only scan numbers out
def extractScanNum(row):
    """Return the field after the first '.' of a dot-delimited identifier, without leading zeros.

    For example "run.01234.01234.2" gives "1234".

    Parameters
    ----------
    row : str
        One value of the scan column (despite the name, a single cell, not a
        whole row).

    Returns
    -------
    str
        The second dot-separated field with all leading zeros removed. A
        field made only of zeros yields "0", an empty field yields "", and a
        string without any '.' is treated as the field itself.
    """
    fields = row.split('.', 2)
    #no '.' at all: the value is already the bare field
    field = fields[1] if len(fields) > 1 else fields[0]

    stripped = field.lstrip("0")
    #keep a single zero so an all-zero field still converts to an integer
    if field and not stripped:
        return "0"
    return stripped

def get_original_msf_data(file):
    """Load the MsFragger results for ``file`` and reduce them to the columns used in the joined table.

    Parameters
    ----------
    file : str
        Sample name handed to ``dl.clean_msfragger``.

    Returns
    -------
    pandas.DataFrame
        One row per ``ScanNr`` with the MsFragger peptide, charge and
        intensity renamed to ``msf_*`` plus the listed score columns.
    """
    #columns kept for the joined table; names absent from the frame are simply skipped by filter()
    kept_columns = [
        'ScanNr', 'msf_peptide', 'msf_charge',
        'Peptide Length', 'Retention', 'Delta Mass',
        'Expectation', 'Hyperscore', 'Nextscore',
        'Number of Enzymatic Termini',
        'Number of Missed Cleavages', 'msf_intensity',
    ]
    #names shared with the other tools get an msf_ prefix
    msf_names = {"peptide": "msf_peptide", 'Charge': "msf_charge", 'Intensity': 'msf_intensity'}

    psms = dl.clean_msfragger(file)

    #the scan column holds a dot-delimited identifier; keep only the scan number
    psms['scan'] = psms['scan'].apply(extractScanNum)

    #NOTE(review): filter_data sorts ascending and keeps the first row per scan, so
    #unless clean_msfragger already inverts this column the LOWEST probability per
    #scan is the one retained - confirm this is intended.
    psms = filter_data(psms, 'PeptideProphet Probability')

    return psms.rename(columns=msf_names).filter(kept_columns)

In [7]:
#Reading in and formatting the original MsgfPlus data
def get_original_msg_data(file):
    """Load the MsgfPlus results for ``file`` and reduce them to the columns used in the joined table.

    Parameters
    ----------
    file : str
        Sample name handed to ``dl.clean_msgfplus``.

    Returns
    -------
    pandas.DataFrame
        One row per ``ScanNr`` (ranked by ``QValue``) with the peptide and
        charge renamed to ``msg_*`` plus the listed score columns.
    """
    #columns kept for the joined table; names absent from the frame are simply skipped by filter()
    kept_columns = [
        "ScanNr", 'msg_peptide', 'msg_charge', 'IsotopeError',
        'PrecursorError(ppm)', 'DeNovoScore', 'MSGFScore',
        'SpecEValue',
    ]
    #names shared with the other tools get an msg_ prefix
    msg_names = {"peptide": "msg_peptide", 'Charge': 'msg_charge'}

    psms = filter_data(dl.clean_msgfplus(file), "QValue")
    return psms.rename(columns=msg_names).filter(kept_columns)

In [8]:
#Reading in and formatting the original MaxQuant data
def get_original_mq_data(file):
    """Load the MaxQuant results for ``file`` and reduce them to the columns used in the joined table.

    Parameters
    ----------
    file : str
        Sample name handed to ``dl.clean_maxquant``.

    Returns
    -------
    pandas.DataFrame
        One row per ``ScanNr`` (ranked by ``PEP``) with the peptide and charge
        renamed to ``mq_*``, missing precursor intensities set to 0, plus the
        listed score columns.
    """
    mq_df = dl.clean_maxquant(file)

    #Formatting and dropping any rows that are missing the sequence.
    #Assign the result back instead of calling replace(..., inplace=True) on a
    #selected column: that chained in-place form warns in recent pandas and has
    #no effect on the frame once Copy-on-Write is enabled.
    mq_df = mq_df.assign(Sequence=mq_df['Sequence'].replace(' ', np.nan))
    mq_df = mq_df.dropna(subset=['Sequence'])

    mq_df = filter_data(mq_df, 'PEP')

    #replacing any precursor intensity that has a "nan" value with a 0
    mq_df = mq_df.assign(**{'Precursor Intensity': mq_df['Precursor Intensity'].fillna(0)})

    #names shared with the other tools get an mq_ prefix
    mq_df = mq_df.rename(columns = {"peptide": "mq_peptide", 'Charge':'mq_charge'})

    #keep only the columns used in the joined table
    mq_df = mq_df.filter(["ScanNr", 'mq_peptide', 'mq_charge', "Precursor Intensity",
                          'Score', 'Length', 'Missed cleavages', 'm/z', 'Mass',
                          'Retention time', 'Delta score'])

    return mq_df

This will check if the peptides for all the tools match for a specific scan

In [9]:
def check_peptides(row):
    """Return 1 when all four tools report the same peptide for this row, otherwise 0.

    The columns compared are ``peptide`` (MetaMorpheus), ``msg_peptide``,
    ``msf_peptide`` and ``mq_peptide``. The comparison is done pair by pair
    in that order and stops at the first mismatch.
    """
    peptide_columns = ('peptide', 'msg_peptide', 'msf_peptide', 'mq_peptide')
    all_agree = all(
        row[left] == row[right]
        for left, right in zip(peptide_columns, peptide_columns[1:])
    )
    return 1 if all_agree else 0

This cell has the names of all the saved data files after the data has been run through MokaPot. 

In [10]:
#sample names passed to every loader below; only the first 2ng replicate is enabled,
#the remaining replicates are listed but commented out
data_files = ["2ng_rep1"] #, "2ng_rep2", "2ng_rep3", "2ng_rep4", "2ng_rep5", "2ng_rep6",
             #"0.2ng_rep1", "0.2ng_rep2", "0.2ng_rep3", "0.2ng_rep4", "0.2ng_rep5", "0.2ng_rep6"]

Here we read in the results from the different tools for each file. After we read in all the data we format the data so it can all be read into a large megascript. The dataframes are joined based on their scan number. Any rows that do not have identical peptides for each tool are dropped. We then join all of the data together and save the data from each raw file into its own new megascript.  

In [11]:
#one joined table per raw file, keyed by sample name, so that processing several
#files does not discard all but the last one
megaTables = {}

for file in data_files:

    #Switching index to ScanNr to join dataframes based on their scan numbers.
    mm_df = get_mm_pin_data(file).set_index('ScanNr')
    msf_df = get_original_msf_data(file).set_index('ScanNr')
    msg_df = get_original_msg_data(file).set_index('ScanNr')
    mq_df = get_original_mq_data(file).set_index('ScanNr')

    #joining everything together; outer joins keep scans reported by any tool,
    #then ScanNr is turned back into an ordinary column
    megaTable = (
        mm_df
        .join(msg_df, how = 'outer')
        .join(msf_df, how = 'outer')
        .join(mq_df, how = 'outer')
        .reset_index()
    )

    megaTables[file] = megaTable

    #megaTable.to_csv("ultimate_megatables/ultimate_megatable_" + file + ".csv")

#megaTable stays bound to the table of the last file processed, which is what the cells below use

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['temp_peptide'] = df.apply(lambda row: format_oxidation(row, "Modified sequence", "(Oxidation (M))"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["temp_peptide"] = df["temp_peptide"].str[1:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Reverse'] = df['Reverse'].astype(str)


In [12]:
#scratch dump of the joined table for manual inspection; written to the current working directory
megaTable.to_csv('deleteMe.csv')

In [13]:
import copy
#work on an independent copy so the joined table itself is left untouched
megaTable1 = copy.deepcopy(megaTable) #copy is for testing purposes only, change when done

#dropping all the nans. We can't use something that has missing values.
#how='any' removes a row as soon as a single column is NaN; because the table was built
#with outer joins, only scans that every tool reported survive this step.
#NOTE(review): the rows displayed after this step all show Label 1 - presumably the
#decoy rows are lost here because the other tools have no entry for them; confirm,
#since mokapot needs decoys.
megaTable1.dropna(how = 'any', inplace = True)
megaTable1

Unnamed: 0,ScanNr,SpecId,Label,TotalMatchingFragmentCount,Intensity,PrecursorChargeDiffToMode,DeltaScore,Notch,PsmCount,ModsCount,...,mq_peptide,mq_charge,Precursor Intensity,Score,Length,Missed cleavages,m/z,Mass,Retention time,Delta score
746,5764,15142,1,6.0,3.0,0.0,1.0,0.0,8.0,1.0,...,SRHWYSDMR,2.0,20019.470703,0.00000,9.0,1.0,619.28019,1236.5458,26.550,0.00000
809,5863,17892,1,3.0,2.0,-1.0,0.0,0.0,2.0,0.0,...,NRPEPHSDENGSTTPK,3.0,42954.707031,16.88000,16.0,0.0,589.27484,1764.8027,26.836,12.41100
840,5902,5958,1,5.0,1.0,-1.0,3.0,0.0,8.0,0.0,...,GDTPGHATPGHGGATSSAR,3.0,97417.492188,38.27100,19.0,0.0,578.60318,1732.7877,26.933,35.96800
846,5909,5684,1,5.0,1.0,-2.0,3.0,0.0,8.0,0.0,...,GDTPGHATPGHGGATSSAR,4.0,162474.906250,45.25700,19.0,0.0,434.20421,1732.7877,26.950,40.94100
847,5910,760,1,9.0,4.0,0.0,6.0,0.0,2.0,0.0,...,AVQAQGGESQQEAQR,2.0,136786.531250,122.51000,15.0,0.0,793.87951,1585.7445,26.952,116.27000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27581,37056,17732,1,3.0,2.0,0.0,0.0,0.0,3.0,0.0,...,VPSTEAEALASSLMGLFEK,2.0,33769.656250,29.68900,19.0,0.0,990.50335,1978.9921,105.040,29.68900
27584,37060,2103,1,6.0,2.0,-1.0,4.0,0.0,3.0,0.0,...,VPSTEAEALASSLMGLFEK,3.0,135830.937500,52.78400,19.0,0.0,660.67132,1978.9921,105.050,50.86700
27589,37067,14944,1,3.0,2.0,-1.0,1.0,0.0,1.0,0.0,...,QITDNIFLTTAEVIAQQVSDK,3.0,59460.105469,22.11800,21.0,0.0,778.74443,2333.2115,105.070,19.67700
27598,37089,17689,1,5.0,3.0,0.0,0.0,0.0,1.0,0.0,...,DITYFIQQLLR,2.0,38699.367188,60.97300,11.0,0.0,705.39300,1408.7715,105.130,41.18600


In [14]:
#flag every scan with 1 when all tools agree on the peptide and 0 otherwise;
#the actual filtering on this flag happens in a later cell
megaTable1 = megaTable1.assign(peptide_match=megaTable1.apply(check_peptides, axis=1))

In [15]:
#remove every peptide column except the main 'peptide' one
megaTable1 = megaTable1.drop(
    columns=['msg_peptide', 'msf_peptide', 'mq_peptide', 'Peptide', 'temp_peptide']
)
#keep only the scans whose peptides matched across all tools; these are the rows to be run through MokaPot
all_tools_agree = megaTable1['peptide_match'] == 1
megaTable1 = megaTable1.loc[all_tools_agree]

In [16]:
#hand the in-memory table to mokapot.read_pin, then run mokapot.brew on the result
#NOTE(review): the recorded run of this cell ended with
#"ValueError: No decoy PSMs were available for training." and the rows displayed for
#megaTable1 all show Label 1 - presumably the dropna / peptide_match filtering removed
#every decoy; confirm before relying on this cell.
ultimate_df = mokapot.read_pin(megaTable1)
ultimate_results, models = mokapot.brew(ultimate_df)

Traceback (most recent call last):
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packages/sklearn/svm/_classes.py", line 233, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packages/sklearn/svm/_base.py", line 926, in _fit_liblinear
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0

Traceback (most recent call last):
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packag

Traceback (most recent call last):
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packages/sklearn/svm/_classes.py", line 233, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/Users/daishavanderwatt/anaconda3/lib/python3.8/site-packages/sklearn/svm/_base.py", line 926, in _fit_liblinear
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0



ValueError: No decoy PSMs were available for training.

In [None]:
#number of PSMs whose mokapot q-value is at or below the 0.01 cutoff
passes_cutoff = ultimate_results.psms['mokapot q-value'] <= 0.01
len(ultimate_results.psms[passes_cutoff])

In [None]:
#display the object returned by mokapot.read_pin
ultimate_df

In [17]:
#display the filtered table that was passed to mokapot.read_pin
megaTable1

Unnamed: 0,ScanNr,SpecId,Label,TotalMatchingFragmentCount,Intensity,PrecursorChargeDiffToMode,DeltaScore,Notch,PsmCount,ModsCount,...,mq_charge,Precursor Intensity,Score,Length,Missed cleavages,m/z,Mass,Retention time,Delta score,peptide_match
809,5863,17892,1,3.0,2.0,-1.0,0.0,0.0,2.0,0.0,...,3.0,42954.707031,16.880,16.0,0.0,589.27484,1764.8027,26.836,12.411,1
840,5902,5958,1,5.0,1.0,-1.0,3.0,0.0,8.0,0.0,...,3.0,97417.492188,38.271,19.0,0.0,578.60318,1732.7877,26.933,35.968,1
846,5909,5684,1,5.0,1.0,-2.0,3.0,0.0,8.0,0.0,...,4.0,162474.906250,45.257,19.0,0.0,434.20421,1732.7877,26.950,40.941,1
847,5910,760,1,9.0,4.0,0.0,6.0,0.0,2.0,0.0,...,2.0,136786.531250,122.510,15.0,0.0,793.87951,1585.7445,26.952,116.270,1
849,5912,484,1,10.0,4.0,0.0,7.0,0.0,10.0,0.0,...,2.0,238593.312500,183.000,15.0,0.0,856.38515,1710.7558,26.957,176.470,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27579,37052,9670,1,4.0,2.0,-1.0,2.0,1.0,3.0,0.0,...,3.0,49541.355469,27.324,19.0,0.0,660.67132,1978.9921,105.020,23.290,1
27581,37056,17732,1,3.0,2.0,0.0,0.0,0.0,3.0,0.0,...,2.0,33769.656250,29.689,19.0,0.0,990.50335,1978.9921,105.040,29.689,1
27584,37060,2103,1,6.0,2.0,-1.0,4.0,0.0,3.0,0.0,...,3.0,135830.937500,52.784,19.0,0.0,660.67132,1978.9921,105.050,50.867,1
27589,37067,14944,1,3.0,2.0,-1.0,1.0,0.0,1.0,0.0,...,3.0,59460.105469,22.118,21.0,0.0,778.74443,2333.2115,105.070,19.677,1
