In [1]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt

In [29]:
# notebook parameters: the feature to inspect and the SQLite database to query
feature_id = 23
DB_NAME = "/home/ubuntu/UPS2_allion/UPS2_allion-features-1-1097.sqlite"

# load the tab-separated MSCypher output, reading every column as a string
mscypher_filename = "/home/ubuntu/UPS2_allion/MSCypher.txt"
mscypher_df = pd.read_table(
    mscypher_filename,
    sep="\t",
    dtype="str",
    engine='python',
)

In [30]:
# convert the numeric fields and sort the features by their score
def _to_numeric_if_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged if it cannot be parsed.

    Explicit replacement for the deprecated `errors='ignore'` option of
    pd.to_numeric: a column holding any non-numeric value is left as-is.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

# descending sort, so the highest-scoring row of each feature comes first
mscypher_df = (
    mscypher_df
    .apply(_to_numeric_if_possible)
    .sort_values(by=['FeatureNum', 'DiggerPepScore'], ascending=False)
)

In [31]:
# remove feature-duplicates from the MSC output, keeping the highest score
# (the first row per FeatureNum in the current row order is the one retained)
mscypher_df = mscypher_df.drop_duplicates(subset=['FeatureNum'], keep='first')

In [32]:
# preview the first five rows of the de-duplicated MSC output
mscypher_df.head(n=5)

Unnamed: 0,QueryNum,Target,Rank,NumPrecMatches,PepIDConsensus,PepPIF,MS2PIF,NumScoringFragIonMatches,NumScoringFragIonPeaks,PercentSimilarity,...,EndPos,PreRes,PostRes,EnzymeSpecificity,ProteinLength,AccNums,FastaEntryNum,UniqueID,deltaCn,XCorr
15158,15031,Target,1,0,30,0.0,0.0,0,0,-1.0,...,,,,,,,,21019,-1.0,0.0
5484,5420,Target,1,0,30,0.0,0.0,0,0,-1.0,...,,,,,,,,21018,-1.0,0.0
17443,17301,Target,1,0,30,0.0,0.0,0,0,-1.0,...,,,,,,,,21017,-1.0,0.0
16605,16465,Target,1,0,30,0.0,0.0,0,0,-1.0,...,,,,,,,,21016,-1.0,0.0
16592,16452,Target,1,0,30,0.0,0.0,0,0,-1.0,...,,,,,,,,21015,-1.0,0.0


In [33]:
# number of rows in the MSC output
mscypher_df.shape[0]

19617

In [34]:
# list the column names available in the MSC output
list(mscypher_df.columns)

['QueryNum',
 'Target',
 'Rank',
 'NumPrecMatches',
 'PepIDConsensus',
 'PepPIF',
 'MS2PIF',
 'NumScoringFragIonMatches',
 'NumScoringFragIonPeaks',
 'PercentSimilarity',
 'PepSeq',
 'deNovoSeq',
 'rnkScr',
 'pnvScr',
 'EludePepSeq',
 'VMLScore',
 'Mods',
 'NumMods',
 'NumPhosphos',
 'PhosphoMods',
 'DiggerPepScore',
 'DiggerPepHomologyScore',
 'PercolatorQValue',
 'PercolatorPEP',
 'FeatureIntensityAUC',
 'ObsRtMS2',
 'ObsRtApex',
 'PredRt',
 'MZ',
 'Charge',
 'ExpMassDa',
 'PepMassDa',
 'DeltaMassDa',
 'DeltaMassPPM',
 'C13',
 'NumDataPoints',
 'NumScans',
 'NumIsotopePeaks',
 'FileName',
 'MS2Scan',
 'RMSErrorPPM',
 'RMSErrorNumMatches',
 'TIC',
 'CompBondCleavage',
 'DeltaScore',
 'DeltaRt',
 'ChimericStatus',
 'FeatureNum',
 'MissedCleavages',
 'RPMS',
 'SVMScore',
 'SVMResult',
 'SVMSig',
 'RFFeatureScore',
 'RFPSMScore',
 'ModPepID',
 'Rawfilename',
 'RFRank',
 'RFFeatureFDR',
 'RFPsmFDR',
 'TargetVotes',
 'DecoyVotes',
 'RFPrediction',
 'MAErrorPPM',
 'AltDeltaScore',
 'FragMZ'

In [35]:
# keep only the score, feature number, fragment and mass-delta columns, as an independent copy
msc_subset_df = mscypher_df.loc[
    :,
    [
        "DiggerPepScore",
        "FeatureNum",
        "FragMZ",
        "FragInt",
        "FragError",
        "FragIonTypes",
        "FragPos",
        "FragCharge",
        "DeltaMassDa",
        "DeltaMassPPM",
    ],
].copy()

In [36]:
# preview the first five rows of the column subset
msc_subset_df.head(n=5)

Unnamed: 0,DiggerPepScore,FeatureNum,FragMZ,FragInt,FragError,FragIonTypes,FragPos,FragCharge,DeltaMassDa,DeltaMassPPM
15158,0.0,21928,0.0,0.0,0.0,x;x,0,0,0.0,0.0
5484,0.0,21927,0.0,0.0,0.0,x;x,0,0,0.0,0.0
17443,0.0,21926,0.0,0.0,0.0,x;x,0,0,0.0,0.0
16605,0.0,21925,0.0,0.0,0.0,x;x,0,0,0.0,0.0
16592,0.0,21924,0.0,0.0,0.0,x;x,0,0,0.0,0.0


In [37]:
# show the MSC row(s) whose FeatureNum equals the feature of interest
msc_subset_df.loc[msc_subset_df["FeatureNum"] == feature_id]

Unnamed: 0,DiggerPepScore,FeatureNum,FragMZ,FragInt,FragError,FragIonTypes,FragPos,FragCharge,DeltaMassDa,DeltaMassPPM
9849,0.0,23,0.0,0.0,0.0,x;x,0,0,0.0,0.0


In [56]:
# read the deconvoluted ions recorded for the feature of interest
db_conn = sqlite3.connect(DB_NAME)
try:
    # bind the feature id as a query parameter instead of formatting it into the SQL text
    ms2_peaks_df = pd.read_sql_query(
        "select feature_id,peak_id,centroid_mz,intensity from deconvoluted_ions where feature_id=?",
        db_conn,
        params=(int(feature_id),),
    )
finally:
    # always release the connection, even if the query raises
    db_conn.close()

In [57]:
# preview the first five ms2 peaks loaded for the feature
ms2_peaks_df.head(n=5)

Unnamed: 0,feature_id,peak_id,centroid_mz,intensity
0,23,166,201.122,182097
1,23,798,229.117,34762
2,23,11827,229.119,20207
3,23,13963,234.15,18504
4,23,275,235.107,112724


In [58]:
# start with an empty fragments frame that has the expected fragment columns
msc_fragments_df = pd.DataFrame(
    data=[],
    columns=['FragMZ', 'FragInt', 'FragError', 'FragIonTypes', 'FragPos', 'FragCharge'],
)

In [59]:
# display the current contents of msc_fragments_df
msc_fragments_df

Unnamed: 0,FragMZ,FragInt,FragError,FragIonTypes,FragPos,FragCharge


In [42]:
# break out the fragments reported by MSC for this feature
# (take the first row whose FeatureNum matches, as a Series, and display it)
msc_fragments = msc_subset_df.loc[msc_subset_df["FeatureNum"] == feature_id].iloc[0]
msc_fragments

DiggerPepScore           0
FeatureNum              23
FragMZ            0.000000
FragInt           0.000000
FragError         0.000000
FragIonTypes           x;x
FragPos                  0
FragCharge               0
DeltaMassDa              0
DeltaMassPPM             0
Name: 9849, dtype: object

In [43]:
def _split_fragment_field(value, cast):
    """Split a ';'-separated fragment field into a list of `cast`-converted items.

    `value` is passed through str() first, so the split also works when the
    earlier numeric conversion turned the column into numbers rather than
    leaving it as strings (a number has no .split method).
    """
    return [cast(item) for item in str(value).split(';')]

# unpack each ';'-separated fragment field of the selected MSC row into a typed list
FragMZ = _split_fragment_field(msc_fragments.FragMZ, float)
FragInt = _split_fragment_field(msc_fragments.FragInt, float)
FragError = _split_fragment_field(msc_fragments.FragError, float)
FragIonTypes = _split_fragment_field(msc_fragments.FragIonTypes, str)
FragPos = _split_fragment_field(msc_fragments.FragPos, int)
FragCharge = _split_fragment_field(msc_fragments.FragCharge, int)

In [44]:
# assemble one row per fragment from the parallel lists
# (zip stops at the shortest list, so extra items in longer lists are dropped)
msc_fragments_df = pd.DataFrame(
    data=list(zip(FragMZ, FragInt, FragError, FragIonTypes, FragPos, FragCharge)),
    columns=['FragMZ', 'FragInt', 'FragError', 'FragIonTypes', 'FragPos', 'FragCharge'],
)

In [45]:
# display the fragments frame (one row per fragment)
msc_fragments_df

Unnamed: 0,FragMZ,FragInt,FragError,FragIonTypes,FragPos,FragCharge
0,0.0,0.0,0.0,x,0,0


In [46]:
# round the join column to match the ms2 peaks with the fragments reported by MSC
# (both m/z columns are rounded to 3 decimal places and stored as new columns)
msc_fragments_df["FragMZ_round"] = msc_fragments_df["FragMZ"].round(decimals=3)
ms2_peaks_df["centroid_mz_round"] = ms2_peaks_df["centroid_mz"].round(decimals=3)

In [47]:
# match up the ms2 peaks with the fragments reported by MSC
# (left join on the rounded m/z: every ms2 peak is kept, matched or not)
ms2_peaks_msc_fragments_df = ms2_peaks_df.merge(
    msc_fragments_df,
    how='left',
    left_on=['centroid_mz_round'],
    right_on=['FragMZ_round'],
)

In [48]:
# display the ms2 peaks joined with the MSC fragments;
# peaks without a matching fragment have NaN in the Frag* columns (left join)
ms2_peaks_msc_fragments_df

Unnamed: 0,feature_id,peak_id,centroid_mz,intensity,centroid_mz_round,FragMZ,FragInt,FragError,FragIonTypes,FragPos,FragCharge,FragMZ_round
0,23,1,588.361,8997131,588.361,,,,,,,
1,23,2,861.007,10594937,861.007,,,,,,,
2,23,3,861.508,10319433,861.508,,,,,,,
3,23,4,775.952,8247937,775.952,,,,,,,
4,23,5,776.454,7359559,776.454,,,,,,,
5,23,6,588.862,6130181,588.862,,,,,,,
6,23,7,641.379,5337905,641.379,,,,,,,
7,23,8,862.008,5489020,862.008,,,,,,,
8,23,9,641.045,4903578,641.045,,,,,,,
9,23,10,641.713,3218387,641.713,,,,,,,


In [49]:
# show only the ms2 peaks that were matched to an MSC fragment
ms2_peaks_msc_fragments_df.loc[ms2_peaks_msc_fragments_df["FragMZ"].notna()]

Unnamed: 0,feature_id,peak_id,centroid_mz,intensity,centroid_mz_round,FragMZ,FragInt,FragError,FragIonTypes,FragPos,FragCharge,FragMZ_round


In [50]:
# read the complete feature_list table from the database
db_conn = sqlite3.connect(DB_NAME)
try:
    feature_list_df = pd.read_sql_query("select * from feature_list", db_conn)
finally:
    # always release the connection, even if the query raises
    db_conn.close()

In [51]:
# preview the first five rows of the feature list
feature_list_df.head(n=5)

Unnamed: 0,feature_id,charge_state,monoisotopic_mass,retention_time_secs,isotope_count,cluster_mz_centroid,cluster_summed_intensity,minimum_error,minimum_error_sulphur
0,1,2,1066.606152,590.4,6,534.310352,1985980902,0.1647301,0
1,2,3,1928.009933,535.6,7,643.677254,2425291014,7.140533e-06,0
2,3,3,2044.113664,1023.2,7,682.378497,2881543763,1.953337e-07,2
3,4,2,1741.892439,720.6,6,871.953496,1248224845,0.009673931,2
4,5,3,1927.994175,536.2,8,643.672001,461668922,1.976963e-05,0


In [52]:
# isolate the feature-level attributes reported by MSC
# (feature number, score and the two mass-delta columns), then preview them
msc_feature_df = msc_subset_df.loc[
    :,
    ["FeatureNum", "DiggerPepScore", "DeltaMassDa", "DeltaMassPPM"],
]
msc_feature_df.head(n=5)

Unnamed: 0,FeatureNum,DiggerPepScore,DeltaMassDa,DeltaMassPPM
15158,21928,0.0,0.0,0.0
5484,21927,0.0,0.0,0.0
17443,21926,0.0,0.0,0.0
16605,21925,0.0,0.0,0.0
16592,21924,0.0,0.0,0.0


In [53]:
# annotate the features with the attributes reported by MSC.
# msc_DiggerPepScore of zero or more is what MSC reported. NaN means MSC didn't report the feature at all
# (left join keeps every feature; the redundant FeatureNum key is dropped and the MSC columns get an msc_ prefix)
feature_list_msc_df = (
    feature_list_df
    .merge(msc_feature_df, how='left', left_on=['feature_id'], right_on=['FeatureNum'])
    .drop(columns=['FeatureNum'])
    .rename(columns={
        'DiggerPepScore': 'msc_DiggerPepScore',
        'DeltaMassDa': 'msc_DeltaMassDa',
        'DeltaMassPPM': 'msc_DeltaMassPPM',
    })
)

In [54]:
# preview the first five annotated features
feature_list_msc_df.head(n=5)

Unnamed: 0,feature_id,charge_state,monoisotopic_mass,retention_time_secs,isotope_count,cluster_mz_centroid,cluster_summed_intensity,minimum_error,minimum_error_sulphur,msc_DiggerPepScore,msc_DeltaMassDa,msc_DeltaMassPPM
0,1,2,1066.606152,590.4,6,534.310352,1985980902,0.1647301,0,0.0,0.0,0.0
1,2,3,1928.009933,535.6,7,643.677254,2425291014,7.140533e-06,0,29.583,0.009592,4.975
2,3,3,2044.113664,1023.2,7,682.378497,2881543763,1.953337e-07,2,,,
3,4,2,1741.892439,720.6,6,871.953496,1248224845,0.009673931,2,320.189,-0.007459,-4.282
4,5,3,1927.994175,536.2,8,643.672001,461668922,1.976963e-05,0,0.0,0.0,0.0


In [55]:
# preview the features with a score of zero or more; NaN scores fail the comparison and are excluded
feature_list_msc_df.loc[feature_list_msc_df["msc_DiggerPepScore"].ge(0)].head(n=5)

Unnamed: 0,feature_id,charge_state,monoisotopic_mass,retention_time_secs,isotope_count,cluster_mz_centroid,cluster_summed_intensity,minimum_error,minimum_error_sulphur,msc_DiggerPepScore,msc_DeltaMassDa,msc_DeltaMassPPM
0,1,2,1066.606152,590.4,6,534.310352,1985980902,0.16473,0,0.0,0.0,0.0
1,2,3,1928.009933,535.6,7,643.677254,2425291014,7e-06,0,29.583,0.009592,4.975
3,4,2,1741.892439,720.6,6,871.953496,1248224845,0.009674,2,320.189,-0.007459,-4.282
4,5,3,1927.994175,536.2,8,643.672001,461668922,2e-05,0,0.0,0.0,0.0
9,10,2,986.560834,790.0,6,494.287693,1855507701,0.020531,0,38.209,-0.003978,-4.032
