In [1]:
import pandas as pd
import data_loader as dl
import pyteomics.mzml
import spectrum_utils.spectrum as sus



# Download Data
 dl.download_parsed_psm() loads the psm files (found in data/metaMorpheus_output/) into a pandas dataframe and removes unnecessary columns. It also parses the corresponding mzML file (found in data/mzMLs/) and appends the data that we need, such as precursor intensity, intensity array, and mz array.
 
The dataframe is then saved as a tsv in data/parsed_psm/

# Access Data

After running dl.download_parsed_psm(), dl.load_joined_psm_mzml(file_name) will be available to use. It accesses the parsed psms (found in data/parsed_psm) and returns them as a pandas dataframe.

The file_name identifies a single replicate of a given file type. The format is filetype_rep# (for example, 2ng_rep1). A list of all possible file_names is below:

    bulk_rep1
    bulk_rep2
    bulk_rep3

    2ng_rep1
    2ng_rep2
    2ng_rep3
    2ng_rep4
    2ng_rep5
    2ng_rep6

    0.2ng_rep1
    0.2ng_rep2
    0.2ng_rep3
    0.2ng_rep4
    0.2ng_rep5
    0.2ng_rep6

    sc_rep1
    sc_rep2
    sc_rep3
    sc_rep4
    sc_rep5

In [2]:
# Build the parsed PSM tables. Per the "Download Data" notes in this notebook,
# the result is saved as a tsv in data/parsed_psm/ and must exist before
# dl.load_joined_psm_mzml() is used.
dl.download_parsed_psm()

In [3]:
# Load the parsed PSM table for the "2ng_rep1" file name (see the list of
# valid file names in the "Access Data" notes).
rep1_2ng = dl.load_joined_psm_mzml("2ng_rep1")

# Show the available column names.
# NOTE(review): the recorded output spells the precursor intensity column
# 'precursor_intenisty' -- use that exact spelling when selecting it.
rep1_2ng.columns

Index(['File Name', 'scan', 'Scan Retention Time', 'Num Experimental Peaks',
       'Total Ion Current', 'Precursor Scan Number', 'Precursor Charge',
       'Precursor MZ', 'Precursor Mass', 'peptide',
       'PSM Count (unambiguous, <0.01 q-value)', 'Peptide Monoisotopic Mass',
       'Mass Diff (Da)', 'Mass Diff (ppm)', 'Protein Accession',
       'Protein Name', 'Gene Name', 'Decoy', 'Matched Ion Series',
       'Matched Ion Mass-To-Charge Ratios', 'Matched Ion Mass Diff (Da)',
       'Matched Ion Mass Diff (Ppm)', 'Matched Ion Intensities',
       'Matched Ion Counts', 'QValue', 'minute', 'mz_array', 'intensity_array',
       'precursor_intenisty'],
      dtype='object')

In [4]:
# Preview the first rows of the table loaded for "2ng_rep1".
rep1_2ng.head()

Unnamed: 0,File Name,scan,Scan Retention Time,Num Experimental Peaks,Total Ion Current,Precursor Scan Number,Precursor Charge,Precursor MZ,Precursor Mass,peptide,...,Matched Ion Mass-To-Charge Ratios,Matched Ion Mass Diff (Da),Matched Ion Mass Diff (Ppm),Matched Ion Intensities,Matched Ion Counts,QValue,minute,mz_array,intensity_array,precursor_intenisty
0,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,16668,52.52059,101.0,335251.91071,16649,2.0,1280.62814,2559.24172,LVQDVANNTNEEAGDGTTTATVLAR,...,"[b2+1:213.15994, b3+1:341.21793, b4+1:456.2455...","[b2+1:0.00019, b3+1:-0.00040, b4+1:0.00031, b5...","[b2+1:0.89, b3+1:-1.17, b4+1:0.68, b5+1:2.38, ...","[b2+1:4194, b3+1:3583, b4+1:14005, b5+1:3302, ...",28,0.0,52,"[118.0188980102539, 124.82211303710938, 175.11...","[813.1907348632812, 718.8142700195312, 5288.52...",106894.523438
4224,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,11505,40.21101,93.0,280621.26892,11487,3.0,496.24615,1485.71662,RDPHLAC[Common Fixed:Carbamidomethyl on C]VAYER,...,"[b2+1:272.13607, b3+1:369.18884, b4+1:506.2461...","[b2+1:0.00074, b3+1:0.00074, b4+1:-0.00082];[y...","[b2+1:2.74, b3+1:2.02, b4+1:-1.63];[y1+1:-0.08...","[b2+1:7612, b3+1:1894, b4+1:6145];[y1+1:4380, ...",11,0.0,40,"[110.07154083251953, 126.54723358154297, 128.1...","[3708.7587890625, 637.7166748046875, 798.39727...",576431.75
4223,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,6188,27.61035,70.0,135619.72083,6170,2.0,505.76641,1009.51826,VEQHVVDGK,...,"[b2+1:229.11820, b3+1:357.17747, b4+1:494.2371...","[b2+1:-0.00008, b3+1:0.00061, b4+1:0.00138, b5...","[b2+1:-0.36, b3+1:1.71, b4+1:2.79, b5+1:-1.40,...","[b2+1:1630, b3+1:713, b4+1:1758, b5+1:1344, b7...",11,0.0,27,"[110.07170104980469, 117.48556518554688, 127.0...","[3160.310546875, 684.11572265625, 5652.1494140...",34953.804688
4222,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,26968,77.17926,74.0,177492.46899,26948,2.0,786.88324,1571.75192,FGFPEGSVELYAEK,...,"[b2+1:205.09730];[y1+1:147.11265, y2+1:276.157...","[b2+1:0.00014];[y1+1:-0.00015, y2+1:0.00222, y...","[b2+1:0.70];[y1+1:-1.04, y2+1:8.06, y3+1:0.35,...","[b2+1:3222];[y1+1:1959, y2+1:910, y3+1:1815, y...",11,0.0,77,"[120.0811538696289, 121.08496856689453, 125.22...","[17983.341796875, 1540.97705078125, 680.275878...",44407.707031
4221,Ex_Auto_J3_30umTB_2ngQC_60m_1-calib,12997,43.77077,82.0,202263.41882,12984,3.0,577.27851,1728.8137,NVTDVVNTC[Common Fixed:Carbamidomethyl on C]HD...,...,"[b2+1:214.11863];[y1+1:147.11272, y2+1:234.144...","[b2+1:0.00001];[y1+1:-0.00008, y2+1:-0.00019, ...","[b2+1:0.04];[y1+1:-0.57, y2+1:-0.83, y5+1:-0.4...","[b2+1:4897];[y1+1:2650, y2+1:4067, y5+1:1936, ...",11,0.0,43,"[110.07170867919922, 127.0870590209961, 127.38...","[1100.1298828125, 2687.216064453125, 728.96966...",76686.992188
