In [1]:
import pandas as pd
import re
import numpy
import pickle

In [2]:
from nltk.corpus import stopwords
import spacy

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
df =  pd.read_csv('base_data/pride_table.csv').astype(str)

In [5]:
df.head(2)

Unnamed: 0,dataset_id,sample_protocol,data_protocol,description,instruments,exp_types,quant_methods,labhead_fullname
0,PXD000001,Not available,Two extra files have been added post-publicati...,Expected reporter ion ratios: Erwinia peptides...,"LTQ Orbitrap Velos, instrument model",Bottom-up proteomics,,
1,PXD000002,Not available,Not available,Human saliva samples from a 36 year old male w...,"instrument model, LTQ Velos",Bottom-up proteomics,,


In [6]:
method = df['sample_protocol'][(df['sample_protocol'] != 'Not available') & (df['sample_protocol'] != 'nan')].reset_index(drop=True)

In [7]:
# list of stop words
stop_words = stopwords.words('english')

# Import spacy model
nlp = spacy.load('en_core_web_lg')

In [8]:
def lemmatize_text(text):
    doc = nlp(text)
    lemm_text = [token.lemma_.lower() for token in doc]
    
    lemm_text = ' '.join(lemm_text)
    lemm_text = lemm_text.replace(' - ', '-')
    lemm_text = lemm_text.replace(' .', '.')
    lemm_text = lemm_text.replace(' ,', ',')
    lemm_text = lemm_text.replace('( ', '(')
    lemm_text = lemm_text.replace(' )', ')')
    lemm_text = lemm_text.replace(' / ', '/')
    
    lemm_text = lemm_text.replace('\u2009', '')    # This is a special case applicable to iloc[0]
    
    return lemm_text

In [9]:
test_text = method.iloc[2]
test_text

'In-gel digests were performed as described in standard protocols. Briefly, following the SDS-PAGE and washing of the excised gel slices proteins were reduced by adding 10 mM DTT (Sigma Aldrich) prior to alkylation with 55 mM iodoacetamide (Sigma Aldrich). After washing and shrinking of the gel pieces with 100% acetonitrile trypsin (Sequencing Grade Modified, Promega) was added and proteins were digested overnight in 40 mM ammoniumbicarbonate at 37°C. For protein identification samples were directly used for nano-ESI-LC-MS/MS. Each sample was first desalted on-line by a C18 microcolumn (300 µm i.d. x 5 mm, packed with C18 PepMap™, 5 µm, 100 Å by LC Packings). Peptides were then separated on a C18 reversed phase column via a linear acetonitrile gradient (UltiMate 3000 system (Dionex) and column (75 µm i.d. x 15 cm, packed with C18 PepMap™, 3 µm, 100 Å by LC Packings) before MS and MS/MS spectra were recorded on an Oribitrap mass spectrometer (Thermo Electron). '

In [10]:
lemmatize_text(test_text)

'in-gel digest be perform as describe in standard protocol. briefly, follow the sds-page and washing of the excise gel slice protein be reduce by add 10 mm dtt (sigma aldrich) prior to alkylation with 55 mm iodoacetamide (sigma aldrich). after wash and shrink of the gel piece with 100 % acetonitrile trypsin (sequencing grade modified, promega) be add and protein be digest overnight in 40 mm ammoniumbicarbonate at 37 ° c. for protein identification sample be directly use for nano-esi-lc-ms/ms. each sample be first desalt on-line by a c18 microcolumn (300 µm i.d. x 5 mm, pack with c18 pepmap ™, 5 µm, 100 å by lc packings). peptide be then separate on a c18 reverse phase column via a linear acetonitrile gradient (ultimate 3000 system (dionex) and column (75 µm i.d. x 15 cm, pack with c18 pepmap ™, 3 µm, 100 å by lc packings) before ms and ms/ms spectra be record on an oribitrap mass spectrometer (thermo electron).'

In [11]:
test_doc = [lemmatize_text(method.iloc[i]) for i in range(10)]

In [12]:
test_doc

['the crude membrane from 5 p56-p70 glun1tap/tap mouse forebrain be re-suspend in 12.5  ml buffer h and extract with 12.5  ml 2 % deoxycholate, 100  mm nacl, 50  mm tris. cl ph8 for 1  h at 6  ° c. total extract be centrifuge at 120,000 g. for 40  min at 8  ° c. condition for immuno-capture, wash and peptide-antigen exchange elution be screen use a high-throughput purification robot (magic sample processor, invitrogen). for 25  ml glun1tap/tap extract supernatant, 80  μg mouse flag antibody be couple to 30  mg (500  μl) protein g magnetic bead (invitrogen). receptor be capture from extract supernatant for 2  h. the bead be wash three time with 5  ml wash buffer (0.37 % w/v sodium deoxycholate, 0.05  mg.ml−1 lipid (1:1:3 popc : pope : pog), 150  mm nacl, 50  mm tris. cl ph8). flag capture complex be elute with 600  μl wash buffer supplement with 0.2  mg.ml−1 flag peptide for 2  h at 6  ° c. eluate be buffer exchange to remove the flag elution peptide and concentrate with a 100-kda mwco 

In [13]:
vectorizer = TfidfVectorizer(
    analyzer='word', 
    strip_accents='ascii',
    ngram_range=(2, 2),
    stop_words='english')

In [14]:
vectorizer.fit(test_doc)
bigram_names = vectorizer.get_feature_names()

In [15]:
print(len(bigram_names))
bigram_names

1877


['000 000',
 '000 10',
 '000 40',
 '000 400',
 '000 automatic',
 '000 count',
 '000 ion',
 '000 ms',
 '000g supernatant',
 '000g ultracentrifugation',
 '030 ml',
 '05 mg',
 '075 mm',
 '10 15',
 '10 25',
 '10 50',
 '10 75',
 '10 acetonitrile',
 '10 bis',
 '10 cm',
 '10 fbs',
 '10 healthy',
 '10 lane',
 '10 min',
 '10 minute',
 '10 mm',
 '10 result',
 '10 select',
 '10 silicatip',
 '10 solvent',
 '100 000',
 '100 10',
 '100 20',
 '100 25',
 '100 250',
 '100 30',
 '100 80',
 '100 acetonitrile',
 '100 id',
 '100 kda',
 '100 lc',
 '100 mg',
 '100 mm',
 '100 pore',
 '100 sample',
 '100 xp',
 '1000 laser',
 '1000 mm',
 '100200 thaliana',
 '104 ion',
 '106 maximum',
 '12 cm',
 '12 fraction',
 '12 light',
 '12 ml',
 '12 nupage',
 '12 sds',
 '12 setup',
 '120 000',
 '135 minute',
 '15 20',
 '15 cm',
 '15 min',
 '15 ml',
 '15 ng',
 '150 mm',
 '150 molm2s1',
 '150 ms',
 '16 100',
 '1600 laser',
 '17 000g',
 '17 fix',
 '176 spot',
 '180 20',
 '1d lc',
 '1d plus',
 '20 000',
 '20 20',
 '20 25',
 '20

In [16]:
vectorizer.vocabulary_

{'crude membrane': 529,
 'membrane p56': 1013,
 'p56 p70': 1202,
 'p70 glun1tap': 1203,
 'glun1tap tap': 770,
 'tap mouse': 1678,
 'mouse forebrain': 1124,
 'forebrain suspend': 717,
 'suspend 12': 1668,
 '12 ml': 54,
 'ml buffer': 1071,
 'buffer extract': 381,
 'extract 12': 680,
 'ml deoxycholate': 1074,
 'deoxycholate 100': 561,
 '100 mm': 42,
 'mm nacl': 1097,
 'nacl 50': 1146,
 '50 mm': 161,
 'mm tris': 1103,
 'tris cl': 1753,
 'cl ph8': 457,
 'ph8 total': 1271,
 'total extract': 1729,
 'extract centrifuge': 681,
 'centrifuge 120': 434,
 '120 000': 58,
 '000 40': 2,
 '40 min': 148,
 'min condition': 1035,
 'condition immuno': 497,
 'immuno capture': 823,
 'capture wash': 415,
 'wash peptide': 1843,
 'peptide antigen': 1244,
 'antigen exchange': 309,
 'exchange elution': 660,
 'elution screen': 643,
 'screen use': 1495,
 'use high': 1804,
 'high throughput': 796,
 'throughput purification': 1705,
 'purification robot': 1367,
 'robot magic': 1436,
 'magic sample': 972,
 'sample proc

In [17]:
vectorizer.idf_

array([2.70474809, 2.70474809, 2.70474809, ..., 2.70474809, 2.70474809,
       2.70474809])

In [18]:
tfidf = vectorizer.transform(test_doc)

In [19]:
print(tfidf.toarray().shape)
tfidf.toarray()

(10, 1877)


array([[0.        , 0.        , 0.05627674, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.05281834],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

#### Test case to figure out processing time

In [20]:
len(method)

4390

In [21]:
n = len(method)

In [22]:
%%time
doc = [lemmatize_text(method.iloc[i]) for i in range(n)]

CPU times: user 23min 38s, sys: 21min 58s, total: 45min 36s
Wall time: 5min 54s


In [23]:
%%time
vectorizer = TfidfVectorizer(analyzer='word',  strip_accents='ascii', ngram_range=(2, 2), stop_words='english')
bigram_tfidf = vectorizer.fit_transform(doc)

CPU times: user 1.5 s, sys: 277 ms, total: 1.78 s
Wall time: 1.43 s


In [24]:
%%time
vectorizer2 = CountVectorizer(analyzer='word',  strip_accents='ascii', ngram_range=(2, 2), stop_words='english')
bigram_count= vectorizer2.fit_transform(doc)

CPU times: user 1.45 s, sys: 9.29 ms, total: 1.46 s
Wall time: 1.46 s


In [25]:
x = pd.DataFrame(bigram_count.toarray(), columns=vectorizer2.get_feature_names())

In [49]:
x.shape

(4390, 219664)

In [26]:
x.sum(axis=0).sort_values(ascending=False).head(50)

ms ms                    5004
formic acid              3378
mass spectrometer        2594
lc ms                    2479
50 mm                    1995
thermo fisher            1805
thermo scientific        1801
flow rate                1663
ms analysis              1617
fisher scientific        1604
et al                    1526
10 mm                    1519
30 min                   1465
10 min                   1438
room temperature         1432
mass spectrometry        1316
nl min                   1163
mm tris                  1154
100 mm                   1143
ammonium bicarbonate     1121
reverse phase            1102
ms scan                  1097
collision energy         1078
ltq orbitrap             1073
mm ammonium              1031
sds page                  917
data dependent            916
tris hcl                  916
lysis buffer              914
peptide elute             885
20 mm                     882
spectrometer thermo       869
dynamic exclusion         866
final conc

In [27]:
[col for col in x.columns if 'label' in col]

['04 label',
 '09 label',
 '10 label',
 '100 unlabeled',
 '10plex label',
 '10plex labeling',
 '10plex labelling',
 '11 label',
 '116 label',
 '117 label',
 '118 label',
 '12 label',
 '12 labelling',
 '121 label',
 '126 label',
 '127 label',
 '127 labelling',
 '128 label',
 '129n label',
 '130 label',
 '131 label',
 '131 labeling',
 '13c label',
 '13c15n label',
 '13cd3 label',
 '14g label',
 '14n label',
 '14n labeling',
 '14n unlabelled',
 '150 label',
 '15mm unlabeled',
 '15n label',
 '15n labeling',
 '15n labelling',
 '15n2 label',
 '15n4 labell',
 '16 label',
 '16 labelling',
 '16h label',
 '17odya label',
 '1830 label',
 '18o label',
 '18o labeling',
 '18o labelling',
 '20 label',
 '200 label',
 '2005 label',
 '2009 label',
 '2012 label',
 '2013 label',
 '213 label',
 '24 labelling',
 '2425 label',
 '24h label',
 '25 label',
 '28 label',
 '2dge label',
 '2mega labeling',
 '30 labeling',
 '30 labelling',
 '300 unlabeled',
 '300ul labeling',
 '35 label',
 '350 unlabelled',
 '37 lab

In [28]:
x['label free'].sum(axis=0)

183

In [29]:
[col for col in x.columns if 'spectra' in col]

['000 spectra',
 '10 spectra',
 '100ms spectra',
 '145 spectra',
 '1500 spectra',
 '1600 spectra',
 '2017 spectral',
 '250 spectra',
 '27 spectra',
 '275 spectra',
 '28 spectra',
 '2kv spectra',
 '30 spectra',
 '30 spectral',
 '3spectra maximum',
 '45 spectra',
 '50 spectra',
 '60 spectra',
 '6965 spectral',
 '70 spectra',
 '8902345 spectra',
 'abundant spectra',
 'acquire spectra',
 'acquisition spectral',
 'active spectra',
 'additional spectral',
 'agilent spectra',
 'aldrich spectra',
 'analysis spectra',
 'analysis spectral',
 'assay spectramax',
 'assign spectra',
 'base spectral',
 'cad spectra',
 'charge spectra',
 'cid spectra',
 'cluster spectra',
 'collect spectra',
 'configuration spectra',
 'construct spectral',
 'coverage spectral',
 'dda spectra',
 'dda spectral',
 'deconvolute spectra',
 'dependent spectra',
 'dependent spectral',
 'dionex spectra',
 'disabled spectra',
 'dissociation spectra',
 'duration spectra',
 'ecd spectra',
 'energy spectra',
 'etd spectra',
 'ex

In [30]:
x[['spectral count', 'spectral counting', 'spectral abundance']].sum(axis=0)

spectral count        3
spectral counting     5
spectral abundance    1
dtype: int64

Some regex to filter columns

In [46]:
with_nubers = [col for col in x.columns if re.match(r'(?:.*\d+.* [A-z]*)|(?:[A-z]* .*\d+.*)', col)]

In [48]:
print(len(with_nubers))
with_nubers

62258


['00 00',
 '00 01',
 '00 12',
 '00 15',
 '00 1800',
 '00 2000',
 '00 2425',
 '00 2426',
 '00 38',
 '00 50',
 '00 5l',
 '00 65',
 '00 assess',
 '00 centrifugation',
 '00 concentrated',
 '00 correspond',
 '00 flow',
 '00 ftms',
 '00 hrs',
 '00 instrument',
 '00 kv',
 '00 ld',
 '00 measurement',
 '00 ml',
 '00 mobile',
 '00 noon',
 '00 ph',
 '00 place',
 '00 pron',
 '00 sample',
 '00 soft',
 '00 sun',
 '00 tryptic',
 '000 000',
 '000 0000',
 '000 0e4',
 '000 0e5',
 '000 10',
 '000 100',
 '000 10min',
 '000 10minutes',
 '000 110',
 '000 12',
 '000 120',
 '000 15',
 '000 150',
 '000 15min',
 '000 15oc',
 '000 17',
 '000 17500',
 '000 18',
 '000 1h',
 '000 1h30',
 '000 20',
 '000 200',
 '000 200m',
 '000 20a',
 '000 20min',
 '000 21',
 '000 22',
 '000 24',
 '000 25',
 '000 250',
 '000 2aa',
 '000 2min',
 '000 30',
 '000 300',
 '000 30min',
 '000 35',
 '000 350',
 '000 375',
 '000 3801',
 '000 3e6',
 '000 3min',
 '000 40',
 '000 400',
 '000 4001',
 '000 400ms',
 '000 400th',
 '000 45',
 '000 

In [50]:
len(with_nubers)/x.shape[1]

0.28342377449195133

In [52]:
with_nubers2 = [col for col in x.columns if re.match(r'.*\d+.*', col)]
print(len(with_nubers2))
len(with_nubers2)/x.shape[1]

62258


0.28342377449195133

In [53]:
with_nubers2

['00 00',
 '00 01',
 '00 12',
 '00 15',
 '00 1800',
 '00 2000',
 '00 2425',
 '00 2426',
 '00 38',
 '00 50',
 '00 5l',
 '00 65',
 '00 assess',
 '00 centrifugation',
 '00 concentrated',
 '00 correspond',
 '00 flow',
 '00 ftms',
 '00 hrs',
 '00 instrument',
 '00 kv',
 '00 ld',
 '00 measurement',
 '00 ml',
 '00 mobile',
 '00 noon',
 '00 ph',
 '00 place',
 '00 pron',
 '00 sample',
 '00 soft',
 '00 sun',
 '00 tryptic',
 '000 000',
 '000 0000',
 '000 0e4',
 '000 0e5',
 '000 10',
 '000 100',
 '000 10min',
 '000 10minutes',
 '000 110',
 '000 12',
 '000 120',
 '000 15',
 '000 150',
 '000 15min',
 '000 15oc',
 '000 17',
 '000 17500',
 '000 18',
 '000 1h',
 '000 1h30',
 '000 20',
 '000 200',
 '000 200m',
 '000 20a',
 '000 20min',
 '000 21',
 '000 22',
 '000 24',
 '000 25',
 '000 250',
 '000 2aa',
 '000 2min',
 '000 30',
 '000 300',
 '000 30min',
 '000 35',
 '000 350',
 '000 375',
 '000 3801',
 '000 3e6',
 '000 3min',
 '000 40',
 '000 400',
 '000 4001',
 '000 400ms',
 '000 400th',
 '000 45',
 '000 

So use the short form of regex...