In [1]:
import pandas as pd
import re
import numpy
import pickle

In [2]:
from nltk.corpus import stopwords
import spacy

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
# Load the PRIDE metadata table, then cast every column to str so that all
# fields can be handled as text (missing values end up as the string 'nan').
df = pd.read_csv('base_data/pride_table.csv')
df = df.astype(str)

In [5]:
# Show the first two records to check the column layout.
df.head(2)

Unnamed: 0,dataset_id,sample_protocol,data_protocol,description,instruments,exp_types,quant_methods,labhead_fullname
0,PXD000001,Not available,Two extra files have been added post-publicati...,Expected reporter ion ratios: Erwinia peptides...,"LTQ Orbitrap Velos, instrument model",Bottom-up proteomics,,
1,PXD000002,Not available,Not available,Human saliva samples from a 36 year old male w...,"instrument model, LTQ Velos",Bottom-up proteomics,,


In [6]:
# Keep only sample protocols that contain real text: drop the 'Not available'
# placeholder and the 'nan' strings, then renumber the result from 0.
method = (
    df.loc[~df['sample_protocol'].isin(['Not available', 'nan']), 'sample_protocol']
      .reset_index(drop=True)
)

In [7]:
# English stop-word list from the NLTK stopwords corpus
stop_words = stopwords.words('english')

# Load spaCy's 'en_core_web_lg' English pipeline; lemmatize_text() calls it as `nlp`
nlp = spacy.load('en_core_web_lg')

In [8]:
def lemmatize_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Every token is replaced by its lower-cased lemma and the lemmas are joined
    with single spaces. The spaces this join introduces around hyphens,
    slashes, parentheses, full stops and commas are then removed, and
    thin-space characters (U+2009) are dropped.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    str
        The lower-cased, lemmatized text with the spacing fixes applied.
    """
    # (old, new) pairs, applied one after another in exactly this order.
    spacing_fixes = (
        (' - ', '-'),
        (' .', '.'),
        (' ,', ','),
        ('( ', '('),
        (' )', ')'),
        (' / ', '/'),
        ('\u2009', ''),    # thin space; special case applicable to iloc[0]
    )

    cleaned = ' '.join(token.lemma_.lower() for token in nlp(text))
    for old, new in spacing_fixes:
        cleaned = cleaned.replace(old, new)

    return cleaned

In [9]:
# Smoke-test the lemmatizer on (up to) the first 10 protocols. Slicing with
# .iloc[:10] instead of indexing .iloc[i] for i in range(10) avoids an
# IndexError when fewer than 10 protocols survive the filtering step.
test_doc = [lemmatize_text(text) for text in method.iloc[:10]]

#### Test case to figure out processing time

In [10]:
# Number of sample-protocol texts held in `method`
len(method)

4390

In [11]:
# Number of texts to lemmatize in the timed run (here: all of them)
n = len(method)

In [12]:
%%time
# Lemmatize the first n protocols (n == len(method), i.e. the whole corpus).
doc = [lemmatize_text(text) for text in method.iloc[:n]]

CPU times: user 10min 52s, sys: 12min 2s, total: 22min 55s
Wall time: 3min 12s


In [13]:
%%time
# TF-IDF weights over word n-grams of the lemmatized protocols.
# NOTE: ngram_range=(3, 3) produces trigrams, although the result is named
# `bigram_tfidf`.
vectorizer = TfidfVectorizer(
    analyzer='word',
    strip_accents='ascii',
    ngram_range=(3, 3),
    stop_words='english',
)
bigram_tfidf = vectorizer.fit_transform(doc)

CPU times: user 984 ms, sys: 231 ms, total: 1.21 s
Wall time: 941 ms


In [14]:
%%time
# Raw occurrence counts over the same word n-grams.
# NOTE: ngram_range=(3, 3) produces trigrams, although the result is named
# `bigram_count`.
vectorizer2 = CountVectorizer(
    analyzer='word',
    strip_accents='ascii',
    ngram_range=(3, 3),
    stop_words='english',
)
bigram_count = vectorizer2.fit_transform(doc)

CPU times: user 877 ms, sys: 18.5 ms, total: 896 ms
Wall time: 896 ms


In [15]:
# Document-by-trigram count table: one row per protocol, one column per trigram.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on old installations.
_get_names = getattr(vectorizer2, 'get_feature_names_out', None) or vectorizer2.get_feature_names
# NOTE(review): .toarray() materialises the full dense matrix; this can need a
# lot of memory when the trigram vocabulary is large.
x = pd.DataFrame(bigram_count.toarray(), columns=_get_names())

In [16]:
# Corpus-wide total of every trigram column, 50 largest totals first.
(
    x.sum(axis=0)
     .sort_values(ascending=False)
     .head(50)
)

lc ms ms                           866
thermo fisher scientific           514
ms ms analysis                     422
mm ammonium bicarbonate            352
acetonitrile formic acid           325
mass spectrometer thermo           315
ltq orbitrap velos                 287
tris hcl ph                        248
ms ms scan                         237
mm tris hcl                        235
300 nl min                         224
50 mm ammonium                     180
protease inhibitor cocktail        165
50 mm tris                         162
flow rate 300                      160
linear ion trap                    159
rate 300 nl                        154
collision induce dissociation      154
protein concentration determine    151
normalize collision energy         148
sds page gel                       142
10 mm dtt                          141
orbitrap mass spectrometer         140
tandem mass spectrometry           140
accord manufacturer instruction    140
ltq orbitrap xl          

In [20]:
# All trigram columns whose name begins with 'label free'.
x.columns[x.columns.str.startswith('label free')].tolist()

['label free amino',
 'label free analysis',
 'label free approach',
 'label free data',
 'label free differential',
 'label free experiment',
 'label free expression',
 'label free itraq',
 'label free lc',
 'label free mass',
 'label free multiscreen',
 'label free optional',
 'label free protein',
 'label free proteome',
 'label free proteomic',
 'label free quantification',
 'label free quantitation',
 'label free quantitative',
 'label free relative',
 'label free repeat',
 'label free silac',
 'label free strategy']

In [21]:
# Corpus-wide totals of the trigrams that begin with 'label free'.
x.loc[:, x.columns.str.startswith('label free')].sum(axis=0)

label free amino              2
label free analysis           6
label free approach           2
label free data               1
label free differential       1
label free experiment         1
label free expression         4
label free itraq              1
label free lc                 4
label free mass               4
label free multiscreen        1
label free optional           1
label free protein            1
label free proteome           1
label free proteomic          2
label free quantification    20
label free quantitation       5
label free quantitative       4
label free relative           2
label free repeat             1
label free silac              1
label free strategy           1
dtype: int64

In [22]:
# All trigram columns whose name begins with 'spectra'.
x.columns[x.columns.str.startswith('spectra')].tolist()

['spectra 000 000',
 'spectra 000 count',
 'spectra 10 abundant',
 'spectra 106 charge',
 'spectra 12 precursor',
 'spectra 15 ion',
 'spectra 17500 resolution',
 'spectra 20 abundant',
 'spectra 20 datum',
 'spectra 200 precursor',
 'spectra 2002000 acquire',
 'spectra 300 1200',
 'spectra 300 1600',
 'spectra 300 1650',
 'spectra 300 1700',
 'spectra 300 1800',
 'spectra 300 2000',
 'spectra 300 500',
 'spectra 3001500 acquire',
 'spectra 3001650 70',
 'spectra 3001650 analyze',
 'spectra 3001700 acquire',
 'spectra 3002000 acquire',
 'spectra 3002000 intact',
 'spectra 3102000 acquire',
 'spectra 335 1800',
 'spectra 3351800 profile',
 'spectra 350 1250',
 'spectra 350 1250m',
 'spectra 350 1500',
 'spectra 350 1600',
 'spectra 350 1850',
 'spectra 350 2000',
 'spectra 350 600',
 'spectra 375 1600',
 'spectra 375 2000',
 'spectra 400 2000',
 'spectra 400 800',
 'spectra 4001600 acquire',
 'spectra 50 5000',
 'spectra 7500 resolution',
 'spectra 8plex 300',
 'spectra abundant ion',
 

In [25]:
# Corpus-wide totals of the trigrams that begin with 'spectral count'.
x.loc[:, x.columns.str.startswith('spectral count')].sum(axis=0)

spectral count analysis       1
spectral count great          1
spectral counting approach    1
spectral counting depict      1
spectral counting3 briefly    1
dtype: int64

In [26]:
# All trigram columns whose name begins with 'spectrum'.
x.columns[x.columns.str.startswith('spectrum')].tolist()

['spectrum 100 2000',
 'spectrum 100ms maximum',
 'spectrum 100ms minimum',
 'spectrum 20 abundant',
 'spectrum 35 collision',
 'spectrum 3501700 orbitrap',
 'spectrum 3505000 resolution',
 'spectrum 4002000 follow',
 'spectrum 8s cycle',
 'spectrum abundant multiply',
 'spectrum accumulate mass',
 'spectrum acquire mass',
 'spectrum acquire orbitrap',
 'spectrum acquire precursor',
 'spectrum acquire resolution',
 'spectrum average 1600',
 'spectrum charge state',
 'spectrum choose subsequent',
 'spectrum collect orbitrap',
 'spectrum complete fully',
 'spectrum count 73',
 'spectrum cysteine cathepsin',
 'spectrum data type',
 'spectrum dynamic exclusion',
 'spectrum follow ethcd',
 'spectrum fragmentation ms',
 'spectrum isolation fragmentation',
 'spectrum isolation window',
 'spectrum ltq orbitrap',
 'spectrum make pseudo',
 'spectrum mass range',
 'spectrum match import',
 'spectrum match unique',
 'spectrum maximum 20',
 'spectrum meet specific',
 'spectrum metalloproteinase inh