In [1]:
import pandas as pd
import re
import numpy
import pickle

In [2]:
# Load the PRIDE metadata table and cast every column to str.
# After the cast, missing values are ordinary strings (pandas renders NaN as
# 'nan'), which is why the protocol filter further down compares against 'nan'.
df = (
    pd.read_csv('base_data/pride_table.csv')
    .astype(str)
)

In [3]:
# Peek at the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,dataset_id,sample_protocol,data_protocol,description,instruments,exp_types,quant_methods,labhead_fullname
0,PXD000001,Not available,Two extra files have been added post-publicati...,Expected reporter ion ratios: Erwinia peptides...,"LTQ Orbitrap Velos, instrument model",Bottom-up proteomics,,
1,PXD000002,Not available,Not available,Human saliva samples from a 36 year old male w...,"instrument model, LTQ Velos",Bottom-up proteomics,,


In [4]:
# Keep only the sample protocols that contain real text: drop the
# 'Not available' placeholder and the 'nan' strings, then renumber from 0 so
# that method.iloc[k] and the index label k refer to the same protocol.
method = (
    df.loc[~df['sample_protocol'].isin(['Not available', 'nan']), 'sample_protocol']
    .reset_index(drop=True)
)

In [5]:
# Display protocol 3, the text used as the working example for the regexes below.
method.iloc[3]

'Conditioned media preparation and HPLC-isobaric tags for relative and absolute quantitation mass-spectrometry (iTRAQ MS). Twenty million cells were transfected with either control or Gαs siRNA, then were seeded in 100-mm cell culture plates in DMEM + 10% FBS. Two days later, cells were washed twice with pre-warmed PBS. Conditioned media was harvested another 24 h later, and centrifuged at 20,000 g for 10 min at 4°C to remove cell debris. To enrich for secreted proteins in the conditioned media, conditioned media samples were centrifuged using Amicon centrifugal filters with a 3kDa cutoff (Millipore, Billerica, MA). A total of 8 concentrated conditioned media samples were independently prepared: 4 samples from control siRNA-treated cells, and 4 samples from Gαs siRNA-treated cells. Samples were stored at –70°C until they were analyzed by iTRAQ labeling and LC-MS/MS. iTRAQ labeling – Total protein concentration of each sample was measured using the Bradford assay. A quantity of 100 µg w

In [6]:
# finding concentrations
# A concentration is a number, an optional space and a unit (mM, µM, nM or %).
# - The number may have any decimal part ("1.5 mM", "12.5%"); accepting only a
#   leading "0." would truncate "1.5 mM" to "5 mM".
# - The micro prefix is accepted both as the micro sign (U+00B5) and as the
#   Greek letter mu (U+03BC); both spellings occur in the protocol texts.
regex1 = r'(\d+(?:\.\d+)? ?(?:mM|[\u00b5\u03bc]M|nM|%))'
text = method.iloc[3]
# re.findall already returns a list of the captured strings.
res = re.findall(regex1, text)
for conc in res:
    print(conc)

10%
0.1%
50 mM
200 mM
1%
2%
0.1%
98%
0.1%
10%
50%
90%
90%
10%
10%
70%


In [7]:
# finding buffers1
# A buffer candidate is a concentration followed by a space and one word.
# - The number may have any decimal part ("1.5 mM"), not only a leading "0.".
# - The micro prefix is accepted both as the micro sign (U+00B5) and as the
#   Greek letter mu (U+03BC).
# Known limitation of this first version: "2% v/v acetonitrile" is cut at "2% v",
# because the word after the concentration may only contain letters.
regex2 = r'(\d+(?:\.\d+)? ?(?:mM|[\u00b5\u03bc]M|nM|%) [A-Za-z]+)'
text = method.iloc[3]
# re.findall already returns a list of the captured strings.
res = re.findall(regex2, text)
for buffer in res:
    print(buffer)

10% FBS
0.1% SDS
50 mM Tris
200 mM methyl
1% trifluoroacetic
2% v
0.1% v
98% v
0.1% v
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% ACN


In [8]:
# finding buffers2: add w/v, v/v
# Concentration, then an optional "w/v" or "v/v" qualifier, then one word.
# - The qualifier may be bare or parenthesised ("2% v/v ACN", "2% (v/v) ACN").
#   The closing parenthesis is matched *before* the separating space, which is
#   where it appears in real text.
# - Parentheses are only accepted around the qualifier, so a stray "(" right
#   before the compound name no longer matches.
# - The number may have any decimal part ("1.5 mM"), not only a leading "0.".
# - The micro prefix is accepted both as the micro sign (U+00B5) and as the
#   Greek letter mu (U+03BC).
regex2 = r'(\d+(?:\.\d+)? ?(?:mM|[\u00b5\u03bc]M|nM|%) (?:\(?[wv]/v\)? )?[A-Za-z]+)'
text = method.iloc[3]
# re.findall already returns a list of the captured strings.
res = re.findall(regex2, text)
for buffer in res:
    print(buffer)

10% FBS
0.1% SDS
50 mM Tris
200 mM methyl
1% trifluoroacetic
2% v/v acetonitrile
0.1% v/v TFA
98% v/v ACN
0.1% v/v TFA
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% ACN


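Apart from false positives where the concentration is followed by an ordinary word (e.g. '10% at', '10% to', '90% by'), it seems to work!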

In [9]:
# Test on another text
# Concentration, then an optional "w/v" or "v/v" qualifier, then one word.
# - The qualifier may be bare or parenthesised ("2% v/v ACN", "2% (v/v) ACN");
#   the closing parenthesis is matched before the separating space.
# - The number may have any decimal part ("1.5 mM"), not only a leading "0.".
# - The micro prefix is accepted both as the micro sign (U+00B5) and as the
#   Greek letter mu (U+03BC).
regex2 = r'(\d+(?:\.\d+)? ?(?:mM|[\u00b5\u03bc]M|nM|%) (?:\(?[wv]/v\)? )?[A-Za-z]+)'
text = method.iloc[4]
# re.findall already returns a list of the captured strings.
res = re.findall(regex2, text)
for buffer in res:
    print(buffer)

10% Bis
5% formic
50% acetonitrile
0.1% trifluoroacetic
40% acetonitrile
0.1% formic
0.67% min


In [10]:
# some more improvements
# Concentration, optional "w/v" or "v/v" qualifier, optional "of ", then a
# compound name made of letters, digits and hyphens ("Bis-Tris", "KH2PO4").
# - The qualifier may be bare or parenthesised for *both* w/v and v/v, so
#   "4% (w/v) ..." is matched as well as "2% (v/v) ...".
# - The number may have any decimal part ("1.5 mM"), not only a leading "0.".
# - The micro prefix is accepted both as the micro sign (U+00B5) and as the
#   Greek letter mu (U+03BC).
buffer_pattern = r'(\d+(?:\.\d+)? ?(?:mM|[\u00b5\u03bc]M|nM|%) (?:\(?[wv]/v\)? )?(?:of )?[A-Za-z0-9-]+)'
# Slice instead of indexing range(30) so that fewer than 30 protocols does not
# raise IndexError; i is still the 0-based position of the text in `method`.
for i, text in enumerate(method.iloc[:30]):
    res = re.findall(buffer_pattern, text)
    print('===== for text %d =====' % i)
    for b in res:
        print(b)

===== for text 0 =====
2% deoxycholate
0.37% w/v sodium
50% v/v acetonitrile
2% v/v formic
2% v/v acetonitrile
===== for text 1 =====
===== for text 2 =====
10 mM DTT
55 mM iodoacetamide
100% acetonitrile
40 mM ammoniumbicarbonate
===== for text 3 =====
10% FBS
0.1% SDS
50 mM Tris
200 mM methyl
1% trifluoroacetic
2% v/v acetonitrile
0.1% v/v TFA
98% v/v ACN
0.1% v/v TFA
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% ACN
===== for text 4 =====
10% Bis-Tris
5% formic
50% acetonitrile
0.1% trifluoroacetic
40% acetonitrile
0.1% formic
0.67% min
===== for text 5 =====
100 mM solution
250 mM sucrose
10 mM triethanolamine
12% SDS
0.1% formic
0.1% formic
50% B
===== for text 6 =====
2% acetonitrile
0.1% formic
2% to
50% Acetonitrile
0.1% formic
20 mM KH2PO4
20% ACN
1000 mM KCl
2% ACN
0.1% FA
2% to
36% ACN
0.1% FA
===== for text 7 =====
===== for text 8 =====
20 mM Tris
20 mM NaCl
12% NuPAGE
25 mM ammonium
30% acetonitrile
25 mM ABC
80% acetonitrile
25 mM ABC
10 mM DTT
25 mM ABC


#### Cases to remove (false positives)
- '10% at'
- '0.67% min' (from "min-1", i.e. a per-minute rate, not a buffer)
- '90% by'
- '10% to'
- '50% B' (could be included)
- '35% for'
- '12% gel' (could be included in a different category, not as buffer)
- '12% gradient' (could be included in a different category, not as buffer)

In [11]:
# Display protocol 6 for manual inspection.
method.iloc[6]

"Low resolution mass spectrometry: For the shotgun proteomics analysis, six cell pellets were subject to differential detergent fractionation as described by McCarthy et al. using the detergents digitonin, Tween 20 and SDS. After each detergent application, samples were centrifuged to separate solublized proteins from cellular debris. The insoluble pellet left after treatment was subject to trypsin digestion along with the soluble fractions, but could not be quantified. Fractions were normalized to 20 μg each and trypsin digestion as described by McCarthy et al. Following digestion, each fraction was desalted using a peptide microtrap (Michrom BioResources) according to the manufacturer's instructions. After desalting, each fraction was further cleaned using a strong cation exchange (SCX) microtrap (Michrom BioResources) to remove any residual detergent, which could interfere with the mass spectrometry. Fractions were dried and resuspended in 10 μL of 2% acetonitrile (ACN), 0.1% formic

In [12]:
# Display protocol 10 for manual inspection.
method.iloc[10]

'Protein Extraction from Catheter Biofilm Small pieces of the catheter tip (two times one cm) were transferred to 1 ml of urea-containing buffer (11 M urea, 3 M thiourea, 70 mM dichlorodiphenyltrichloroethane (DDT), 4% (w/v) 3-[(3-cholamidopropyl)dimethyl¬ammonio]-1-propanesulfonate (CHAPS)). The sample was incubated on ice for 5 min, mixed intensively for 30 s and then incubated on ice for 10 min; those steps were repeated twice. Sonication was performed for 30 s and repeated five times to dissolve and lyse the cells using a Sonicator (Type UW 2070, Bandelin electronics, Berlin). Subsequently, the catheter pieces were removed and the remaining cell lysate was mixed with ice-cold acetone (1:7, v/v). After centrifugation (14,000 x g, 40 min) the resulting protein pellet was dried and solubilized in urea-containing buffer supplemented with 1% SDS. Total protein concentration was determined according to Bradford (42) employing the Coomassie PlusTM Protein Assay (Thermo Fisher Scientific I