In [1]:
import pandas as pd
import re
import numpy
import pickle

In [2]:
df =  pd.read_csv('base_data/pride_table.csv').astype(str)

In [3]:
df.head(2)

Unnamed: 0,dataset_id,sample_protocol,data_protocol,description,instruments,exp_types,quant_methods,labhead_fullname
0,PXD000001,Not available,Two extra files have been added post-publicati...,Expected reporter ion ratios: Erwinia peptides...,"LTQ Orbitrap Velos, instrument model",Bottom-up proteomics,,
1,PXD000002,Not available,Not available,Human saliva samples from a 36 year old male w...,"instrument model, LTQ Velos",Bottom-up proteomics,,


In [4]:
method = df['sample_protocol'][(df['sample_protocol'] != 'Not available') & (df['sample_protocol'] != 'nan')].reset_index(drop=True)

In [5]:
method.iloc[3]

'Conditioned media preparation and HPLC-isobaric tags for relative and absolute quantitation mass-spectrometry (iTRAQ MS). Twenty million cells were transfected with either control or Gαs siRNA, then were seeded in 100-mm cell culture plates in DMEM + 10% FBS. Two days later, cells were washed twice with pre-warmed PBS. Conditioned media was harvested another 24 h later, and centrifuged at 20,000 g for 10 min at 4°C to remove cell debris. To enrich for secreted proteins in the conditioned media, conditioned media samples were centrifuged using Amicon centrifugal filters with a 3kDa cutoff (Millipore, Billerica, MA). A total of 8 concentrated conditioned media samples were independently prepared: 4 samples from control siRNA-treated cells, and 4 samples from Gαs siRNA-treated cells. Samples were stored at –70°C until they were analyzed by iTRAQ labeling and LC-MS/MS. iTRAQ labeling – Total protein concentration of each sample was measured using the Bradford assay. A quantity of 100 µg w

In [6]:
# finding concentrations
regex1 = r'((?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%))'
text = method.iloc[3]
res = [s for s in re.findall(regex1, text)]
for conc in res:
    print(conc)

10%
0.1%
50 mM
200 mM
1%
2%
0.1%
98%
0.1%
10%
50%
90%
90%
10%
10%
70%


In [7]:
# finding buffers1
regex2 = r'((?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) [A-Za-z]+)'
text = method.iloc[3]
res = [s for s in re.findall(regex2, text)]
for buffer in res:
    print(buffer)

10% FBS
0.1% SDS
50 mM Tris
200 mM methyl
1% trifluoroacetic
2% v
0.1% v
98% v
0.1% v
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% ACN


In [8]:
# finding buffers2: add w/v, v/v
regex2 = r'((?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) \(?(?:w/v |v/v )?\)?[A-Za-z]+)'
text = method.iloc[3]
res = [s for s in re.findall(regex2, text)]
for buffer in res:
    print(buffer)

10% FBS
0.1% SDS
50 mM Tris
200 mM methyl
1% trifluoroacetic
2% v/v acetonitrile
0.1% v/v TFA
98% v/v ACN
0.1% v/v TFA
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% ACN


Except '10% at' it seems to work!

In [9]:
# Test on another text
regex2 = r'((?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) \(?(?:w/v |v/v )?\)?[A-Za-z]+)'
text = method.iloc[4]
res = [s for s in re.findall(regex2, text)]
for buffer in res:
    print(buffer)

10% Bis
5% formic
50% acetonitrile
0.1% trifluoroacetic
40% acetonitrile
0.1% formic
0.67% min


In [10]:
# some more improvements
buffer_pattern = r'((?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:of )?[A-Za-z0-9-]+)'
for i in range(30):
    text = method.iloc[i]
    res = [s for s in re.findall(buffer_pattern, text)]
    print('===== for text %d =====' % i)
    for b in res:
        print(b)

===== for text 0 =====
2% deoxycholate
0.37% w/v sodium
50% v/v acetonitrile
2% v/v formic
2% v/v acetonitrile
===== for text 1 =====
===== for text 2 =====
10 mM DTT
55 mM iodoacetamide
100% acetonitrile
40 mM ammoniumbicarbonate
===== for text 3 =====
10% FBS
0.1% SDS
50 mM Tris
200 mM methyl
1% trifluoroacetic
2% v/v acetonitrile
0.1% v/v TFA
98% v/v ACN
0.1% v/v TFA
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% ACN
===== for text 4 =====
10% Bis-Tris
5% formic
50% acetonitrile
0.1% trifluoroacetic
40% acetonitrile
0.1% formic
0.67% min
===== for text 5 =====
100 mM solution
250 mM sucrose
10 mM triethanolamine
12% SDS
0.1% formic
0.1% formic
50% B
===== for text 6 =====
2% acetonitrile
0.1% formic
2% to
50% Acetonitrile
0.1% formic
20 mM KH2PO4
20% ACN
1000 mM KCl
2% ACN
0.1% FA
2% to
36% ACN
0.1% FA
===== for text 7 =====
===== for text 8 =====
20 mM Tris
20 mM NaCl
12% NuPAGE
25 mM ammonium
30% acetonitrile
25 mM ABC
80% acetonitrile
25 mM ABC
10 mM DTT
25 mM ABC


#### cases to remove
'10% at'
'0.67% min' (min-1)
'90% by'
'10% to'
'50% B' (could be included)
'35% for'
'12% gel' (could be included in a different category, not as buffer)
'12% gradient' (could be included in a different category, not as buffer)

In [19]:
# evensome more improvements
buffer_pattern = r'(?:(?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:and )?(?:of )?[A-Za-z0-9-]+(?: [Aa]cid)?(?: with | ?, )?)+'

for i in range(50):
    text = method.iloc[i]
    res = [s for s in re.findall(buffer_pattern, text)]
    print('===== for text %d =====' % i)
    for b in res:
        print(b)

===== for text 0 =====
2% deoxycholate, 
0.37% w/v sodium
50% v/v acetonitrile
2% v/v formic acid, 2% v/v acetonitrile
===== for text 1 =====
===== for text 2 =====
10 mM DTT
55 mM iodoacetamide
100% acetonitrile
40 mM ammoniumbicarbonate
===== for text 3 =====
10% FBS
0.1% SDS, 
50 mM Tris
200 mM methyl
1% trifluoroacetic acid
2% v/v acetonitrile
0.1% v/v TFA
98% v/v ACN, 0.1% v/v TFA
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% ACN
===== for text 4 =====
10% Bis-Tris
5% formic acid
50% acetonitrile, 
0.1% trifluoroacetic acid
40% acetonitrile
0.1% formic acid
0.67% min
===== for text 5 =====
100 mM solution
250 mM sucrose
10 mM triethanolamine
12% SDS
0.1% formic acid
0.1% formic acid
50% B
===== for text 6 =====
2% acetonitrile
0.1% formic acid
2% to
50% Acetonitrile with 0.1% formic acid
20 mM KH2PO4, 20% ACN, 
1000 mM KCl
2% ACN, 0.1% FA
2% to
36% ACN, 0.1% FA
===== for text 7 =====
===== for text 8 =====
20 mM Tris
20 mM NaCl
12% NuPAGE
25 mM ammonium
30% acetoni

In [43]:
# pattern to remove
remove_pattern = r'(?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:A|B|at|by|to|for|around|gel|gradient|solvent)'

for i in range(50):
    text = method.iloc[i]
    res = [s for s in re.findall(remove_pattern, text)]
    print('===== for text %d =====' % i)
    for b in res:
        print(b)

===== for text 0 =====
2% v/v for
===== for text 1 =====
===== for text 2 =====
===== for text 3 =====
98% v/v A
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
70% A
===== for text 4 =====
10% B
5% for
0.1% for
===== for text 5 =====
0.1% for
0.1% for
50% B
===== for text 6 =====
0.1% for
2% to
50% A
0.1% for
20% A
2% A
2% to
36% A
===== for text 7 =====
===== for text 8 =====
25 mM A
25 mM A
25 mM A
25 mM A
===== for text 9 =====
===== for text 10 =====
100% A
1% (v/v) for
10% (v/v) for
5% (v/v) A
0.1% (v/v) for
===== for text 11 =====
12% B
100% A
28% solvent
50% solvent
100% solvent
100% solvent
35% for
===== for text 12 =====
===== for text 13 =====
===== for text 14 =====
===== for text 15 =====
===== for text 16 =====
12% gradient
1% for
0.1% for
0.1% for
===== for text 17 =====
===== for text 18 =====
===== for text 19 =====
0.5% B
0.2% B
12% gel
0.1% for
0.1% for
===== for text 20 =====
===== for text 21 =====
===== for text 22 =====
===== for text 23 =====
===== for

It seems like this regex is picking up things like '0.1% formic acid' and '100% ACN' so try the one below with $

In [46]:
# with remove_pattern with $
buffer_pattern = r'(?:(?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:and )?(?:of )?[A-Za-z0-9-]+(?: [Aa]cid)?(?: with | ?, )?)+'
remove_pattern = r'(?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:A|B|at|by|to|for|around|gel|gradient|solvent){1}$'
remove_pattern_short = r'.*(?:A|B|at|by|to|for|around|gel|gradient|solvent){1}$'

for i in range(50):
    text = method.iloc[i]
    res = [s for s in re.findall(buffer_pattern, text) if re.match(remove_pattern, s)]
    print('===== for text %d =====' % i)
    for b in res:
        print(b)

===== for text 0 =====
===== for text 1 =====
===== for text 2 =====
===== for text 3 =====
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
===== for text 4 =====
===== for text 5 =====
50% B
===== for text 6 =====
2% to
2% to
===== for text 7 =====
===== for text 8 =====
===== for text 9 =====
===== for text 10 =====
===== for text 11 =====
28% solvent
50% solvent
100% solvent
100% solvent
35% for
===== for text 12 =====
===== for text 13 =====
===== for text 14 =====
===== for text 15 =====
===== for text 16 =====
12% gradient
===== for text 17 =====
===== for text 18 =====
===== for text 19 =====
12% gel
===== for text 20 =====
===== for text 21 =====
===== for text 22 =====
===== for text 23 =====
===== for text 24 =====
===== for text 25 =====
===== for text 26 =====
===== for text 27 =====
===== for text 28 =====
===== for text 29 =====
4 mM at
8 mM at
100% solvent
7% to
30% solvent
===== for text 30 =====
===== for text 31 =====
===== for text 32 =====
===== for text 3

In [49]:
# with shorter remove_pattern
buffer_pattern = r'(?:(?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:and )?(?:of )?[A-Za-z0-9-]+(?: [Aa]cid)?(?: with | ?, )?)+'
remove_pattern_short = r'^.* (?:A|B|at|by|to|for|around|gel|gradient|solvent){1}$'

for i in range(50):
    text = method.iloc[i]
    res = [s for s in re.findall(buffer_pattern, text) if re.match(remove_pattern_short, s)]
    print('===== for text %d =====' % i)
    for b in res:
        print(b)

===== for text 0 =====
===== for text 1 =====
===== for text 2 =====
===== for text 3 =====
10% to
50% gradient
90% by
90% solvent
10% at
10% solvent
===== for text 4 =====
===== for text 5 =====
50% B
===== for text 6 =====
2% to
2% to
===== for text 7 =====
===== for text 8 =====
===== for text 9 =====
===== for text 10 =====
===== for text 11 =====
28% solvent
50% solvent
100% solvent
100% solvent
35% for
===== for text 12 =====
===== for text 13 =====
===== for text 14 =====
===== for text 15 =====
===== for text 16 =====
12% gradient
===== for text 17 =====
===== for text 18 =====
===== for text 19 =====
12% gel
===== for text 20 =====
===== for text 21 =====
===== for text 22 =====
===== for text 23 =====
===== for text 24 =====
===== for text 25 =====
===== for text 26 =====
===== for text 27 =====
===== for text 28 =====
===== for text 29 =====
4 mM at
8 mM at
100% solvent
7% to
30% solvent
===== for text 30 =====
===== for text 31 =====
===== for text 32 =====
===== for text 3

This shorter pattern seems to be equally effective.

In [73]:
# same as above but this time remove the ones that match remove_
buffer_pattern = r'(?:(?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:and )?(?:of )?[A-Za-z0-9-]+(?: [Aa]cid)?(?: with | ?, )?)+'
remove_pattern_short = r'^.* (?:A|B|at|by|to|for|around|gel|gradient|solvent|NuPAGE|buffer|in){1}$'

for i in range(50):
    text = method.iloc[i, 0]
    res = [s for s in re.findall(buffer_pattern, text) if not re.match(remove_pattern_short, s)]
    print('===== for text %d =====' % i)
    for b in res:
        print(b)

===== for text 0 =====
2% deoxycholate, 
0.37% w/v sodium
50% v/v acetonitrile
2% v/v formic acid, 2% v/v acetonitrile
===== for text 1 =====
===== for text 2 =====
10 mM DTT
55 mM iodoacetamide
100% acetonitrile
40 mM ammoniumbicarbonate
===== for text 3 =====
10% FBS
0.1% SDS, 
50 mM Tris
200 mM methyl
1% trifluoroacetic acid
2% v/v acetonitrile
0.1% v/v TFA
98% v/v ACN, 0.1% v/v TFA
70% ACN
===== for text 4 =====
10% Bis-Tris
5% formic acid
50% acetonitrile, 
0.1% trifluoroacetic acid
40% acetonitrile
0.1% formic acid
0.67% min
===== for text 5 =====
100 mM solution
250 mM sucrose
10 mM triethanolamine
12% SDS
0.1% formic acid
0.1% formic acid
===== for text 6 =====
2% acetonitrile
0.1% formic acid
50% Acetonitrile with 0.1% formic acid
20 mM KH2PO4, 20% ACN, 
1000 mM KCl
2% ACN, 0.1% FA
36% ACN, 0.1% FA
===== for text 7 =====
===== for text 8 =====
20 mM Tris
20 mM NaCl
25 mM ammonium
30% acetonitrile
25 mM ABC
80% acetonitrile
25 mM ABC
10 mM DTT
25 mM ABC
55 mM iodoacetamide
25 m

In [74]:
def extract_buffers(text):
    buffer_pattern = r'(?:(?:0\.)?(?:\d{1,4}) ?(?:mM|µM|nM|%) (?:w/v\)? |\(?v/v\)? )?(?:and )?(?:of )?[A-Za-z0-9-]+(?: [Aa]cid)?(?: with | ?, )?)+'
    remove_pattern_short = r'^.* (?:A|B|at|by|to|for|around|gel|gradient|solvent|NuPAGE|buffer|in){1}$'
    
    # some mapping for acronyms
    acronym_map = {
        'formic acid': 'FA',
        'acetonitrile': 'ACN',
        'Acetonitrile': 'ACN',
        'trifluoroacetic acid': 'TFA'
    }
    
    if type(text) == str:
        buffers = ['(' + s + ')' for s in re.findall(buffer_pattern, text) if not re.match(remove_pattern_short, s)]
        return buffers
    else:
        return []

In [58]:
method = method.to_frame()

In [75]:
method['buffers'] = method['sample_protocol'].apply(lambda x: extract_buffers(x))

In [76]:
method.head(50)

Unnamed: 0,sample_protocol,buffers
0,The crude membranes from 5 P56-P70 Glun1TAP/TA...,"[(2% deoxycholate, ), (0.37% w/v sodium), (50%..."
1,"Breast cancer tissue lysates, reduction, alkyl...",[]
2,In-gel digests were performed as described in ...,"[(10 mM DTT), (55 mM iodoacetamide), (100% ace..."
3,Conditioned media preparation and HPLC-isobari...,"[(10% FBS), (0.1% SDS, ), (50 mM Tris), (200 m..."
4,Proteins were eluted from washed beads by addi...,"[(10% Bis-Tris), (5% formic acid), (50% aceton..."
5,Exosomes were isolated from 10 healthy volunte...,"[(100 mM solution), (250 mM sucrose), (10 mM t..."
6,Low resolution mass spectrometry: For the shot...,"[(2% acetonitrile), (0.1% formic acid), (50% A..."
7,"Arabidopsis plants, Wassilewskija background (...",[]
8,Zebrafish larvae were ground with a pestle in ...,"[(20 mM Tris), (20 mM NaCl), (25 mM ammonium),..."
9,Affinity purification and nano-liquid chromato...,[]


In [64]:
method.iloc[1, 0]

'Breast cancer tissue lysates, reduction, alkylation by methyl-methanethiosulfonate, trypsin digestion, iTRAQ 8-plex labeling, HILIC fractionation, LC-MS/MS on Orbitrap Velos.  Transcriptomics analysis, immunohistochemistry. Detailed description provided in the manuscript.'