# Example of getting metadata from a webpage

Data found here: http://gigadb.org/dataset/100535

In [1]:
import requests
import lxml.html as lh
import pandas as pd
import re

## Website contents

In [3]:
# URL of the dataset page that the cells below download and parse.
# NOTE(review): the trailing comment looks like an optional path suffix for a specific
# sample page — TODO confirm how it is meant to be appended.
example_url = 'http://gigadb.org/dataset/view/id/100535' #/Sample_page/33

In [4]:
# Download the dataset page.
# The timeout stops the cell from hanging indefinitely on an unresponsive server
# (requests applies no timeout by default), and raise_for_status() surfaces an HTTP
# error here rather than letting an error page be parsed further down.
page = requests.get(example_url, timeout=30)
page.raise_for_status()

In [5]:
# Parse the raw response body with lxml.html so it can be queried with XPath
doc = lh.fromstring(page.content)

In [6]:
# Every <tr> element anywhere in the document, regardless of which table it belongs to
tr_elements = doc.xpath('//tr')

In [7]:
# Number of child cells in each of the first 40 rows. The width identifies
# which table on the page a row belongs to:
#   6 -> sample info
#   7 -> file info
#   4 -> funding info
#   2 -> history info
print(list(map(len, tr_elements[:40])))

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 4, 4, 4, 2, 2, 2]


In [8]:
# The first 11 rows form the sample table: one header row followed by ten sample rows.
header_elements = tr_elements[0]
max_size = len(header_elements)  # number of cells in a sample-table row
subject_elements = tr_elements[1:11]

In [9]:
# Check that every selected sample row has the same number of cells
print(list(map(len, subject_elements)))

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]


In [10]:
# Build the (label, values) pairs that later become the DataFrame columns.

# Headers that stand in for the final table column once its text is split up
nwb_col_headers = ['Description','Age','Sex','Collection date']

# Echo every header cell so the table layout can be checked by eye
header_names = [cell.text_content() for cell in header_elements]
for idx, label in enumerate(header_names[:max_size]):
    print('%d:"%s"'%(idx,label))

# Every header cell except the last keeps its own label; the last one is
# replaced by the specific headers listed above.
columns = [(label, []) for label in header_names[:max_size-1]]
columns.extend((attr, []) for attr in nwb_col_headers)


0:"Sample ID"
1:"Taxonomic ID"
2:"Common Name"
3:"Genbank Name"
4:"Scientific Name"
5:"Sample Attributes"


In [11]:
# Show the column labels with their (still empty) value lists
columns

[('Sample ID', []),
 ('Taxonomic ID', []),
 ('Common Name', []),
 ('Genbank Name', []),
 ('Scientific Name', []),
 ('Description', []),
 ('Age', []),
 ('Sex', []),
 ('Collection date', [])]

In [12]:
# First sample row, kept for interactive inspection.
# NOTE(review): this is the whole row element, not its text, and it is not used again
# in the visible cells.
example_metadata = subject_elements[0]

In [13]:
def get_reduced_split(metadata,headers):
    """Split the text of a "Sample Attributes" cell into its attribute values.

    The text is cut at every attribute label (``'<label>:'``) and at the layout
    separators ``'+'``, ``'...'``, a double tab and a newline. Empty and
    whitespace-only fragments are discarded, the first three remaining
    fragments are dropped (they repeat information that appears again later in
    the cell), and the fragments at positions 1 and 2 of what is left are
    joined with a single space.

    Parameters
    ----------
    metadata : str
        Text content of the attributes cell.
    headers : iterable of str, str or None
        Additional attribute labels to split on. They are merged with the
        built-in labels ('Age', 'Description', 'Life stage', 'Sex',
        'Collection date'); labels already in that set, and empty labels, are
        ignored. Passing ``None`` uses only the built-in labels.

    Returns
    -------
    list of str
        The reduced fragments, in the order they appear in ``metadata``.

    Raises
    ------
    IndexError
        If exactly two fragments remain after the first three are dropped,
        because there is no third fragment to join onto the second.
    """
    # Labels that are always recognised. 'Life stage' has to stay here even
    # though callers do not list it: its value is the fragment that gets
    # joined onto the preceding one below.
    builtin_labels = ['Age','Description','Life stage','Sex','Collection date']

    # Previously the argument was overwritten and silently ignored; now it
    # extends the built-in set, so a call that passes a subset of the
    # built-in labels behaves exactly as before.
    if headers is None:
        headers = []
    elif isinstance(headers, str):
        headers = [headers]
    labels = builtin_labels + [h for h in headers if h and h not in builtin_labels]

    delimiters = [label+':' for label in labels] + ['+','...','\t\t','\n']
    regexPattern = '|'.join(map(re.escape, delimiters))

    # Drop empty and whitespace-only fragments, then the first 3 redundant ones
    fragments = [
        frag for frag in re.split(regexPattern,metadata)
        if frag and not frag.isspace()
    ][3:]

    if len(fragments) < 2:
        return fragments

    # Join fragments 1 and 2 into a single value (raises IndexError when only
    # two fragments are present, as described in the docstring)
    joined = fragments[1] + ' ' + fragments[2]
    return [fragments[0], joined] + fragments[3:]

In [14]:
# Value lists are still empty at this point; they are filled by the loop in the next cell
columns

[('Sample ID', []),
 ('Taxonomic ID', []),
 ('Common Name', []),
 ('Genbank Name', []),
 ('Scientific Name', []),
 ('Description', []),
 ('Age', []),
 ('Sex', []),
 ('Collection date', [])]

In [15]:
# Fill the column lists, one sample row at a time.
#
# Each row is assembled completely before anything is appended, so a row whose
# attribute text cannot be split into enough values raises without leaving the
# column lists at different lengths.
attr_offset = max_size - 1           # position of the "Sample Attributes" cell / first expanded column
n_attr_cols = len(nwb_col_headers)   # number of columns that cell expands into

for i, subject in enumerate(subject_elements):
    if len(subject)!=max_size:
        print('Row %s is not of size %s'%(i,max_size))
        break # should be final element if any

    cell_texts = [cell.text_content() for cell in subject.iterchildren()]

    # split "Sample attributes" (always the last cell of the row)
    split = get_reduced_split(cell_texts[attr_offset],nwb_col_headers)
    if len(split) < n_attr_cols:
        raise ValueError(
            'Row %s: expected %s attribute values, got %s'%(i,n_attr_cols,len(split))
        )

    # plain cells first, then the expanded attribute values, in column order
    row_values = cell_texts[:attr_offset] + split[:n_attr_cols]
    for (_, values), value in zip(columns, row_values):
        values.append(value)


In [16]:
columns

[('Sample ID',
  ['161104_NC_3',
   '161214_AL_113',
   '161214_NC_7',
   '170130_AL_133',
   '170130_AL_134',
   '170131_AB_216',
   '170131_AB_218',
   '170201_AB_220',
   '170201_AB_221',
   '170201_AB_222']),
 ('Taxonomic ID',
  ['10090',
   '10090',
   '10090',
   '10090',
   '10090',
   '10090',
   '10090',
   '10090',
   '10090',
   '10090']),
 ('Common Name',
  ['Mouse',
   'Mouse',
   'Mouse',
   'Mouse',
   'Mouse',
   'Mouse',
   'Mouse',
   'Mouse',
   'Mouse',
   'Mouse']),
 ('Genbank Name',
  ['house mouse',
   'house mouse',
   'house mouse',
   'house mouse',
   'house mouse',
   'house mouse',
   'house mouse',
   'house mouse',
   'house mouse',
   'house mouse']),
 ('Scientific Name',
  ['Mus musculus',
   'Mus musculus',
   'Mus musculus',
   'Mus musculus',
   'Mus musculus',
   'Mus musculus',
   'Mus musculus',
   'Mus musculus',
   'Mus musculus',
   'Mus musculus']),
 ('Description',
  ['Electrophysiology of the primary somatosensory cortex in adult mice. Curre

## Example dataframe

In [17]:
# Map each column label to its list of values, then build the table from that mapping
Dict = dict(columns)
sample_df = pd.DataFrame(Dict)

In [18]:
# Display the table built from the scraped sample rows
sample_df

Unnamed: 0,Sample ID,Taxonomic ID,Common Name,Genbank Name,Scientific Name,Description,Age,Sex,Collection date
0,161104_NC_3,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,192 days postnatal,M,2016-11-04
1,161214_AL_113,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,210 days postnatal,M,2016-12-14
2,161214_NC_7,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,210 days postnatal,M,2016-12-14
3,170130_AL_133,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,226 days postnatal,F,2017-01-30
4,170130_AL_134,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,226 days postnatal,F,2017-01-30
5,170131_AB_216,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,227 days postnatal,F,2017-01-31
6,170131_AB_218,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,227 days postnatal,F,2017-01-31
7,170201_AB_220,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,228 days postnatal,F,2017-02-01
8,170201_AB_221,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,228 days postnatal,F,2017-02-01
9,170201_AB_222,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,228 days postnatal,F,2017-02-01


In [2]:
# NOTE(review): 'sample_metadata.pkl' is not written by any visible cell of this notebook,
# so this cell depends on a file produced elsewhere and fails if that file is absent.
# Unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
samples_df = pd.read_pickle('sample_metadata.pkl')

In [3]:
# Show the last rows of the loaded table
samples_df.tail()

Unnamed: 0,Sample ID,Taxonomic ID,Common Name,Genbank Name,Scientific Name,Description,Age,Sex,Collection date
321,180815_ME_3,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,139 days postnatal,M,2018-08-15
322,180815_ME_6,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,139 days postnatal,M,2018-08-15
323,180817_ME_3,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,105 days postnatal,F,2018-08-17
324,180817_ME_6,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,105 days postnatal,F,2018-08-17
325,180817_ME_9,10090,Mouse,house mouse,Mus musculus,Electrophysiology of the primary somatosensory...,105 days postnatal,F,2018-08-17
