KM Hall 

This notebook queries EDI's PASTA database through its API and pulls down the most recent SEV data catalog (scope `knb-lter-sev`). The catalog is then written to a date-stamped XML file and, after tidying, to a CSV file.

In [1]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET
from datetime import date

In [2]:
# PASTA search endpoint; the query string is appended when the request is made.
base_url = "https://pasta.lternet.edu/package/search/eml"

In [3]:
# Ask the search endpoint for packageid, title and doi of every knb-lter-sev package.
# rows=100000 is a deliberately large cap so the whole list comes back in one response.
r = requests.get(
    base_url + '?q=scope:knb-lter-sev&fl=packageid,title,doi&rows=100000',
    timeout=60,  # do not let a stalled connection hang the kernel indefinitely
)
# Stop here on an HTTP error instead of saving an error page as the catalog.
r.raise_for_status()

In [4]:
# Preview the YYYYMMDD stamp that the output file names are built from.
date.today().strftime("%Y%m%d")

'20200512'

In [5]:
# Scratch check: show how the date stamp slots into a file name.
print("".join(["test", date.today().strftime("%Y%m%d"), ".xml"]))

test20200512.xml


In [6]:
# Save the raw search response as a date-stamped XML file.
txt = r.text
xml_file_name = "../output/PASTA_SEV_Data_Packages_List_" + date.today().strftime("%Y%m%d") + ".xml"
# The context manager closes (and therefore flushes) the file, so the cell that
# parses xml_file_name afterwards always sees the complete document.
# NOTE(review): no explicit encoding is passed, so the platform default is used —
# confirm it agrees with the encoding declared in the XML.
with open(xml_file_name, "w") as myfile:
    chars_written = myfile.write(txt)
# Displayed as the cell output: number of characters written.
chars_written

59603

In [7]:
# reads in xml file that was written out in the previous step and converts it to an easier to use format

def getvalueofnode(node):
    """Return ``node.text``, or ``None`` when ``node`` itself is ``None``.

    Lets callers pass the result of ``Element.find`` straight through
    without checking whether the child element was present.
    """
    if node is None:
        return None
    return node.text

def main(xml_source=None):
    """Parse the saved search result into a DataFrame of data packages.

    Each child element of the XML root becomes one row holding the text of
    its ``packageid``, ``title`` and ``doi`` children. A missing child
    yields a missing value in that column.

    Parameters
    ----------
    xml_source : str or file object, optional
        Passed to ``ET.parse``. When omitted, the notebook-level
        ``xml_file_name`` written by the earlier cell is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``packageid``, ``title``, ``doi``; one row per child of the
        root, in document order, with a default 0..n-1 index.
    """
    if xml_source is None:
        # Looked up at call time so a re-run picks up the current file name.
        xml_source = xml_file_name

    dfcols = ['packageid', 'title', 'doi']
    root = ET.parse(xml_source).getroot()

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row copies the whole frame
    # on every iteration.
    records = [
        [getvalueofnode(node.find(col)) for col in dfcols]
        for node in root
    ]
    return pd.DataFrame(records, columns=dfcols)
    
 
# Run the parser once so the resulting DataFrame is shown as this cell's output.
main()

Unnamed: 0,packageid,title,doi
0,knb-lter-sev.201.271014,Point-Quarter Distance and Dimension Measureme...,doi:10.6073/pasta/aa4076ed1ed8cf1150618463981e...
1,knb-lter-sev.202.312958,Point-Quarter Harvested Plant Weight Measureme...,doi:10.6073/pasta/c7334a04af15e27443e9a7e07495...
2,knb-lter-sev.203.111849,Larrea Seedling Monitoring Study at the Sevill...,doi:10.6073/pasta/751d60c674f8750c1d3dfaca1e78...
3,knb-lter-sev.204.154836,Tree Mast Production in Pinyon-Juniper-Oak For...,doi:10.6073/pasta/f6cb97e094966c0af30206e767b0...
4,knb-lter-sev.205.210628,Warming-El Nino-Nitrogen Deposition Experiment...,doi:10.6073/pasta/1dec3c20ab98ad9edbd173252f67...
...,...,...,...
188,knb-lter-sev.145.204489,US-Hungary Grassland Biodiversity (cross-site ...,doi:10.6073/pasta/9f94c178469e01dd1d485eb01aca...
189,knb-lter-sev.146.194599,Ecological Effects of Prescribed Fire on Soils...,doi:10.6073/pasta/9bc0b68f2e5b354d2324e9dc86a6...
190,knb-lter-sev.134.265065,Long-term Dynamics in Soil Field Available Nit...,doi:10.6073/pasta/67b0cfde84800a08d37e4e94a403...
191,knb-lter-sev.200.174699,Discontinued Vegetation Line-Intercept Transec...,doi:10.6073/pasta/5ed2fb6c74258ded76f7c9869d35...


In [8]:
# Keep the parsed catalog under a name the following cells can transform.
SEV_clean = main()

In [9]:
# Check the row count and that packageid, title and doi have no missing values.
SEV_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 3 columns):
packageid    193 non-null object
title        193 non-null object
doi          193 non-null object
dtypes: object(3)
memory usage: 4.6+ KB


In [10]:
# Peek at the first rows of the parsed catalog before reshaping it.
SEV_clean.head()

Unnamed: 0,packageid,title,doi
0,knb-lter-sev.201.271014,Point-Quarter Distance and Dimension Measureme...,doi:10.6073/pasta/aa4076ed1ed8cf1150618463981e...
1,knb-lter-sev.202.312958,Point-Quarter Harvested Plant Weight Measureme...,doi:10.6073/pasta/c7334a04af15e27443e9a7e07495...
2,knb-lter-sev.203.111849,Larrea Seedling Monitoring Study at the Sevill...,doi:10.6073/pasta/751d60c674f8750c1d3dfaca1e78...
3,knb-lter-sev.204.154836,Tree Mast Production in Pinyon-Juniper-Oak For...,doi:10.6073/pasta/f6cb97e094966c0af30206e767b0...
4,knb-lter-sev.205.210628,Warming-El Nino-Nitrogen Deposition Experiment...,doi:10.6073/pasta/1dec3c20ab98ad9edbd173252f67...


In [11]:
# packageid looks like 'knb-lter-sev.201.271014'; split it on '.' into three columns.
# junk1 receives the leading 'knb-lter-sev' text and is dropped in a later cell.
SEV_clean[['junk1','package_number','version']] = SEV_clean['packageid'].str.split('.', expand=True)

In [12]:
# Confirm the split produced the junk1 / package_number / version columns.
SEV_clean.head()

Unnamed: 0,packageid,title,doi,junk1,package_number,version
0,knb-lter-sev.201.271014,Point-Quarter Distance and Dimension Measureme...,doi:10.6073/pasta/aa4076ed1ed8cf1150618463981e...,knb-lter-sev,201,271014
1,knb-lter-sev.202.312958,Point-Quarter Harvested Plant Weight Measureme...,doi:10.6073/pasta/c7334a04af15e27443e9a7e07495...,knb-lter-sev,202,312958
2,knb-lter-sev.203.111849,Larrea Seedling Monitoring Study at the Sevill...,doi:10.6073/pasta/751d60c674f8750c1d3dfaca1e78...,knb-lter-sev,203,111849
3,knb-lter-sev.204.154836,Tree Mast Production in Pinyon-Juniper-Oak For...,doi:10.6073/pasta/f6cb97e094966c0af30206e767b0...,knb-lter-sev,204,154836
4,knb-lter-sev.205.210628,Warming-El Nino-Nitrogen Deposition Experiment...,doi:10.6073/pasta/1dec3c20ab98ad9edbd173252f67...,knb-lter-sev,205,210628


In [13]:
# Remove the throwaway scope column. errors='ignore' keeps this cell re-runnable:
# without it a second run raises KeyError because junk1 is already gone.
SEV_clean = SEV_clean.drop(columns=['junk1'], errors='ignore')

In [15]:
# Put the columns in export order and sort by package number. package_number is
# converted to a number before sorting so the order is 1, 2, 4, ... rather than
# the text order ('1', '10', '100', ...) that sorting the split strings gives.
SEV_clean_final = (
    SEV_clean[['package_number', 'version', 'title', 'packageid', 'doi']]
    .assign(package_number=lambda frame: pd.to_numeric(frame['package_number']))
    .sort_values(['package_number'])
)

In [18]:
# Make sure package_number is numeric so sorting orders it by value, not as text.
SEV_clean_final['package_number'] = pd.to_numeric(SEV_clean_final['package_number'])

In [19]:
# Order the catalog by numeric package number.
SEV_clean_final = SEV_clean_final.sort_values(['package_number'])

In [20]:
# Spot-check that the rows now start from the lowest package number.
SEV_clean_final.head()

Unnamed: 0,package_number,version,title,packageid,doi
185,1,13,Meteorology Data from the Sevilleta National W...,knb-lter-sev.1.13,doi:10.6073/pasta/4d71c09b242602114fb684c843e9...
92,2,329007,Precipitation Chemistry Data For the Sevilleta...,knb-lter-sev.2.329007,doi:10.6073/pasta/d44f1fc24b685c33d3fdafdab4c8...
93,4,202002,Grassland Vegetation Line-Intercept Transects ...,knb-lter-sev.4.202002,doi:10.6073/pasta/63f506aaf52e7a6ecb3fb296b9e8...
94,6,151654,Pinon Branch Demography Study at the Sevilleta...,knb-lter-sev.6.151654,doi:10.6073/pasta/7f398daeaff9bb36d10921f8b405...
95,7,183180,Plant Water Potentials and Plant Physiology at...,knb-lter-sev.7.183180,doi:10.6073/pasta/8098bba14324c8e7ba6e7dc30518...


In [21]:
# Export the sorted catalog to a date-stamped CSV in the same output folder as the
# XML file; the DataFrame index is left out of the file.
csv_file_name = "".join([
    "../output/PASTA_SEV_Data_Packages_List_",
    date.today().strftime("%Y%m%d"),
    ".csv",
])
SEV_clean_final.to_csv(csv_file_name, index=False)

In [22]:
# Final check of the exported frame: row count, dtypes and non-null counts.
SEV_clean_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 185 to 85
Data columns (total 5 columns):
package_number    193 non-null int64
version           193 non-null object
title             193 non-null object
packageid         193 non-null object
doi               193 non-null object
dtypes: int64(1), object(4)
memory usage: 9.0+ KB
