KM Hall 

This notebook queries EDI's PASTA database through its API and downloads the most recent SEV data catalog. The catalog is then written to both an XML file and a CSV file.

In [1]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET
from datetime import date

In [2]:
# Base URL of the PASTA EML package search endpoint; the query string is
# appended to it when the request is made.
base_url = 'https://pasta.lternet.edu/package/search/eml'

In [9]:
# Search the knb-lter-sev scope and return the packageid, title, doi and pubdate
# fields; rows=100000 asks for far more rows than the catalog holds so that
# everything comes back in a single response.
r = requests.get(
    base_url + '?q=scope:knb-lter-sev&fl=packageid,title,doi,pubdate&rows=100000',
    timeout=60,  # without a timeout a stalled connection blocks the kernel indefinitely
)
# Stop here on an HTTP error instead of saving an error page as the catalog.
r.raise_for_status()

In [10]:
# Scratch check: today's date formatted as YYYYMMDD, the stamp used in the
# output file names.
date.today().strftime("%Y%m%d")

'20210511'

In [11]:
# Scratch check of how the date stamp is concatenated into a file name.
print("test" + date.today().strftime("%Y%m%d") + ".xml")

test20210511.xml


In [12]:
# Save the raw search response to a date-stamped XML file.
txt = r.text
xml_file_name = "../output/PASTA_SEV_Data_Packages_List_" + date.today().strftime("%Y%m%d") + ".xml"
# The context manager closes (and therefore flushes) the file, so the parsing
# step that reads this path back sees the complete document. The encoding is
# explicit so the result does not depend on the platform default.
with open(xml_file_name, "w", encoding="utf-8") as myfile:
    myfile.write(txt)

67626

In [13]:
# reads in xml file that was written out in the previous step and converts it to an easier to use format

def getvalueofnode(node):
    """Return the text of an XML element, or None when the element is missing."""
    return node.text if node is not None else None


def main(xml_path=None):
    """Parse the saved PASTA search results into a DataFrame.

    Parameters
    ----------
    xml_path : str or path-like, optional
        XML file to read. When omitted, the module-level ``xml_file_name``
        is used, so a bare ``main()`` call behaves as before.

    Returns
    -------
    pandas.DataFrame
        One row per child element of the XML root, with the columns
        ``packageid``, ``title``, ``doi`` and ``pubdate``. A value is None
        when the corresponding child tag is absent or has no text.
    """
    if xml_path is None:
        xml_path = xml_file_name
    dfcols = ['packageid', 'title', 'doi', 'pubdate']

    # Collect plain rows and build the frame once. DataFrame.append was removed
    # in pandas 2.0, and it copied the whole frame on every iteration.
    rows = [
        [getvalueofnode(node.find(col)) for col in dfcols]
        for node in ET.parse(xml_path).getroot()
    ]
    return pd.DataFrame(rows, columns=dfcols)
    
 
# Run the parser once and show the resulting frame as the cell output.
main()

Unnamed: 0,packageid,title,doi,pubdate
0,knb-lter-sev.202.312958,Point-Quarter Harvested Plant Weight Measureme...,doi:10.6073/pasta/c7334a04af15e27443e9a7e07495...,2016
1,knb-lter-sev.203.111849,Larrea Seedling Monitoring Study at the Sevill...,doi:10.6073/pasta/751d60c674f8750c1d3dfaca1e78...,2011
2,knb-lter-sev.204.154836,Tree Mast Production in Pinyon-Juniper-Oak For...,doi:10.6073/pasta/f6cb97e094966c0af30206e767b0...,2010
3,knb-lter-sev.205.210628,Warming-El Nino-Nitrogen Deposition Experiment...,doi:10.6073/pasta/1dec3c20ab98ad9edbd173252f67...,2016
4,knb-lter-sev.206.217925,Monsoon Rainfall Manipulation Experiment (MRME...,doi:10.6073/pasta/9372a5dea415f041672cdc257c84...,2010
...,...,...,...,...
194,knb-lter-sev.148.131886,2003 Prescribed Burn Effect on Chihuahuan Dese...,doi:10.6073/pasta/e207daa835cd4a46c774fd22d4e3...,2021
195,knb-lter-sev.165.121135,2003 Prescribed Burn Effect on Chihuahuan Dese...,doi:10.6073/pasta/c531e6a0eec5137a0853a8a168d4...,2021
196,knb-lter-sev.166.122110,2003 Prescribed Burn Effect on Chihuahuan Dese...,doi:10.6073/pasta/9e954ae1972c6951e84bc85f4020...,2021
197,knb-lter-sev.238.226095,Gunnison's Prairie Dog Restoration Experiment ...,doi:10.6073/pasta/608b329e394f77c761245402e5a5...,2021


In [14]:
# Keep the parsed catalog for the cleaning steps that follow (this parses the
# XML file again).
SEV_clean = main()

In [15]:
# Inspect the column dtypes and non-null counts of the parsed catalog.
SEV_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   packageid  199 non-null    object
 1   title      199 non-null    object
 2   doi        199 non-null    object
 3   pubdate    196 non-null    object
dtypes: object(4)
memory usage: 6.3+ KB


In [16]:
# Preview the first rows of the parsed catalog.
SEV_clean.head()

Unnamed: 0,packageid,title,doi,pubdate
0,knb-lter-sev.202.312958,Point-Quarter Harvested Plant Weight Measureme...,doi:10.6073/pasta/c7334a04af15e27443e9a7e07495...,2016
1,knb-lter-sev.203.111849,Larrea Seedling Monitoring Study at the Sevill...,doi:10.6073/pasta/751d60c674f8750c1d3dfaca1e78...,2011
2,knb-lter-sev.204.154836,Tree Mast Production in Pinyon-Juniper-Oak For...,doi:10.6073/pasta/f6cb97e094966c0af30206e767b0...,2010
3,knb-lter-sev.205.210628,Warming-El Nino-Nitrogen Deposition Experiment...,doi:10.6073/pasta/1dec3c20ab98ad9edbd173252f67...,2016
4,knb-lter-sev.206.217925,Monsoon Rainfall Manipulation Experiment (MRME...,doi:10.6073/pasta/9372a5dea415f041672cdc257c84...,2010


In [17]:
# Split packageid on '.' into three new columns: the scope prefix (junk1, which
# is dropped in a later step), the package number and the version.
# Assumes every packageid has exactly three dot-separated parts.
SEV_clean[['junk1','package_number','version']] = SEV_clean['packageid'].str.split('.', expand=True)

In [18]:
# Preview the frame with the new junk1, package_number and version columns.
SEV_clean.head()

Unnamed: 0,packageid,title,doi,pubdate,junk1,package_number,version
0,knb-lter-sev.202.312958,Point-Quarter Harvested Plant Weight Measureme...,doi:10.6073/pasta/c7334a04af15e27443e9a7e07495...,2016,knb-lter-sev,202,312958
1,knb-lter-sev.203.111849,Larrea Seedling Monitoring Study at the Sevill...,doi:10.6073/pasta/751d60c674f8750c1d3dfaca1e78...,2011,knb-lter-sev,203,111849
2,knb-lter-sev.204.154836,Tree Mast Production in Pinyon-Juniper-Oak For...,doi:10.6073/pasta/f6cb97e094966c0af30206e767b0...,2010,knb-lter-sev,204,154836
3,knb-lter-sev.205.210628,Warming-El Nino-Nitrogen Deposition Experiment...,doi:10.6073/pasta/1dec3c20ab98ad9edbd173252f67...,2016,knb-lter-sev,205,210628
4,knb-lter-sev.206.217925,Monsoon Rainfall Manipulation Experiment (MRME...,doi:10.6073/pasta/9372a5dea415f041672cdc257c84...,2010,knb-lter-sev,206,217925


In [19]:
# Drop the scope-prefix column produced by the split.
SEV_clean = SEV_clean.drop(columns=['junk1'])

In [20]:
# Reorder the columns for export. package_number is still a string here, so
# this sort is lexicographic; the numeric sort happens after the to_numeric
# conversion further down.
SEV_clean_final = SEV_clean[['package_number', 'version', 'title', 'packageid', 'doi', 'pubdate']].sort_values(['package_number'])

In [21]:
# Convert package_number from string to a numeric dtype so it can be sorted
# numerically.
SEV_clean_final['package_number'] = pd.to_numeric(SEV_clean_final['package_number'])

In [22]:
# Sort again now that package_number is numeric.
SEV_clean_final = SEV_clean_final.sort_values(['package_number'])

In [23]:
# Preview the sorted result.
SEV_clean_final.head()

Unnamed: 0,package_number,version,title,packageid,doi,pubdate
189,1,14,Meteorology Data from the Sevilleta National W...,knb-lter-sev.1.14,doi:10.6073/pasta/1cbc37ae4d40b3844b5e4be9f6f1...,2021
87,2,329007,Precipitation Chemistry Data For the Sevilleta...,knb-lter-sev.2.329007,doi:10.6073/pasta/d44f1fc24b685c33d3fdafdab4c8...,2010
88,4,202002,Grassland Vegetation Line-Intercept Transects ...,knb-lter-sev.4.202002,doi:10.6073/pasta/63f506aaf52e7a6ecb3fb296b9e8...,2020
89,6,151654,Pinon Branch Demography Study at the Sevilleta...,knb-lter-sev.6.151654,doi:10.6073/pasta/7f398daeaff9bb36d10921f8b405...,2016
90,7,183180,Plant Water Potentials and Plant Physiology at...,knb-lter-sev.7.183180,doi:10.6073/pasta/8098bba14324c8e7ba6e7dc30518...,2011


In [24]:
# Date-stamped CSV path, using the same naming pattern as the XML file.
csv_file_name = "../output/PASTA_SEV_Data_Packages_List_" + date.today().strftime("%Y%m%d") + ".csv"
# Bare expression: it is not the last line of the cell, so nothing is displayed.
csv_file_name
# index=False leaves the DataFrame index out of the CSV.
SEV_clean_final.to_csv(csv_file_name, index=False)

In [25]:
# Inspect the dtypes and non-null counts of the exported frame.
SEV_clean_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 189 to 80
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   package_number  199 non-null    int64 
 1   version         199 non-null    object
 2   title           199 non-null    object
 3   packageid       199 non-null    object
 4   doi             199 non-null    object
 5   pubdate         196 non-null    object
dtypes: int64(1), object(5)
memory usage: 10.9+ KB
