In [2]:
import os
import xml.etree.ElementTree as ET

# Path to the single sample XML file whose structure is inspected below.
# NOTE(review): this is an absolute path under /Users/mine; adjust it when
# running the notebook on another machine.
file_path = "/Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new/000C130.xml"  

# Print every element of an XML (sub)tree, indented by nesting depth
def print_xml_structure(elem, level=0):
    """Print the tag and attributes of ``elem`` and of all its descendants.

    One line is written to stdout per element, in document (pre-order)
    order. Each line is indented by two spaces per nesting level.

    Args:
        elem: The ElementTree element to start from.
        level: Indentation depth used for ``elem`` itself; each generation
            of descendants is indented one level deeper.

    Returns:
        None. The structure is only printed.
    """
    # Explicit stack of (element, depth) pairs instead of recursive calls.
    pending = [(elem, level)]
    while pending:
        node, depth = pending.pop()
        print(f"{'  ' * depth}Tag: {node.tag}, Attributes: {node.attrib}")
        # Children are pushed in reverse so that the first child is popped
        # (and therefore printed) first, preserving document order.
        pending.extend((child, depth + 1) for child in reversed(list(node)))

# Parse one XML file and print its root tag followed by its tag hierarchy
def study_xml_structure(file_path):
    """Parse the XML file at ``file_path`` and print its element structure.

    Prints the root element's tag, then the nested tag/attribute listing
    produced by ``print_xml_structure``. If the file is not well-formed XML,
    an error message naming the file is printed instead.

    Args:
        file_path: Path of the XML file to inspect.

    Returns:
        None. All results are written to stdout.
    """
    try:
        top = ET.parse(file_path).getroot()
    except ET.ParseError as e:
        # Malformed XML is reported rather than raised.
        print(f"Error parsing {file_path}: {e}")
        return

    print(f"Root Element: {top.tag}")
    print("Structure:")
    print_xml_structure(top)

# Print the root tag and the nested tag hierarchy of the sample file at file_path
study_xml_structure(file_path)


Root Element: rootTag
Structure:
Tag: rootTag, Attributes: {}
  Tag: Award, Attributes: {}
    Tag: AwardTitle, Attributes: {}
    Tag: AwardEffectiveDate, Attributes: {}
    Tag: AwardExpirationDate, Attributes: {}
    Tag: AwardAmount, Attributes: {}
    Tag: AwardInstrument, Attributes: {}
      Tag: Code, Attributes: {}
      Tag: Value, Attributes: {}
    Tag: Organization, Attributes: {}
      Tag: Code, Attributes: {}
      Tag: Directorate, Attributes: {}
        Tag: Abbreviation, Attributes: {}
        Tag: Code, Attributes: {}
        Tag: LongName, Attributes: {}
      Tag: Division, Attributes: {}
        Tag: Code, Attributes: {}
        Tag: Abbreviation, Attributes: {}
        Tag: LongName, Attributes: {}
    Tag: ProgramOfficer, Attributes: {}
      Tag: SignBlockName, Attributes: {}
    Tag: PropsalInformation, Attributes: {}
      Tag: ID, Attributes: {}
    Tag: AbstractNarration, Attributes: {}
    Tag: MinAmdLetterDate, Attributes: {}
    Tag: MaxAmdLetterDate, A

In [3]:
import pandas as pd

# Directory that is walked (including its subfolders) for .xml files to parse.
# NOTE(review): this is an absolute path under /Users/mine; it is machine-specific.
base_dir = "/Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new"  # Change this to your directory path

# Flatten one award XML file into a single record (dict of column -> text)
def parse_xml(file_path):
    """Read an award XML file and return its fields as a flat dict.

    Every output column is filled with the text of the first element that
    matches the column's path beneath the document root. A column whose
    element is missing or has no text is set to the empty string.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A dict with one string entry per column, in the order listed in
        ``column_paths``; or ``None`` (after printing an error message) if
        the file is not well-formed XML.
    """
    # Output column name -> ElementPath expression used to look it up.
    column_paths = {
        "AwardTitle": './/Award/AwardTitle',
        "AwardEffectiveDate": './/Award/AwardEffectiveDate',
        "AwardExpirationDate": './/Award/AwardExpirationDate',
        "AwardAmount": './/Award/AwardAmount',
        "AwardInstrumentCode": './/Award/AwardInstrument/Code',
        "AwardInstrumentValue": './/Award/AwardInstrument/Value',
        "OrganizationCode": './/Award/Organization/Code',
        "DirectorateAbbreviation": './/Award/Organization/Directorate/Abbreviation',
        "DirectorateCode": './/Award/Organization/Directorate/Code',
        "DirectorateLongName": './/Award/Organization/Directorate/LongName',
        "DivisionCode": './/Award/Organization/Division/Code',
        "DivisionAbbreviation": './/Award/Organization/Division/Abbreviation',
        "DivisionLongName": './/Award/Organization/Division/LongName',
        "ProgramOfficerSignBlockName": './/Award/ProgramOfficer/SignBlockName',
        # "Propsal" (sic) is the spelling used by the tag in the XML itself.
        "PropsalInformationID": './/Award/PropsalInformation/ID',
        "AbstractNarration": './/Award/AbstractNarration',
        "MinAmdLetterDate": './/Award/MinAmdLetterDate',
        "MaxAmdLetterDate": './/Award/MaxAmdLetterDate',
        "IsHistoricalAward": './/Award/IsHistoricalAward',
        "ARRAAmount": './/Award/ARRAAmount',
        "InstitutionName": './/Award/Institution/Name',
        "InstitutionCityName": './/Award/Institution/CityName',
        "InstitutionStateCode": './/Award/Institution/StateCode',
        "InstitutionZipCode": './/Award/Institution/ZipCode',
        "InstitutionPhoneNumber": './/Award/Institution/PhoneNumber',
        "InstitutionEmailAddress": './/Award/Institution/EmailAddress',
        "InstitutionStreetAddress": './/Award/Institution/StreetAddress',
        "InstitutionCountryCode": './/Award/Institution/CountryCode',
        "InstitutionCountryName": './/Award/Institution/CountryName',
        "InstitutionStateName": './/Award/Institution/StateName',
        "InstitutionCountryFlag": './/Award/Institution/CountryFlag',
        "AwardAmountRangeCategory": './/Award/AwardAmountRangeCategory',
        "AwardExpirationDateEpoch": './/Award/AwardExpirationDateEpoch',
        "MinAmdLetterDateEpoch": './/Award/MinAmdLetterDateEpoch',
        "MaxAmdLetterDateEpoch": './/Award/MaxAmdLetterDateEpoch',
        "AwardEffectiveDateEpoch": './/Award/AwardEffectiveDateEpoch',
        "DREREFERENCE": './/Award/DREREFERENCE',
        "AwardID": './/Award/AwardID',
    }

    try:
        doc_root = ET.parse(file_path).getroot()
    except ET.ParseError as e:
        # Malformed files are reported and signalled to the caller with None.
        print(f"Error parsing {file_path}: {e}")
        return None

    return {
        column: doc_root.findtext(path, default="")
        for column, path in column_paths.items()
    }

# One record (dict) per XML file that parsed successfully
all_data = []
unsuccessful_files = 0

# os.walk visits base_dir itself and every folder nested beneath it
for root, dirs, files in os.walk(base_dir):
    for file in files:
        # Ignore anything that is not an .xml file
        if not file.endswith(".xml"):
            continue
        file_path = os.path.join(root, file)
        data = parse_xml(file_path)
        if not data:
            # parse_xml returns None for files it could not parse
            unsuccessful_files += 1
            continue
        all_data.append(data)

# Each appended record corresponds to exactly one successful parse
successful_files = len(all_data)

# Build the table in a single step from the list of records
df = pd.DataFrame(all_data)

# Write the table to CSV (the pickle export is left disabled)
df.to_csv("combined_data_extracted.csv", index=False)
#df.to_pickle("combined_data.pkl")

# Preview the first rows and report the parse tallies
print(df.head())
print(f"Total successful files: {successful_files}")
print(f"Total unsuccessful files: {unsuccessful_files}")


Error parsing /Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new/0834451.xml: no element found: line 138, column 0
Error parsing /Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new/1066456.xml: no element found: line 119, column 0
Error parsing /Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new/1028076.xml: no element found: line 226, column 0
Error parsing /Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new/9318513.xml: not well-formed (invalid token): line 61, column 28
Error parsing /Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new/1028510.xml: no element found: line 130, column 0
Error parsing /Users/mine/Documents/Projects/Research_Summer/NSF_Grant/NSF_Downloads_July082024/Data/xmls_new/0506732.xml: no element found: line 141, column 0
Error parsing /Users/mine