# XML Example

This cell imports:
* FindSpark
* SparkSession
* Spark SQL functions
* ElementTree (Python standard library, used to parse the XML)

It then initialises the SparkSession.

In [None]:
import xml.etree.ElementTree as ET

import findspark

# findspark.init() adds the local Spark installation to sys.path, so it has to
# run BEFORE anything is imported from pyspark; otherwise the pyspark imports
# below fail on machines where pyspark is only reachable through SPARK_HOME.
findspark.init()

from pyspark.sql import SparkSession
# NOTE(review): the wildcard import is kept because later cells may rely on it,
# but it can shadow Python builtins such as sum/min/max. Prefer
# `from pyspark.sql import functions as F` once all usages are known.
from pyspark.sql.functions import *

# Single-threaded local session; getOrCreate() reuses an already running
# session, so re-running this cell is safe.
spark = SparkSession.builder.master("local").appName("FSTExample").getOrCreate()

: 

Read the whole XML file into an RDD as a single record (`wholetext=True`), so that it can be parsed as one document

In [None]:
# wholetext=True loads the file as one single row instead of one row per
# line, so the complete XML document reaches the parser in one piece.
file_rdd = (
    spark.read
    .text('./persons.xml', wholetext=True)
    .rdd
)

Function to extract records

In [None]:
# Child elements read from every <person>, in output column order
# (the person's `id` attribute is always emitted first).
ELEMENTS_TO_EXTRACT = ['firstname', 'middlename', 'lastname', 'dob_year', 'dob_month', 'salary', 'gender']


def parse_xml(rdd):
    """
    Parse one XML document and extract one record per <person> element.

    Args:
        rdd: A single record (despite the name, not an RDD) whose first field,
            ``rdd[0]``, holds the XML document as a string.

    Returns:
        A list of lists. Each inner list is
        ``[id, firstname, middlename, lastname, dob_year, dob_month, salary, gender]``.
        Values are the element texts as strings, except ``salary`` which is
        converted to ``float``. A missing element yields ``None``; an empty or
        whitespace-only ``salary`` also yields ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: if ``rdd[0]`` is not well-formed XML.
        KeyError: if a <person> element has no ``id`` attribute.
        ValueError: if a ``salary`` value is present but not numeric.
    """
    results = []
    root = ET.fromstring(rdd[0])

    for p in root.findall('person'):
        rec = [p.attrib['id']]

        for e in ELEMENTS_TO_EXTRACT:
            # Look the child up once; an absent element becomes None.
            node = p.find(e)
            value = None if node is None else node.text
            # An empty element (<salary/>) has text None and a blank one has
            # only whitespace; treat both as "no salary" instead of letting
            # float() raise.
            if e == 'salary' and value is not None:
                value = float(value) if value.strip() else None
            rec.append(value)
        results.append(rec)
    return results

Parse the file content into records and convert them to a DataFrame

In [None]:
# flatMap flattens the list of records returned by parse_xml, so every
# person record becomes its own RDD element.
records_rdd = file_rdd.flatMap(parse_xml)

# Column names are positional: the id comes first, followed by the elements
# in the order parse_xml extracts them.
persons = records_rdd.toDF(
    (
        "id",
        "firstName",
        "middleName",
        "lastName",
        "dob_year",
        "dob_month",
        "salary",
        "gender",
    )
)
persons.show()