In [1]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

### Load account data

In [2]:
# Create (or reuse) a local Spark session for this notebook.
sparkSession = (
    SparkSession.builder
    .master("local")
    .appName("project2")
    .getOrCreate()
)

# Column names of the account file, in file order.
# NOTE: the names must not carry stray whitespace. The previous schema declared
# "CA_B_ID " and "CA_C_ID " (with a trailing space), which made those columns
# unreachable under their intended names (e.g. account_df.CA_B_ID).
ACCOUNT_COLUMNS = [
    "CDC_FLAG",
    "CDC_DSN",
    "CA_ID",
    "CA_B_ID",
    "CA_C_ID",
    "CA_NAME",
    "CA_TAX_ST",
    "CA_ST_ID",
]

# Explicit schema: every field is read as a nullable string.
schema = StructType(
    [StructField(column, StringType(), True) for column in ACCOUNT_COLUMNS]
)

# Load the pipe-delimited account file using the schema above.
account_df = (
    sparkSession.read
    .format("csv")
    .option("delimiter", "|")
    .schema(schema)
    .load("Dataset/Batch2/Account.txt")
)

In [3]:
# Preview the first five account rows.
account_df.show(n=5)

+--------+-------+-----+--------+--------+--------------------+---------+--------+
|CDC_FLAG|CDC_DSN|CA_ID|CA_B_ID |CA_C_ID |             CA_NAME|CA_TAX_ST|CA_ST_ID|
+--------+-------+-----+--------+--------+--------------------+---------+--------+
|       I|  43490|30470|   16206|   15280|XkRcJWPVFFSGAtTGo...|        1|    ACTV|
|       U|  43491|13857|   35351|    4996|kXUQTTuZHQsJsIDcB...|        1|    ACTV|
|       U|  43492|26685|   23304|    2762|ruXPPxRMDLjswZZHv...|        1|    INAC|
|       I|  43493|30471|   43026|   15281|arQHNWBBCOGMxvWqT...|        2|    ACTV|
|       I|  43494|30472|    5711|   15282|DuQgzgldMMnEnh Fh...|        1|    ACTV|
+--------+-------+-----+--------+--------+--------------------+---------+--------+
only showing top 5 rows



### Load status table

In [4]:
# Schema of the status lookup file: two nullable string columns.
schema = StructType(
    [StructField(column, StringType(), True) for column in ("ST_ID", "ST_NAME")]
)

# Load the pipe-delimited status type file using the schema above.
status_df = (
    sparkSession.read
    .format("csv")
    .option("delimiter", "|")
    .schema(schema)
    .load("Dataset/StatusType.txt")
)

# Preview the first four status rows.
status_df.show(n=4)

+-----+---------+
|ST_ID|  ST_NAME|
+-----+---------+
| ACTV|   Active|
| CMPT|Completed|
| CNCL| Canceled|
| PNDG|  Pending|
+-----+---------+
only showing top 4 rows



#### Join the account table with the status table

In [5]:
# Inner join of accounts with their status row, matching CA_ST_ID to ST_ID.
# broadcast() marks status_df as small enough to be used in a broadcast join.
account_with_status = account_df.join(
    other=broadcast(status_df),
    on=account_df["CA_ST_ID"] == status_df["ST_ID"],
    how='inner',
)



In [6]:
# Preview the first five rows of the joined result.
account_with_status.show(n=5)

+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+
|CDC_FLAG|CDC_DSN|CA_ID|CA_B_ID |CA_C_ID |             CA_NAME|CA_TAX_ST|CA_ST_ID|ST_ID| ST_NAME|
+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+
|       I|  43490|30470|   16206|   15280|XkRcJWPVFFSGAtTGo...|        1|    ACTV| ACTV|  Active|
|       U|  43491|13857|   35351|    4996|kXUQTTuZHQsJsIDcB...|        1|    ACTV| ACTV|  Active|
|       U|  43492|26685|   23304|    2762|ruXPPxRMDLjswZZHv...|        1|    INAC| INAC|Inactive|
|       I|  43493|30471|   43026|   15281|arQHNWBBCOGMxvWqT...|        2|    ACTV| ACTV|  Active|
|       I|  43494|30472|    5711|   15282|DuQgzgldMMnEnh Fh...|        1|    ACTV| ACTV|  Active|
+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+
only showing top 5 rows



### Load the CustomerMgmt.xml file

In [7]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
import numpy as np

In [8]:
# Lazily walk every <Account> element of the customer management XML.
# The result is a one-shot iterator of Element objects (not dicts); each
# element's XML attributes are available through its `.attrib` mapping.
accounts_dict = ET.parse('Dataset/CustomerMgmt.xml').getroot().iter("Account")

In [9]:
# Collect the "CA_ID" attribute of every <Account> element, in iteration order.
# NOTE(review): the list is named ca_b_ids but is filled from "CA_ID" -- confirm
# which identifier is actually intended here.
# accounts_dict is consumed by this pass, so re-running this cell without
# re-creating the iterator yields an empty list.
ca_b_ids = [account.attrib["CA_ID"] for account in accounts_dict]

In [32]:
# Parse the customer management XML as a DOM and collect every action element.
xmldoc = minidom.parse('Dataset/CustomerMgmt.xml')
itemlist = xmldoc.getElementsByTagName('TPCDI:Action')

# ActionType of each retained action.
action_types = []
# CA_ID of the first <Account> found under each retained action.
ca_ids = []

for item in itemlist:
    action_type = item.attributes["ActionType"].value
    # "INACT" actions are skipped entirely.
    if action_type == "INACT":
        continue
    # Search the action's subtree once and reuse the result; the previous
    # version ran the same getElementsByTagName lookup twice per action.
    account_nodes = item.getElementsByTagName("Account")
    # Actions without any <Account> descendant contribute nothing, which
    # keeps action_types and ca_ids the same length.
    if account_nodes:
        ca_ids.append(account_nodes[0].attributes["CA_ID"].value)
        action_types.append(action_type)
   
    

In [40]:
# Stack the three equally long lists as rows: the result has one row per list
# (ca_b_ids, action_types, ca_ids) and one column per collected entry.
customer_data_array = np.vstack([ca_b_ids, action_types, ca_ids])

In [42]:
# View the stacked data with one row per entry and three columns.
# A transpose is required here: reshape((-1, 3)) on the (3, N) array would
# only re-chunk the row-major data, grouping three consecutive values of the
# SAME list into a row instead of pairing the i-th value of each list.
customer_data_array.T.shape

(43490, 3)

In [34]:
# Number of retained actions, i.e. non-"INACT" actions that contain at least
# one <Account> element (one entry is appended to action_types for each).
len(action_types)

43490