In [1]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast
from pyspark.sql.context import SQLContext

## Load preferences 

In [2]:
sparkSession = SparkSession.builder.master("local")\
                              .appName("project2")\
                              .getOrCreate()

In [28]:
def get_schema(column_names, required_columns_type):
    """
    To get schema of the file.
    """
    struct_field_list = [StructField(name, column_type, True)
                         for name, column_type in zip(column_names, required_columns_type)]
    return StructType(struct_field_list)


def load_data(file_path, schema, delimiter):
    return sparkSession.read.format("csv").option("delimiter", delimiter)\
                           .schema(schema)\
                           .load(file_path)

#### Load HR.CSV

In [38]:
column_names = ["EmployeeID", "ManagerID", "EmployeeFirstName", "EmployeeLastName",
               "EmployeeMI", "EmployeeJobCode", "EmployeeBranch", "EmployeeOffice",
               "EmployeePhone"]
required_columns_type = [StringType(), StringType(), StringType(), StringType(),
                        StringType(), StringType(), StringType(), StringType(),
                        StringType(),BooleanType(), IntegerType(), DateType(), DateType()]
file_path= "Dataset/Batch1/HR.csv"
schema = get_schema(column_names, required_columns_type)
hr_df = load_data(file_path, schema, ",")
hr_df.show(5)

+----------+---------+-----------------+----------------+----------+---------------+--------------------+--------------+--------------+
|EmployeeID|ManagerID|EmployeeFirstName|EmployeeLastName|EmployeeMI|EmployeeJobCode|      EmployeeBranch|EmployeeOffice| EmployeePhone|
+----------+---------+-----------------+----------------+----------+---------------+--------------------+--------------+--------------+
|         0|      702|            Ozkan|         Douglas|      null|            647|EGZKSobTeknHCbLuH...|    OFFICE7152|(726) 088-3331|
|         1|     1377|             Suer|         Candice|      null|            314|OfOBVvpzNvHCebxyu...|    OFFICE8586|(344) 999-2652|
|         2|      819|        Somisetty|            Jami|         P|            534|rAHWYkktOXAyPAYHl...|          null|(984) 538-5366|
|         3|      824|          Mazurek|       Rosalinda|         J|            364|TJQqsUQQGqWG QleL...|    OFFICE8487|(860) 037-6897|
|         4|     4345|        Aronovich|        

#### Load Date.txt

In [29]:
# Load Date
column_names = ["SK_DateID", "DateValue", "DateDesc", "CalendarYearID",
               "CalendarYearDesc", "CalendarQtrID", "CalendarQtrDesc", "CalendarMonthID",
               "CalendarMonthDesc", "CalendarWeekID", "CalendarWeekDesc", "DayOfWeekNum",
               "DayOfWeekDesc", "FiscalYearID", "FiscalYearDesc", "FiscalQtrID", 
               "FiscalQtrDesc", "HolidayFlag"]
required_columns_type = [IntegerType(), DateType(), StringType(), IntegerType(), StringType(),
                        IntegerType(), StringType(), IntegerType(), StringType(), IntegerType(),
                        StringType(), IntegerType(), StringType(), IntegerType(), 
                        StringType(), IntegerType(), StringType(), BooleanType()]

file_path= "Dataset/Batch1/Date.txt"
schema = get_schema(column_names, required_columns_type)
date_df = load_data(file_path, schema, "|")
date_df.show(5)

+---------+----------+---------------+--------------+----------------+-------------+---------------+---------------+-----------------+--------------+----------------+------------+-------------+------------+--------------+-----------+-------------+-----------+
|SK_DateID| DateValue|       DateDesc|CalendarYearID|CalendarYearDesc|CalendarQtrID|CalendarQtrDesc|CalendarMonthID|CalendarMonthDesc|CalendarWeekID|CalendarWeekDesc|DayOfWeekNum|DayOfWeekDesc|FiscalYearID|FiscalYearDesc|FiscalQtrID|FiscalQtrDesc|HolidayFlag|
+---------+----------+---------------+--------------+----------------+-------------+---------------+---------------+-----------------+--------------+----------------+------------+-------------+------------+--------------+-----------+-------------+-----------+
| 19500101|1950-01-01|January 1, 1950|          1950|            1950|        19501|        1950 Q1|          19501|     1950 January|         19501|         1950-W1|           7|       Sunday|        1950|          1950

#### Load Time.txt

In [31]:
# Load Time
column_names = ["SK_TimeID", "TimeValue", "HourID", "HourDesc",
               "MinuteID", "MinuteDesc", "SecondID", "SecondDesc",
               "MarketHoursFlag", "OfficeHoursFlag"]
required_columns_type = [IntegerType(),StringType(), IntegerType(),
                        StringType(), IntegerType(), StringType(), IntegerType(), 
                        StringType(), BooleanType(), BooleanType()]
file_path= "Dataset/Batch1/Time.txt"
schema = get_schema(column_names, required_columns_type)
time_df = load_data(file_path, schema, "|")
time_df.show(5)

+---------+---------+------+--------+--------+----------+--------+----------+---------------+---------------+
|SK_TimeID|TimeValue|HourID|HourDesc|MinuteID|MinuteDesc|SecondID|SecondDesc|MarketHoursFlag|OfficeHoursFlag|
+---------+---------+------+--------+--------+----------+--------+----------+---------------+---------------+
|        0| 00:00:00|     0|      00|       0|     00:00|       0|  00:00:00|          false|          false|
|        1| 00:00:01|     0|      00|       0|     00:00|       1|  00:00:01|          false|          false|
|        2| 00:00:02|     0|      00|       0|     00:00|       2|  00:00:02|          false|          false|
|        3| 00:00:03|     0|      00|       0|     00:00|       3|  00:00:03|          false|          false|
|        4| 00:00:04|     0|      00|       0|     00:00|       4|  00:00:04|          false|          false|
+---------+---------+------+--------+--------+----------+--------+----------+---------------+---------------+
only showi

#### Load CashTransaction.txt

In [8]:
# Load CashTransaction
column_names = ["SK_TimeID", "TimeValue", "HourID", "HourDesc",
               "MinuteID", "MinuteDesc", "SecondID", "SecondDesc",
               "MarketHoursFlag", "OfficeHoursFlag"]
file_path= "Dataset/Batch1/Time.txt"
schema = get_schema(column_names)
date_df = load_data(file_path, schema, "|")
date_df.show(5)

+---------+---------+------+--------+--------+----------+--------+----------+---------------+---------------+
|SK_TimeID|TimeValue|HourID|HourDesc|MinuteID|MinuteDesc|SecondID|SecondDesc|MarketHoursFlag|OfficeHoursFlag|
+---------+---------+------+--------+--------+----------+--------+----------+---------------+---------------+
|   000000| 00:00:00|    00|      00|      00|     00:00|      00|  00:00:00|          false|          false|
|   000001| 00:00:01|    00|      00|      00|     00:00|      01|  00:00:01|          false|          false|
|   000002| 00:00:02|    00|      00|      00|     00:00|      02|  00:00:02|          false|          false|
|   000003| 00:00:03|    00|      00|      00|     00:00|      03|  00:00:03|          false|          false|
|   000004| 00:00:04|    00|      00|      00|     00:00|      04|  00:00:04|          false|          false|
+---------+---------+------+--------+--------+----------+--------+----------+---------------+---------------+
only showi

### Load StatusType.txt

In [40]:
column_names= ["ST_ID", "ST_NAME"]
required_columns_type = [StringType(), StringType()]
file_path= "Dataset/Batch1/StatusType.txt"
schema = get_schema(column_names, required_columns_type)
status_df = load_data(file_path, schema, "|")
date_df.show(5)

+-----+---------+
|ST_ID|  ST_NAME|
+-----+---------+
| ACTV|   Active|
| CMPT|Completed|
| CNCL| Canceled|
| PNDG|  Pending|
| SBMT|Submitted|
+-----+---------+
only showing top 5 rows



## Create dimensions

### 1 - Create DimDate

**The source file of this dimention is: Date.txt.**
    
    -- We have already load Date.txt file as dataframe with required types.

In [18]:
DimDate = date_df

### 2 - Create DimTime

**The source file of this dimention is: Date.txt.**
    
    -- We have already load time.txt file as dataframe with required types.

In [32]:
DimTime = time_df

### 3 - Create DimBroker

**The source file of this dimention is: HR.txt.**
    
    -- We have already load HR.txt file as dataframe with required types. But we need to filter employees with code = 314

In [37]:
DimBroker = hr_df.filter(hr_df.EmployeeJobCode == 314)
DimBroker.show(5)

+----------+---------+-----------------+----------------+----------+---------------+--------------------+--------------+--------------+
|EmployeeID|ManagerID|EmployeeFirstName|EmployeeLastName|EmployeeMI|EmployeeJobCode|      EmployeeBranch|EmployeeOffice| EmployeePhone|
+----------+---------+-----------------+----------------+----------+---------------+--------------------+--------------+--------------+
|         1|     1377|             Suer|         Candice|      null|            314|OfOBVvpzNvHCebxyu...|    OFFICE8586|(344) 999-2652|
|         4|     4345|        Aronovich|        Delphine|         M|            314|IEMJHuQgCPDHCwwJk...|    OFFICE9420|(604) 387-9350|
|         8|     2146|           Hansen|        Montreal|         T|            314|sGIpORbLsRjTdhqBN...|    OFFICE6343|(991) 491-4907|
|        11|     2259|       Charchanko|          Sheela|      null|            314|Cw QJMHPgpozCKsFZ...|    OFFICE7705|(977) 726-0106|
|        14|     3663|            Knorp|        

### 4 - Create DimStatusType

In [41]:
DimStatusType = date_df

### 5 - Create DimAccount

In [42]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
import numpy as np
import pandas as pd

In [53]:
# Get action types

In [None]:
xmldoc = minidom.parse('Dataset/CustomerMgmt.xml')
itemlist = xmldoc.getElementsByTagName('@TPCDI:Action')

# Get actions types.
action_types = []
for item in itemlist:
    action_types.append(action_type)
   

## #################################################

## الشغل الي تحت عك 

### Load status table

#### Let's join Account table with status table.

In [7]:
account_with_status = account_df.join(
                        broadcast(status_df), 
                        account_df.CA_ST_ID == status_df.ST_ID,   
                        'inner'
                  )



In [8]:
account_with_status.show(5)

+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+
|CDC_FLAG|CDC_DSN|CA_ID|CA_B_ID |CA_C_ID |             CA_NAME|CA_TAX_ST|CA_ST_ID|ST_ID| ST_NAME|
+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+
|       I|  43490|30470|   16206|   15280|XkRcJWPVFFSGAtTGo...|        1|    ACTV| ACTV|  Active|
|       U|  43491|13857|   35351|    4996|kXUQTTuZHQsJsIDcB...|        1|    ACTV| ACTV|  Active|
|       U|  43492|26685|   23304|    2762|ruXPPxRMDLjswZZHv...|        1|    INAC| INAC|Inactive|
|       I|  43493|30471|   43026|   15281|arQHNWBBCOGMxvWqT...|        2|    ACTV| ACTV|  Active|
|       I|  43494|30472|    5711|   15282|DuQgzgldMMnEnh Fh...|        1|    ACTV| ACTV|  Active|
+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+
only showing top 5 rows



### Load customer.xml file 

In [9]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
import numpy as np
import pandas as pd

In [52]:
# Get a generator of dict containing account data 
accounts_dict = ET.parse('Dataset/CustomerMgmt.xml').iter("@ActionType")    
c_ids = []
for x in accounts_dict:
    print(x)
    if x.find("Account") != None:
        c_ids.append(x.attrib["C_ID"])

In [46]:
xmldoc = minidom.parse('Dataset/CustomerMgmt.xml')
itemlist = xmldoc.getElementsByTagName('@TPCDI:Action')

# Get actions types.
action_types = []
# Get surrogent keys of customers.
ca_ids = []

for item in itemlist:
    action_type = item.attributes["ActionType"].value
    if action_type != "INACT" and item.getElementsByTagName("Account"):
        ca_ids.append(item.getElementsByTagName("Account")[0].attributes["CA_ID"].value)
        action_types.append(action_type)
   
    

In [64]:
customer_data_array = []
for x, y, z in zip(c_ids, action_types, ca_ids):
    customer_data_array.append([x, y, z])

In [67]:
# Convert to Dataframe 
customer_pd = pd.DataFrame(customer_data_array, 
                 columns=["C_ID", "ACTION_TYPE", "CA_ID"], index=None)

In [68]:
mySchema = StructType([ StructField("C_ID", StringType(), True)\

                       ,StructField("ACTION_TYPE", StringType(), True)\

                       ,StructField("CA_ID", StringType(), True)])
customer_df = sparkSession.createDataFrame(customer_pd, schema = mySchema)

customer_df.show(5)

+----+-----------+-----+
|C_ID|ACTION_TYPE|CA_ID|
+----+-----------+-----+
|   0|        NEW|    0|
|   1|        NEW|    1|
|   2|        NEW|    2|
|   3|        NEW|    3|
|   4|        NEW|    4|
+----+-----------+-----+
only showing top 5 rows



### Join account_with_status with customer_df

In [69]:
dim_account_df = account_with_status.join(
                        customer_df, 
                        account_with_status.CA_ID == customer_df.CA_ID,   
                        'inner'
                  )

In [70]:
dim_account_df.show(5)

+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+----+-----------+-----+
|CDC_FLAG|CDC_DSN|CA_ID|CA_B_ID |CA_C_ID |             CA_NAME|CA_TAX_ST|CA_ST_ID|ST_ID| ST_NAME|C_ID|ACTION_TYPE|CA_ID|
+--------+-------+-----+--------+--------+--------------------+---------+--------+-----+--------+----+-----------+-----+
|       U|  43579|  751|   36480|     436|SBkXmBJLgAbOmSROj...|        1|    INAC| INAC|Inactive| 436|        NEW|  751|
|       U|  43574| 1143|    1474|     618|INkSQXOCuakseRkSa...|        2|    ACTV| ACTV|  Active| 618|        NEW| 1143|
|       U|  43561| 3568|   30347|    1761|LgEiiaOJQMRJNcDMm...|        1|    ACTV| ACTV|  Active|1761|    ADDACCT| 3568|
|       U|  43561| 3568|   30347|    1761|LgEiiaOJQMRJNcDMm...|        1|    ACTV| ACTV|  Active|1761|    UPDACCT| 3568|
|       U|  43553| 4128|   28792|    1607|FdnzvlBxEzFnsRpVd...|        1|    INAC| INAC|Inactive|1607|    ADDACCT| 4128|
+--------+-------+-----+--------

In [80]:
columns_to_drop = ['CA_ST_ID', 'ST_ID', 'CA_B_ID', 'CA_ID', 'CA_B_ID']
dim_account_df = dim_account_df.drop(*columns_to_drop)

In [81]:
dim_account_df.show(5)

+--------+-------+--------+--------+--------------------+---------+--------+----+-----------+
|CDC_FLAG|CDC_DSN|CA_B_ID |CA_C_ID |             CA_NAME|CA_TAX_ST| ST_NAME|C_ID|ACTION_TYPE|
+--------+-------+--------+--------+--------------------+---------+--------+----+-----------+
|       U|  43579|   36480|     436|SBkXmBJLgAbOmSROj...|        1|Inactive| 436|        NEW|
|       U|  43574|    1474|     618|INkSQXOCuakseRkSa...|        2|  Active| 618|        NEW|
|       U|  43561|   30347|    1761|LgEiiaOJQMRJNcDMm...|        1|  Active|1761|    ADDACCT|
|       U|  43561|   30347|    1761|LgEiiaOJQMRJNcDMm...|        1|  Active|1761|    UPDACCT|
|       U|  43553|   28792|    1607|FdnzvlBxEzFnsRpVd...|        1|Inactive|1607|    ADDACCT|
+--------+-------+--------+--------+--------------------+---------+--------+----+-----------+
only showing top 5 rows



In [82]:
dim_account_df.columns

['CDC_FLAG',
 'CDC_DSN',
 'CA_B_ID ',
 'CA_C_ID ',
 'CA_NAME',
 'CA_TAX_ST',
 'ST_NAME',
 'C_ID',
 'ACTION_TYPE']

### Populate the dim table

In [83]:
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

sqlContext.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sqlContext.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

# Queries can be expressed in HiveQL.
results = sqlContext.sql("FROM src SELECT key, value").collect()

NameError: name 'sc' is not defined

In [None]:
BinaryType – Binary data.
BooleanType – Boolean values.
ByteType – A byte value.
DateType – A datetime value.
DoubleType – A floating-point double value.
IntegerType – An integer value.
LongType – A long integer value.
NullType – A null value.


