In [17]:
from pyspark.sql import SparkSession
from pyspark.sql import Column

from pyspark.sql.functions import col, \
                                  lower, initcap, \
                                  concat, lit, substring

from typing import List, Union

In [18]:
ss = SparkSession.builder.master('local[2]').appName('format_json').getOrCreate()

In [22]:
custmer_json = "data/raw_data/cdw_sapp_custmer.json"

custmer_df = ss.read\
            .option("multiline", False)\
            .json(custmer_json)

In [23]:
def apply_format_phone_no(colmn: Column):
    return(concat(lit('('),
            substring(colmn, 1, 3),
            lit(')-'),
            substring(colmn,4, 3),
            lit('-'),
            substring(colmn, 7, 4)
            )
        )

In [24]:
def concat_with_sep(colmn_list: List[Column], sep: str = ','):
    concated_colmn = colmn_list [0]
    
    for colmn in colmn_list[1:]:
        concated_colmn = concat(concated_colmn, lit(sep))
        concated_colmn = concat(concated_colmn, colmn)
                
    return concated_colmn

In [25]:
custmer_df = custmer_df.withColumn('FIRST_NAME', initcap(col('FIRST_NAME')))
custmer_df = custmer_df.withColumn('MIDDLE_NAME', lower(col('MIDDLE_NAME')))
custmer_df = custmer_df.withColumn('LAST_NAME', initcap(col('LAST_NAME')))

custmer_df = custmer_df.withColumn('FULL_STREET_ADDRESS',
                   concat_with_sep([col('STREET_NAME'), col('APT_NO')], sep = ',')
                   )

custmer_df = custmer_df.withColumn('CUST_PHONE', apply_format_phone_no(col('CUST_PHONE')))


---

In [30]:
custmer_df.printSchema()

root
 |-- APT_NO: string (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_CITY: string (nullable = true)
 |-- CUST_COUNTRY: string (nullable = true)
 |-- CUST_EMAIL: string (nullable = true)
 |-- CUST_PHONE: string (nullable = true)
 |-- CUST_STATE: string (nullable = true)
 |-- CUST_ZIP: string (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- LAST_UPDATED: string (nullable = true)
 |-- MIDDLE_NAME: string (nullable = true)
 |-- SSN: long (nullable = true)
 |-- STREET_NAME: string (nullable = true)
 |-- FULL_STREET_ADDRESS: string (nullable = true)



In [32]:
custmer_df.show(4)

+------+----------------+------------+-------------+-------------------+-----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+--------------------+
|APT_NO|  CREDIT_CARD_NO|   CUST_CITY| CUST_COUNTRY|         CUST_EMAIL| CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN|      STREET_NAME| FULL_STREET_ADDRESS|
+------+----------------+------------+-------------+-------------------+-----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+--------------------+
|   656|4210653310061055|     Natchez|United States|AHooper@example.com|(123)-781-8|        MS|   39120|      Alec|   Hooper|2018-04-21T12:49:...|         wm|123456100|Main Street North|Main Street North...|
|   829|4210653310102868|Wethersfield|United States|EHolman@example.com|(123)-893-3|        CT|   06109|      Etta|   Holman|2018-04-21T12:49:...|    brendan|123453023|