In [0]:
# Look up the login name of the user running the notebook; later cells embed it
# in the per-user output path.
import getpass
username = getpass.getuser()

In [0]:
# Source folder that is listed and read by the conversion loop below.
input_dir = '/public/retail_db'
# Per-user target folder that receives the pipe-delimited copies.
output_dir = f'/user/{username}/retail_db_pipe'

In [0]:
# List the source folder through the shared `input_dir` setting so this preview
# always shows the same location the conversion loop reads from.
dbutils.fs.ls(input_dir)

[FileInfo(path='dbfs:/public/retail_db/README.md', name='README.md', size=826, modificationTime=1688522119000),
 FileInfo(path='dbfs:/public/retail_db/categories/', name='categories/', size=0, modificationTime=1688522106000),
 FileInfo(path='dbfs:/public/retail_db/create_db.sql', name='create_db.sql', size=10303495, modificationTime=1688522111000),
 FileInfo(path='dbfs:/public/retail_db/create_db_tables_pg.sql', name='create_db_tables_pg.sql', size=1830, modificationTime=1688522112000),
 FileInfo(path='dbfs:/public/retail_db/customers/', name='customers/', size=0, modificationTime=1688522112000),
 FileInfo(path='dbfs:/public/retail_db/departments/', name='departments/', size=0, modificationTime=1688522113000),
 FileInfo(path='dbfs:/public/retail_db/load_db_tables_pg.sql', name='load_db_tables_pg.sql', size=10297392, modificationTime=1688522116000),
 FileInfo(path='dbfs:/public/retail_db/order_items/', name='order_items/', size=0, modificationTime=1688522117000),
 FileInfo(path='dbfs:/p

In [0]:
# Rewrite every data set folder under `input_dir` as pipe-delimited CSV under `output_dir`.
for file_details in dbutils.fs.ls(input_dir):
    # Only sub-folders hold data sets. Loose files (README.md, *.sql dumps) are skipped:
    # the old filter let README.md through, parsed it as CSV and wrote it to
    # `{output_dir}/retail_db` because split('/')[-2] resolved to the parent folder.
    if not file_details.isDir() or '.git' in file_details.path:
        continue
    print(f'Converting data in {file_details.path} folder from comma seperated to pipe seperated')
    df = spark.read.csv(file_details.path)
    # Directory entries are named like 'orders/'; drop the trailing slash to get the folder name.
    folder_name = file_details.name.rstrip('/')
    # coalesce(1) collapses the frame to one partition before the write;
    # mode('overwrite') keeps the cell re-runnable.
    df.coalesce(1).write.mode('overwrite').csv(f'{output_dir}/{folder_name}', sep = '|')

Converting data in dbfs:/public/retail_db/README.md folder from comma seperated to pipe seperated
Converting data in dbfs:/public/retail_db/categories/ folder from comma seperated to pipe seperated
Converting data in dbfs:/public/retail_db/customers/ folder from comma seperated to pipe seperated
Converting data in dbfs:/public/retail_db/departments/ folder from comma seperated to pipe seperated
Converting data in dbfs:/public/retail_db/order_items/ folder from comma seperated to pipe seperated
Converting data in dbfs:/public/retail_db/orders/ folder from comma seperated to pipe seperated
Converting data in dbfs:/public/retail_db/products/ folder from comma seperated to pipe seperated


In [0]:
# Column names and types for the orders data set, written as a DDL-style string;
# the read cells below pass it to spark.read.schema(...).
schema = """ 
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

In [0]:
# Read the converted orders from `output_dir` (instead of re-spelling the path by hand).
# NOTE(review): no `sep` is passed here, so the reader uses its default comma delimiter
# on pipe-delimited files; this looks like a deliberate demonstration of the resulting
# all-NULL rows, since a later cell repeats the read with sep='|'. Confirm before changing.
orders = spark.read.schema(schema).csv(f'{output_dir}/orders')

In [0]:
# Preview the rows; every column prints as NULL because the read above did not
# specify the pipe delimiter.
orders.show()

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|        NULL|
|    NULL|      NULL|             NULL|       

In [0]:
# Re-read the converted orders from `output_dir`, this time with the same '|' delimiter
# that was used when writing, so each field lands in its schema column.
orders = spark.read.schema(schema).csv(f'{output_dir}/orders', sep = '|')

In [0]:
# Preview the rows again; with the matching delimiter the columns are populated.
orders.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0