In [19]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sql_functions


In [17]:
spark = SparkSession.builder.master('local[2]').appName('BOM management app').getOrCreate()

24/04/07 14:25:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# parquet source

In [135]:
parquet_df = spark.read.format('parquet').load('./bom_sources/parquets')
parquet_df.show()

+--------+------------+--------------------+--------+--------+----------+------------+-------------+------------+
|Part_Num|Product_Name| Product_Description|Quantity|EOL_Date|Unit_Price|Availability|Random_Column|Manufacturer|
+--------+------------+--------------------+--------+--------+----------+------------+-------------+------------+
|18181818|      'wxyz'|'Sed dont ut labo...|      19| 05-2024|        75|           1|       0.6239|'company 12'|
|26262626|    '234567'|'Sed do eitempor ...|      16| 11-2024|        77|           1|       0.7854| 'company 2'|
|13131313|    'mnopqr'|            'Duisur'|      15| 08-2023|        80|           1|       0.7412| 'company 7'|
|15151515|  'yzabcdef'|'Sunt im id est l...|      22| 11-2025|       120|           1|       0.5421| 'company 7'|
|16161616|'ghijklmnop'|    'Lorer sit amet'|      28| 02-2026|        85|           1|       0.3021|'company 10'|
|23232323|    '567890'|           'Sunorum'|      21| 04-2025|       115|           1|  

In [136]:
from functools import reduce


In [137]:

oldColumns = parquet_df.schema.names
newColumns = ["p_id", "name", "description", "quantity", "eol", "price", "availability", "rand_col", "manufacture"]


my_func = lambda input_df, idx: input_df.withColumnRenamed(oldColumns[idx], newColumns[idx])

parquet_df = reduce(
    my_func,
    range(len(oldColumns)),
    parquet_df
)

parquet_df.show()

+--------+------------+--------------------+--------+-------+-----+------------+--------+------------+
|    p_id|        name|         description|quantity|    eol|price|availability|rand_col| manufacture|
+--------+------------+--------------------+--------+-------+-----+------------+--------+------------+
|18181818|      'wxyz'|'Sed dont ut labo...|      19|05-2024|   75|           1|  0.6239|'company 12'|
|26262626|    '234567'|'Sed do eitempor ...|      16|11-2024|   77|           1|  0.7854| 'company 2'|
|13131313|    'mnopqr'|            'Duisur'|      15|08-2023|   80|           1|  0.7412| 'company 7'|
|15151515|  'yzabcdef'|'Sunt im id est l...|      22|11-2025|  120|           1|  0.5421| 'company 7'|
|16161616|'ghijklmnop'|    'Lorer sit amet'|      28|02-2026|   85|           1|  0.3021|'company 10'|
|23232323|    '567890'|           'Sunorum'|      21|04-2025|  115|           1|  0.6213| 'company 5'|
|24242424|    '012345'|          'Loreamet'|      29|12-2025|   82|      

In [138]:
parquet_df = parquet_df.drop('description').drop('rand_col')
parquet_df.show()

+--------+------------+--------+-------+-----+------------+------------+
|    p_id|        name|quantity|    eol|price|availability| manufacture|
+--------+------------+--------+-------+-----+------------+------------+
|18181818|      'wxyz'|      19|05-2024|   75|           1|'company 12'|
|26262626|    '234567'|      16|11-2024|   77|           1| 'company 2'|
|13131313|    'mnopqr'|      15|08-2023|   80|           1| 'company 7'|
|15151515|  'yzabcdef'|      22|11-2025|  120|           1| 'company 7'|
|16161616|'ghijklmnop'|      28|02-2026|   85|           1|'company 10'|
|23232323|    '567890'|      21|04-2025|  115|           1| 'company 5'|
|24242424|    '012345'|      29|12-2025|   82|           1| 'company 5'|
|14141414|    'stuvwx'|      35|07-2026|   67|           1| 'company 3'|
|17171717|    'qrstuv'|      32|10-2027|   60|           0|'company 11'|
|12312312|   'sdadasd'|      12|01-2026|  123|           1| 'company 1'|
|12312312|   'sdadasd'|      12|01-2026|  123|     

In [154]:
from datetime import datetime

parquet_df = parquet_df \
    .withColumn('name', sql_functions.regexp_replace('name', "'", '')) \
    .withColumn('manufacture', sql_functions.regexp_replace('manufacture', "'", '')) \
    .withColumn('price', sql_functions.col('price').cast('float')) \
    .withColumn('eol', sql_functions.add_months(sql_functions.to_date('eol', 'MM-yyyy'), 1))
parquet_df.show()

+--------+----------+--------+----------+-----+------------+-----------+
|    p_id|      name|quantity|       eol|price|availability|manufacture|
+--------+----------+--------+----------+-----+------------+-----------+
|26262626|    234567|      16|2025-02-01| 77.0|           1|  company 2|
|18181818|      wxyz|      19|2024-08-01| 75.0|           1| company 12|
|15151515|  yzabcdef|      22|2026-02-01|120.0|           1|  company 7|
|23232323|    567890|      21|2025-07-01|115.0|           1|  company 5|
|14141414|    stuvwx|      35|2026-10-01| 67.0|           1|  company 3|
|16161616|ghijklmnop|      28|2026-05-01| 85.0|           1| company 10|
|24242424|    012345|      29|2026-03-01| 82.0|           1|  company 5|
|12312312|   sdadasd|      12|2026-04-01|123.0|           1|  company 1|
|17171717|    qrstuv|      32|2028-01-01| 60.0|           0| company 11|
|10101010|       xyz|      18|2024-07-01| 72.0|           1|  company 3|
|45645645|    qwerty|      30|2025-09-01| 89.0|    

# csv source

In [157]:
csv_schema = StructType()\
    .add('Part_Number', 'integer') \
    .add('Name', 'string') \
    .add('Description', 'string') \
    .add('Quantity', 'integer') \
    .add('EOL', 'string') \
    .add('Unit_Price', 'float') \
    .add('Manufacturer', 'string') \
    .add('avail', 'integer')


csv_df = spark.read.format('csv').option('header', 'true').schema(csv_schema).load('./bom_sources/source_1.csv')
csv_df.show()

+-----------+------------+--------------------+--------+-------+----------+------------+-----+
|Part_Number|        Name|         Description|Quantity|    EOL|Unit_Price|Manufacturer|avail|
+-----------+------------+--------------------+--------+-------+----------+------------+-----+
|   12312312|   'sdadasd'|'sadas asdsad asdas'|      12|01-2026|     123.0| 'company 1'|    1|
|   45645645|    'qwerty'|'Lorem ipsum dolo...|      30|06-2025|      89.0| 'company 2'|    1|
|   78978978|    'foobar'|'Consectetur adip...|      25|09-2027|      55.0| 'company 3'|    0|
|   10101010|       'xyz'|'Sed do eiusmod t...|      18|04-2024|      72.0| 'company 4'|    1|
|   11111111|    'abcdef'|'Ut enim ad minim...|      40|12-2026|     105.0| 'company 5'|    1|
|   12121212|    'ghijkl'|'Quis nostrud exe...|      20|03-2025|      95.0| 'company 6'|    1|
|   13131313|    'mnopqr'|'Duis aute irure ...|      15|08-2023|      80.0| 'company 7'|    1|
|   14141414|    'stuvwx'|'Excepteur sint o...|   

In [160]:
old_columns = csv_df.schema.names
new_columns = ["p_id", "name", "description", "quantity", "eol", "price", "manufacture", "availability"]

In [161]:
csv_df = reduce(
    lambda input_df, idx: input_df.withColumnRenamed(old_columns[idx], new_columns[idx]),
    range(len(old_columns)),
    csv_df
)

In [179]:
csv_data = csv_df \
    .drop('description') \
    .withColumn('name', sql_functions.regexp_replace('name',"'", "")) \
    .withColumn('manufacture', sql_functions.regexp_replace('manufacture', "'", "")) \
    .withColumn('eol', sql_functions.add_months(sql_functions.to_date('eol', "MM-yyyy"), 1))

csv_data = csv_data.select('p_id', 'name', 'quantity', 'eol', 'price', 'availability', 'manufacture')
csv_data.show(200)


+--------+----------+--------+----------+-----+------------+-----------+
|    p_id|      name|quantity|       eol|price|availability|manufacture|
+--------+----------+--------+----------+-----+------------+-----------+
|12312312|   sdadasd|      12|2026-02-01|123.0|           1|  company 1|
|45645645|    qwerty|      30|2025-07-01| 89.0|           1|  company 2|
|78978978|    foobar|      25|2027-10-01| 55.0|           0|  company 3|
|10101010|       xyz|      18|2024-05-01| 72.0|           1|  company 4|
|11111111|    abcdef|      40|2027-01-01|105.0|           1|  company 5|
|12121212|    ghijkl|      20|2025-04-01| 95.0|           1|  company 6|
|13131313|    mnopqr|      15|2023-09-01| 80.0|           1|  company 7|
|14141414|    stuvwx|      35|2026-08-01| 67.0|           1|  company 8|
|15151515|  yzabcdef|      22|2025-12-01|120.0|           1|  company 9|
|16161616|ghijklmnop|      28|2026-03-01| 85.0|           1| company 10|
|17171717|    qrstuv|      32|2027-11-01| 60.0|    

# json source

In [169]:

json_df = spark.read.format('json').option("multiline", "true").load('./bom_sources/boms.json')
json_df = json_df.select(
    sql_functions.col('Part_Num').alias('p_id'),
    sql_functions.col('Product_Name').alias('name'),
    sql_functions.col('Quantity').alias('quantity'),
    sql_functions.col('EOL_Date').alias('eol'),
    sql_functions.col('Unit_Price').alias('price'),
    sql_functions.col('Availability').alias('availability'),
    sql_functions.col('Manufacturer').alias('manufacture')

)
json_df.show()

+--------+------------+--------+-------+-----+------------+------------+
|    p_id|        name|quantity|    eol|price|availability| manufacture|
+--------+------------+--------+-------+-----+------------+------------+
|12312312|   'sdadasd'|      12|01-2026|  123|           1| 'company 1'|
|45645645|    'qwerty'|      30|06-2025|   89|           1| 'company 2'|
|78978978|    'foobar'|      25|09-2027|   55|           0| 'company 3'|
|10101010|       'xyz'|      18|04-2024|   72|           1| 'company 4'|
|11111111|    'abcdef'|      40|12-2026|  105|           1| 'company 5'|
|12312312|   'sdadasd'|      12|01-2026|  123|           1| 'company 1'|
|13131313|    'mnopqr'|      15|08-2023|   80|           1| 'company 7'|
|14141414|    'stuvwx'|      35|07-2026|   67|           1| 'company 8'|
|15151515|  'yzabcdef'|      22|11-2025|  120|           1| 'company 9'|
|16161616|'ghijklmnop'|      28|02-2026|   85|           1|'company 10'|
|17171717|    'qrstuv'|      32|10-2027|   60|     

In [178]:
json_df = json_df \
    .withColumn('name', sql_functions.regexp_replace('name', "'", "")) \
    .withColumn('manufacture', sql_functions.regexp_replace('manufacture', "'", "")) \
    .withColumn('eol', sql_functions.add_months(sql_functions.to_date('eol', 'MM-yyyy'), 1)) \
    .withColumn('price', sql_functions.col('price').cast('float'))

json_df.show(200)

+--------+----------+--------+----------+-----+------------+-----------+
|    p_id|      name|quantity|       eol|price|availability|manufacture|
+--------+----------+--------+----------+-----+------------+-----------+
|12312312|   sdadasd|      12|2026-03-01|123.0|           1|  company 1|
|45645645|    qwerty|      30|2025-08-01| 89.0|           1|  company 2|
|78978978|    foobar|      25|2027-11-01| 55.0|           0|  company 3|
|10101010|       xyz|      18|2024-06-01| 72.0|           1|  company 4|
|11111111|    abcdef|      40|2027-02-01|105.0|           1|  company 5|
|12312312|   sdadasd|      12|2026-03-01|123.0|           1|  company 1|
|13131313|    mnopqr|      15|2023-10-01| 80.0|           1|  company 7|
|14141414|    stuvwx|      35|2026-09-01| 67.0|           1|  company 8|
|15151515|  yzabcdef|      22|2026-01-01|120.0|           1|  company 9|
|16161616|ghijklmnop|      28|2026-04-01| 85.0|           1| company 10|
|17171717|    qrstuv|      32|2027-12-01| 60.0|    

In [196]:
print(f'{csv_data.count(), json_df.count(), parquet_df.count() = }')

csv_data.count(), json_df.count(), parquet_df.count() = (44, 20, 17)


In [199]:
parquet_df.union(json_df).union(csv_data).distinct().count()

80