# PYSPARK SCHEMA RECOVERY

# StructType • StructField • Complex Types • Data Fixing

# DATASET 1 — USER PROFILE API (CORRUPTED TYPES)

# Raw data (as received from API)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark=SparkSession.builder.appName('Struct Type').getOrCreate()

In [2]:
raw_users = [
("U001","Amit","29","Hyderabad","50000"),
("U002","Neha","Thirty Two","Delhi","62000"),
("U003","Ravi",None,"Bangalore","45k"),
("U004","Pooja","28","Mumbai",58000),
("U005",None,"31","Chennai","")
]

# Exercises

# 1. Design a StructType schema for this data

In [3]:
from pyspark.sql.types import(
    StructType,
    StructField,
    StringType,
    IntegerType,
    LongType
)

In [7]:
user_schema=StructType([
    StructField('user_id',StringType(),nullable=False),
    StructField('name',StringType(),nullable=True),
    StructField('age',StringType(),nullable=True),
    StructField('city',StringType(),nullable=True),
    StructField('salary',StringType(),nullable=True)
])

# 2. Load the data using the schema

In [8]:
df_users=spark.createDataFrame(data=raw_users,schema=user_schema)


In [9]:
df_users.show()
df_users.printSchema()

+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U003| Ravi|      NULL|Bangalore|   45k|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: string (nullable = true)



# 3. Identify records that fail type conversion

In [19]:

df_cast = df_users.withColumn(
    "age_int",
    when(col("age").rlike(r"^[0-9]+$"), col("age").cast("int")).otherwise(None)
).withColumn(
    "salary_str", col("salary").cast("string")
).withColumn(
    "salary_int",
    when(col("salary_str").rlike(r"^[0-9]+$"), col("salary_str").cast("int")).otherwise(None)
)

df_fail = df_cast.filter(
    (col("age").isNotNull() & col("age_int").isNull()) |
    (col("salary").isNotNull() & col("salary_int").isNull())
).select("user_id", "name", "age", "city", "salary")

df_fail.show(truncate=False)


+-------+----+----------+---------+------+
|user_id|name|age       |city     |salary|
+-------+----+----------+---------+------+
|U002   |Neha|Thirty Two|Delhi    |62000 |
|U003   |Ravi|NULL      |Bangalore|45k   |
|U005   |NULL|31        |Chennai  |      |
+-------+----+----------+---------+------+



# 4. Convert age to integer safely

In [20]:
from pyspark.sql.functions import col, when, trim
from pyspark.sql.types import IntegerType

df_users_clean = df_users.withColumn(
    "age_clean", trim(col("age"))
).withColumn(
    "age_int",
    when(
        (col("age_clean").isNotNull()) & (col("age_clean") != '') & col("age_clean").rlike("^[0-9]+$"),
        col("age_clean").cast(IntegerType())
    ).otherwise(None)
).drop("age_clean")

print("DataFrame with safe integer age column:")
df_users_clean.select("user_id", "name", "age", "age_int").show(truncate=False)
df_users_clean.printSchema()

DataFrame with safe integer age column:
+-------+-----+----------+-------+
|user_id|name |age       |age_int|
+-------+-----+----------+-------+
|U001   |Amit |29        |29     |
|U002   |Neha |Thirty Two|NULL   |
|U003   |Ravi |NULL      |NULL   |
|U004   |Pooja|28        |28     |
|U005   |NULL |31        |31     |
+-------+-----+----------+-------+

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- age_int: integer (nullable = true)



# 5. Normalize salary into integer (handle k )

In [21]:
from pyspark.sql.functions import col, when, regexp_replace
from pyspark.sql.types import LongType

df_users_with_salary_int = df_users_clean.withColumn(
    "salary_int",
    when(
        (col("salary").isNotNull()) & (col("salary") != '') & col("salary").rlike("^[0-9]+$"),
        col("salary").cast(LongType())
    ).when(
        (col("salary").isNotNull()) & (col("salary") != '') & col("salary").rlike("^[0-9]+[kK]$"),
        regexp_replace(col("salary"), "[kK]", "000").cast(LongType())
    ).otherwise(None)
)

print("DataFrame with normalized salary column:")
df_users_with_salary_int.select("user_id", "salary", "salary_int").show(truncate=False)
df_users_with_salary_int.printSchema()

DataFrame with normalized salary column:
+-------+------+----------+
|user_id|salary|salary_int|
+-------+------+----------+
|U001   |50000 |50000     |
|U002   |62000 |62000     |
|U003   |45k   |45000     |
|U004   |58000 |58000     |
|U005   |      |NULL      |
+-------+------+----------+

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- age_int: integer (nullable = true)
 |-- salary_int: long (nullable = true)



# 6. Replace missing names with "UNKNOWN"

In [22]:
from pyspark.sql.functions import col

df_users_with_unknown_names = df_users_with_salary_int.fillna("UNKNOWN", subset=["name"])

print("DataFrame with missing names replaced by 'UNKNOWN':")
df_users_with_unknown_names.select("user_id", "name", "age_int", "salary_int").show(truncate=False)

DataFrame with missing names replaced by 'UNKNOWN':
+-------+-------+-------+----------+
|user_id|name   |age_int|salary_int|
+-------+-------+-------+----------+
|U001   |Amit   |29     |50000     |
|U002   |Neha   |NULL   |62000     |
|U003   |Ravi   |NULL   |45000     |
|U004   |Pooja  |28     |58000     |
|U005   |UNKNOWN|31     |NULL      |
+-------+-------+-------+----------+



# 7. Drop records where age cannot be recovered

In [23]:
from pyspark.sql.functions import col

df_users_final = df_users_with_unknown_names.filter(col("age_int").isNotNull())

print("DataFrame after dropping records with unrecoverable age:")
df_users_final.select("user_id", "name", "age", "age_int", "salary", "salary_int").show(truncate=False)
df_users_final.printSchema()

DataFrame after dropping records with unrecoverable age:
+-------+-------+---+-------+------+----------+
|user_id|name   |age|age_int|salary|salary_int|
+-------+-------+---+-------+------+----------+
|U001   |Amit   |29 |29     |50000 |50000     |
|U004   |Pooja  |28 |28     |58000 |58000     |
|U005   |UNKNOWN|31 |31     |      |NULL      |
+-------+-------+---+-------+------+----------+

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = false)
 |-- age: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- age_int: integer (nullable = true)
 |-- salary_int: long (nullable = true)



# 8. Produce a final clean DataFrame

In [24]:
df_final_clean = df_users_final.select("user_id", "name", "age_int", "city", "salary_int")

print("Final Clean DataFrame:")
df_final_clean.show(truncate=False)
df_final_clean.printSchema()

Final Clean DataFrame:
+-------+-------+-------+---------+----------+
|user_id|name   |age_int|city     |salary_int|
+-------+-------+-------+---------+----------+
|U001   |Amit   |29     |Hyderabad|50000     |
|U004   |Pooja  |28     |Mumbai   |58000     |
|U005   |UNKNOWN|31     |Chennai  |NULL      |
+-------+-------+-------+---------+----------+

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = false)
 |-- age_int: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- salary_int: long (nullable = true)



# DATASET 2 — E-COMMERCE ORDERS (ARRAY CORRUPTION)

In [25]:
raw_orders = [
("O001","U001","Laptop,Mobile,Tablet",75000),
("O002","U002",["Mobile","Tablet"],32000),
("O003","U003","Laptop",72000),
("O004","U004",None,25000),
("O005","U005","Laptop|Mobile",68000)
]

In [26]:
from pyspark.sql.types import ArrayType

# 1. Define a schema with ArrayType

In [27]:
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    ArrayType
)

order_schema = StructType([
    StructField('order_id', StringType(), False),
    StructField('user_id', StringType(), True),
    StructField('items', ArrayType(StringType()), True),
    StructField('amount', IntegerType(), True)
])

print("Order Schema Defined:")
print(order_schema)

Order Schema Defined:
StructType([StructField('order_id', StringType(), False), StructField('user_id', StringType(), True), StructField('items', ArrayType(StringType(), True), True), StructField('amount', IntegerType(), True)])


# 2. Normalize all item values into arrays

In [28]:
from pyspark.sql.functions import col, when, split, array, lit, trim
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, IntegerType

processed_raw_orders = []
for order in raw_orders:
    order_id, user_id, items_data, amount = order
    if isinstance(items_data, list):
        items_data_str = ",".join(items_data)
    elif items_data is None:
        items_data_str = None
    else:
        items_data_str = str(items_data)
    processed_raw_orders.append((order_id, user_id, items_data_str, amount))
intermediate_order_schema = StructType([
    StructField('order_id', StringType(), False),
    StructField('user_id', StringType(), True),
    StructField('items_str', StringType(), True),
    StructField('amount', IntegerType(), True)
])

df_orders_raw = spark.createDataFrame(data=processed_raw_orders, schema=intermediate_order_schema)

df_orders_clean = df_orders_raw.withColumn(
    "items",
    when(col("items_str").isNull(), array().cast(ArrayType(StringType())))
    .when(trim(col("items_str")) == lit(""), array().cast(ArrayType(StringType())))
    .when(col("items_str").contains(","), split(col("items_str"), ","))
    .when(col("items_str").contains("|"), split(col("items_str"), "\\|"))
    .otherwise(array(col("items_str")))
).drop("items_str")

print("Normalized Orders DataFrame:")
df_orders_clean.show(truncate=False)
df_orders_clean.printSchema()

Normalized Orders DataFrame:
+--------+-------+------+------------------------+
|order_id|user_id|amount|items                   |
+--------+-------+------+------------------------+
|O001    |U001   |75000 |[Laptop, Mobile, Tablet]|
|O002    |U002   |32000 |[Mobile, Tablet]        |
|O003    |U003   |72000 |[Laptop]                |
|O004    |U004   |25000 |[]                      |
|O005    |U005   |68000 |[Laptop, Mobile]        |
+--------+-------+------+------------------------+

root
 |-- order_id: string (nullable = false)
 |-- user_id: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)



# 3. Handle multiple delimiters

In [30]:
df_orders_clean = df_orders_raw.withColumn(
    "items",
    when(col("items_str").isNull(), array().cast(ArrayType(StringType())))
    .when(trim(col("items_str")) == lit(""), array().cast(ArrayType(StringType())))
    .when(col("items_str").contains(","), split(col("items_str"), ","))
    .when(col("items_str").contains("|"), split(col("items_str"), "\\|"))
    .otherwise(array(col("items_str")))
).drop("items_str")

print("Normalized Orders DataFrame:")
df_orders_clean.show(truncate=False)
df_orders_clean.printSchema()

Normalized Orders DataFrame:
+--------+-------+------+------------------------+
|order_id|user_id|amount|items                   |
+--------+-------+------+------------------------+
|O001    |U001   |75000 |[Laptop, Mobile, Tablet]|
|O002    |U002   |32000 |[Mobile, Tablet]        |
|O003    |U003   |72000 |[Laptop]                |
|O004    |U004   |25000 |[]                      |
|O005    |U005   |68000 |[Laptop, Mobile]        |
+--------+-------+------+------------------------+

root
 |-- order_id: string (nullable = false)
 |-- user_id: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)



#4. Replace null items with empty arrays

In [31]:


print("df_orders_clean with null items already replaced by empty arrays:")
df_orders_clean.show(truncate=False)
df_orders_clean.printSchema()

df_orders_clean with null items already replaced by empty arrays:
+--------+-------+------+------------------------+
|order_id|user_id|amount|items                   |
+--------+-------+------+------------------------+
|O001    |U001   |75000 |[Laptop, Mobile, Tablet]|
|O002    |U002   |32000 |[Mobile, Tablet]        |
|O003    |U003   |72000 |[Laptop]                |
|O004    |U004   |25000 |[]                      |
|O005    |U005   |68000 |[Laptop, Mobile]        |
+--------+-------+------+------------------------+

root
 |-- order_id: string (nullable = false)
 |-- user_id: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)



# 5. Explode items into one row per item

In [32]:
from pyspark.sql.functions import explode

df_orders_exploded = df_orders_clean.withColumn("item", explode(col("items")))

print("Orders DataFrame after exploding 'items' column:")
df_orders_exploded.select("order_id", "user_id", "item", "amount").show(truncate=False)
df_orders_exploded.printSchema()

Orders DataFrame after exploding 'items' column:
+--------+-------+------+------+
|order_id|user_id|item  |amount|
+--------+-------+------+------+
|O001    |U001   |Laptop|75000 |
|O001    |U001   |Mobile|75000 |
|O001    |U001   |Tablet|75000 |
|O002    |U002   |Mobile|32000 |
|O002    |U002   |Tablet|32000 |
|O003    |U003   |Laptop|72000 |
|O005    |U005   |Laptop|68000 |
|O005    |U005   |Mobile|68000 |
+--------+-------+------+------+

root
 |-- order_id: string (nullable = false)
 |-- user_id: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- item: string (nullable = true)



# 6. Count frequency of each item

In [33]:
df_orders_frequency=df_orders_clean.withColumn("item",explode(col("items"))).groupBy("item").count()
print("Frequency of Each Item:")
df_orders_frequency.show()

Frequency of Each Item:
+------+-----+
|  item|count|
+------+-----+
|Laptop|    3|
|Mobile|    3|
|Tablet|    2|
+------+-----+



# 7. Identify orders with more than 2 items

In [34]:
from pyspark.sql.functions import col, size

df_orders_more_than_2_items = df_orders_clean.filter(size(col("items")) > 2)

print("Orders with more than 2 items:")
df_orders_more_than_2_items.select("order_id", "items").show(truncate=False)

Orders with more than 2 items:
+--------+------------------------+
|order_id|items                   |
+--------+------------------------+
|O001    |[Laptop, Mobile, Tablet]|
+--------+------------------------+



# DATASET 3 — DEVICE USAGE (MAP CORRUPTION)

In [35]:
raw_devices = [
("U001",{"mobile":120,"laptop":300}),
("U002","mobile:200,tablet:100"),
("U003",{"desktop":"400","mobile":"150"}),
("U004",None),
("U005","laptop-250")
]

# 1. Design a MapType(StringType, IntegerType) schema

In [36]:
from pyspark.sql.types import MapType

In [37]:
device_schema=StructType([
    StructField('user_id',StringType(),nullable=False),
    StructField('device_usage',MapType(StringType(),IntegerType()),nullable=True)
])

# 2. Parse string maps into proper maps

In [38]:
from pyspark.sql.functions import col, when, split, lit, array, create_map, regexp_replace, trim, map_from_entries, transform, struct
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField

processed_raw_devices = []
for user_id, device_data in raw_devices:
    device_data_str = None
    if isinstance(device_data, dict):
        parts = []
        for k, v in device_data.items():
            parts.append(f"{k}:{v}")
        device_data_str = ",".join(parts)
    elif isinstance(device_data, str):
        if '-' in device_data and ':' not in device_data:
            device_data_str = device_data.replace('-', ':')
        else:
            device_data_str = device_data
    processed_raw_devices.append((user_id, device_data_str))

intermediate_device_schema = StructType([
    StructField('user_id', StringType(), False),
    StructField('device_usage_raw_str', StringType(), True)
])

df_devices_raw_str = spark.createDataFrame(data=processed_raw_devices, schema=intermediate_device_schema)

df_devices_clean = df_devices_raw_str.withColumn(
    "device_usage",
    when(
        col("device_usage_raw_str").isNull() | (trim(col("device_usage_raw_str")) == lit("")),
        create_map().cast(MapType(StringType(), IntegerType()))
    ).otherwise(
        map_from_entries(
            transform(
                split(col("device_usage_raw_str"), ","),
                lambda kv_pair:
                    struct(
                        split(kv_pair, ":").getItem(0).alias("key"),
                        when(
                            split(kv_pair, ":").getItem(1).rlike("^[0-9]+$"),
                            split(kv_pair, ":").getItem(1).cast(IntegerType())
                        ).otherwise(lit(None)).alias("value")
                    )
            )
        )
    )
).drop("device_usage_raw_str")

print("Cleaned Device Usage DataFrame:")
df_devices_clean.show(truncate=False)
df_devices_clean.printSchema()

Cleaned Device Usage DataFrame:
+-------+-------------------------------+
|user_id|device_usage                   |
+-------+-------------------------------+
|U001   |{mobile -> 120, laptop -> 300} |
|U002   |{mobile -> 200, tablet -> 100} |
|U003   |{desktop -> 400, mobile -> 150}|
|U004   |{}                             |
|U005   |{laptop -> 250}                |
+-------+-------------------------------+

root
 |-- user_id: string (nullable = false)
 |-- device_usage: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)



# 3. Convert all usage values to integers

In [39]:
print("df_devices_clean already has usage values converted to integers:")
df_devices_clean.show(truncate=False)
df_devices_clean.printSchema()

df_devices_clean already has usage values converted to integers:
+-------+-------------------------------+
|user_id|device_usage                   |
+-------+-------------------------------+
|U001   |{mobile -> 120, laptop -> 300} |
|U002   |{mobile -> 200, tablet -> 100} |
|U003   |{desktop -> 400, mobile -> 150}|
|U004   |{}                             |
|U005   |{laptop -> 250}                |
+-------+-------------------------------+

root
 |-- user_id: string (nullable = false)
 |-- device_usage: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)



# 4. Handle malformed key-value pairs

In [40]:
print("df_devices_clean already handles malformed key-value pairs by setting malformed values to NULL:")
df_devices_clean.show(truncate=False)
df_devices_clean.printSchema()

df_devices_clean already handles malformed key-value pairs by setting malformed values to NULL:
+-------+-------------------------------+
|user_id|device_usage                   |
+-------+-------------------------------+
|U001   |{mobile -> 120, laptop -> 300} |
|U002   |{mobile -> 200, tablet -> 100} |
|U003   |{desktop -> 400, mobile -> 150}|
|U004   |{}                             |
|U005   |{laptop -> 250}                |
+-------+-------------------------------+

root
 |-- user_id: string (nullable = false)
 |-- device_usage: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)



# 5. Replace missing maps with empty maps

In [41]:
print("df_devices_clean already has missing (NULL) maps replaced with empty maps:")
df_devices_clean.show(truncate=False)
df_devices_clean.printSchema()

df_devices_clean already has missing (NULL) maps replaced with empty maps:
+-------+-------------------------------+
|user_id|device_usage                   |
+-------+-------------------------------+
|U001   |{mobile -> 120, laptop -> 300} |
|U002   |{mobile -> 200, tablet -> 100} |
|U003   |{desktop -> 400, mobile -> 150}|
|U004   |{}                             |
|U005   |{laptop -> 250}                |
+-------+-------------------------------+

root
 |-- user_id: string (nullable = false)
 |-- device_usage: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)



# 6. Extract mobile usage safely

In [42]:
from pyspark.sql.functions import col

df_devices_with_mobile_usage = df_devices_clean.withColumn(
    "mobile_usage", col("device_usage")["mobile"]
)

print("DataFrame with extracted mobile usage:")
df_devices_with_mobile_usage.select("user_id", "device_usage", "mobile_usage").show(truncate=False)
df_devices_with_mobile_usage.printSchema()

DataFrame with extracted mobile usage:
+-------+-------------------------------+------------+
|user_id|device_usage                   |mobile_usage|
+-------+-------------------------------+------------+
|U001   |{mobile -> 120, laptop -> 300} |120         |
|U002   |{mobile -> 200, tablet -> 100} |200         |
|U003   |{desktop -> 400, mobile -> 150}|150         |
|U004   |{}                             |NULL        |
|U005   |{laptop -> 250}                |NULL        |
+-------+-------------------------------+------------+

root
 |-- user_id: string (nullable = false)
 |-- device_usage: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)
 |-- mobile_usage: integer (nullable = true)



# 7. Identify users with usage above a threshold

In [43]:
from pyspark.sql.functions import col
usage_threshold = 150

df_users_above_threshold = df_devices_with_mobile_usage.filter(
    col("mobile_usage").isNotNull() & (col("mobile_usage") > usage_threshold)
)

print(f"Users with mobile usage above {usage_threshold}:")
df_users_above_threshold.select("user_id", "mobile_usage").show(truncate=False)

Users with mobile usage above 150:
+-------+------------+
|user_id|mobile_usage|
+-------+------------+
|U002   |200         |
+-------+------------+



# DATASET 4 — NESTED ADDRESS JSON (BROKEN STRUCTS)

In [44]:
raw_profiles = [
("U001","Hyderabad,Telangana,500081"),
("U002",{"city":"Delhi","state":"Delhi","pincode":"110001"}),
("U003",("Bangalore","Karnataka",560001)),
("U004","Mumbai,MH"),
("U005",None)
]

# 1. Design a nested StructType for address

In [45]:
address_schema=StructType([
    StructField('city',StringType(),nullable=True),
    StructField('state',StringType(),nullable=True),
    StructField('pincode',IntegerType(),nullable=True)
])

profile_schema=StructType([
    StructField('user_id',StringType(),nullable=False),
    StructField('address',address_schema,nullable=True)
])

# 2. Normalize all address formats into struct

In [47]:
from pyspark.sql.functions import col, when, split, lit, struct, array_contains, get
from pyspark.sql.types import StringType, IntegerType, StructType, StructField

processed_raw_profiles = []
for user_id, address_data in raw_profiles:
    address_str = None
    if isinstance(address_data, str):
        address_str = address_data
    elif isinstance(address_data, dict):
        city = address_data.get('city')
        state = address_data.get('state')
        pincode = address_data.get('pincode')
        address_str = f"{city or ''},{state or ''},{pincode or ''}"
    elif isinstance(address_data, tuple):
        city, state, pincode = address_data
        address_str = f"{city or ''},{state or ''},{pincode or ''}"
    processed_raw_profiles.append((user_id, address_str))

intermediate_profile_schema = StructType([
    StructField('user_id', StringType(), False),
    StructField('address_raw_str', StringType(), True)
])

df_profiles_raw_str = spark.createDataFrame(data=processed_raw_profiles, schema=intermediate_profile_schema)

df_profiles_clean = df_profiles_raw_str.withColumn(
    "address",
    when(
        col("address_raw_str").isNull() | (col("address_raw_str") == lit("")),
        lit(None).cast(address_schema)
    ).otherwise(
        struct(
            get(split(col("address_raw_str"), ","), 0).alias("city"),
            get(split(col("address_raw_str"), ","), 1).alias("state"),
            when(
                get(split(col("address_raw_str"), ","), 2).rlike("^[0-9]+$"),
                get(split(col("address_raw_str"), ","), 2).cast(IntegerType())
            ).otherwise(lit(None)).alias("pincode")
        )
    )
).drop("address_raw_str")

print("Cleaned Profiles DataFrame with normalized address:")
df_profiles_clean.show(truncate=False)
df_profiles_clean.printSchema()

Cleaned Profiles DataFrame with normalized address:
+-------+------------------------------+
|user_id|address                       |
+-------+------------------------------+
|U001   |{Hyderabad, Telangana, 500081}|
|U002   |{Delhi, Delhi, 110001}        |
|U003   |{Bangalore, Karnataka, 560001}|
|U004   |{Mumbai, MH, NULL}            |
|U005   |NULL                          |
+-------+------------------------------+

root
 |-- user_id: string (nullable = false)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- pincode: integer (nullable = true)



# 3. Extract city, state, pincode safely

In [60]:
print("City, State, and Pincode have already been safely extracted and flattened:")
df_profiles_flat.show(truncate=False)
df_profiles_flat.printSchema()

City, State, and Pincode have already been safely extracted and flattened:
+-------+---------+---------+-------+
|user_id|city     |state    |pincode|
+-------+---------+---------+-------+
|U001   |Hyderabad|Telangana|500081 |
|U002   |Delhi    |Delhi    |110001 |
|U003   |Bangalore|Karnataka|560001 |
|U004   |Mumbai   |MH       |0      |
+-------+---------+---------+-------+

root
 |-- user_id: string (nullable = false)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- pincode: integer (nullable = true)



# 4. Set default pincode when missing

In [48]:
from pyspark.sql.functions import col, when, lit, struct

default_pincode = 0

df_profiles_with_default_pincode = df_profiles_clean.withColumn(
    "address",
    when(
        col("address").isNotNull(),
        struct(
            col("address.city").alias("city"),
            col("address.state").alias("state"),
            when(col("address.pincode").isNull(), lit(default_pincode).cast(IntegerType()))
            .otherwise(col("address.pincode"))
            .alias("pincode")
        )
    ).otherwise(lit(None).cast(address_schema))
)

print("Profiles DataFrame with default pincode:")
df_profiles_with_default_pincode.show(truncate=False)
df_profiles_with_default_pincode.printSchema()

Profiles DataFrame with default pincode:
+-------+------------------------------+
|user_id|address                       |
+-------+------------------------------+
|U001   |{Hyderabad, Telangana, 500081}|
|U002   |{Delhi, Delhi, 110001}        |
|U003   |{Bangalore, Karnataka, 560001}|
|U004   |{Mumbai, MH, 0}               |
|U005   |NULL                          |
+-------+------------------------------+

root
 |-- user_id: string (nullable = false)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- pincode: integer (nullable = true)



# 5. Drop irrecoverable records

In [49]:
from pyspark.sql.functions import col

df_profiles_final = df_profiles_with_default_pincode.filter(col("address").isNotNull())

print("Profiles DataFrame after dropping irrecoverable address records:")
df_profiles_final.show(truncate=False)
df_profiles_final.printSchema()

Profiles DataFrame after dropping irrecoverable address records:
+-------+------------------------------+
|user_id|address                       |
+-------+------------------------------+
|U001   |{Hyderabad, Telangana, 500081}|
|U002   |{Delhi, Delhi, 110001}        |
|U003   |{Bangalore, Karnataka, 560001}|
|U004   |{Mumbai, MH, 0}               |
+-------+------------------------------+

root
 |-- user_id: string (nullable = false)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- pincode: integer (nullable = true)



# 6. Flatten the struct into columns

In [50]:
from pyspark.sql.functions import col

df_profiles_flat = df_profiles_final.select("user_id", col("address.city"), col("address.state"), col("address.pincode"))

print("Profiles DataFrame with address flattened into columns:")
df_profiles_flat.show(truncate=False)
df_profiles_flat.printSchema()

Profiles DataFrame with address flattened into columns:
+-------+---------+---------+-------+
|user_id|city     |state    |pincode|
+-------+---------+---------+-------+
|U001   |Hyderabad|Telangana|500081 |
|U002   |Delhi    |Delhi    |110001 |
|U003   |Bangalore|Karnataka|560001 |
|U004   |Mumbai   |MH       |0      |
+-------+---------+---------+-------+

root
 |-- user_id: string (nullable = false)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- pincode: integer (nullable = true)



# DATASET 5 — TRANSACTION LOGS (MIXED DATES & NUMBERS)

In [51]:
raw_transactions = [
("T001","2024-01-05","45000"),
("T002","05/01/2024",52000),
("T003","Jan 06 2024","Thirty Thousand"),
("T004",None,38000),
("T005","2024/01/07","42000")
]

# 1. Design schema using StructType

In [52]:
transactions_schema= StructType([
    StructField('transaction_id',StringType(),nullable=False),
    StructField('transaction_date',StringType(),nullable=True),
    StructField('transaction_amount',StringType(),nullable=True)
])

# 2. Normalize all dates into DateType

In [53]:
from pyspark.sql.functions import col, to_date, when
from pyspark.sql.types import DateType

df_transactions = spark.createDataFrame(data=raw_transactions, schema=transactions_schema)

df_transactions_clean = df_transactions.withColumn(
    "transaction_date_clean",
    when(col("transaction_date").rlike("^\\d{4}-\\d{2}-\\d{2}$"), to_date(col("transaction_date"), "yyyy-MM-dd"))
    .when(col("transaction_date").rlike("^\\d{2}/\\d{2}/\\d{4}$"), to_date(col("transaction_date"), "dd/MM/yyyy"))
    .when(col("transaction_date").rlike("^[A-Za-z]{3} \\d{2} \\d{4}$"), to_date(col("transaction_date"), "MMM dd yyyy"))
    .when(col("transaction_date").rlike("^\\d{4}/\\d{2}/\\d{2}$"), to_date(col("transaction_date"), "yyyy/MM/dd"))
    .otherwise(None)
)

print("Transactions DataFrame with normalized dates:")
df_transactions_clean.select("transaction_id", "transaction_date", "transaction_date_clean", "transaction_amount").show(truncate=False)
df_transactions_clean.printSchema()

Transactions DataFrame with normalized dates:
+--------------+----------------+----------------------+------------------+
|transaction_id|transaction_date|transaction_date_clean|transaction_amount|
+--------------+----------------+----------------------+------------------+
|T001          |2024-01-05      |2024-01-05            |45000             |
|T002          |05/01/2024      |2024-01-05            |52000             |
|T003          |Jan 06 2024     |2024-01-06            |Thirty Thousand   |
|T004          |NULL            |NULL                  |38000             |
|T005          |2024/01/07      |2024-01-07            |42000             |
+--------------+----------------+----------------------+------------------+

root
 |-- transaction_id: string (nullable = false)
 |-- transaction_date: string (nullable = true)
 |-- transaction_amount: string (nullable = true)
 |-- transaction_date_clean: date (nullable = true)



# 3. Convert amount into integer

In [56]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType

df_amount = df_transactions_clean.withColumn(
    "transaction_amount_int",
    when(
        (col("transaction_amount").isNotNull()) & (col("transaction_amount") != '') & col("transaction_amount").rlike("^[0-9]+$"),
        col("transaction_amount").cast(IntegerType())
    ).otherwise(None)
)

print("Transactions DataFrame with normalized amounts:")
df_amount.select("transaction_id", "transaction_amount", "transaction_amount_int").show(truncate=False)
df_amount.printSchema()

Transactions DataFrame with normalized amounts:
+--------------+------------------+----------------------+
|transaction_id|transaction_amount|transaction_amount_int|
+--------------+------------------+----------------------+
|T001          |45000             |45000                 |
|T002          |52000             |52000                 |
|T003          |Thirty Thousand   |NULL                  |
|T004          |38000             |38000                 |
|T005          |42000             |42000                 |
+--------------+------------------+----------------------+

root
 |-- transaction_id: string (nullable = false)
 |-- transaction_date: string (nullable = true)
 |-- transaction_amount: string (nullable = true)
 |-- transaction_date_clean: date (nullable = true)
 |-- transaction_amount_int: integer (nullable = true)



# 4. Identify unrecoverable records

In [57]:
from pyspark.sql.functions import col

failed_date_conversion = df_amount.filter(
    (col("transaction_date_clean").isNull()) &
    (col("transaction_date").isNotNull()) &
    (col("transaction_date") != '')
)

failed_amount_conversion = df_amount.filter(
    (col("transaction_amount_int").isNull()) &
    (col("transaction_amount").isNotNull()) &
    (col("transaction_amount") != '')
)

print("Records with unrecoverable transaction dates:")
failed_date_conversion.select("transaction_id", "transaction_date", "transaction_date_clean").show(truncate=False)

print("Records with unrecoverable transaction amounts:")
failed_amount_conversion.select("transaction_id", "transaction_amount", "transaction_amount_int").show(truncate=False)

Records with unrecoverable transaction dates:
+--------------+----------------+----------------------+
|transaction_id|transaction_date|transaction_date_clean|
+--------------+----------------+----------------------+
+--------------+----------------+----------------------+

Records with unrecoverable transaction amounts:
+--------------+------------------+----------------------+
|transaction_id|transaction_amount|transaction_amount_int|
+--------------+------------------+----------------------+
|T003          |Thirty Thousand   |NULL                  |
+--------------+------------------+----------------------+



# 5. Separate valid vs invalid transactions

In [58]:
from pyspark.sql.functions import col

df_valid_transactions = df_amount.filter(
    col("transaction_date_clean").isNotNull() & col("transaction_amount_int").isNotNull()
)

df_invalid_transactions = df_amount.filter(
    (col("transaction_date_clean").isNull() & col("transaction_date").isNotNull() & (col("transaction_date") != '')) |
    (col("transaction_amount_int").isNull() & col("transaction_amount").isNotNull() & (col("transaction_amount") != ''))
)

print("Valid Transactions:")
df_valid_transactions.show(truncate=False)

print("Invalid Transactions:")
df_invalid_transactions.show(truncate=False)

Valid Transactions:
+--------------+----------------+------------------+----------------------+----------------------+
|transaction_id|transaction_date|transaction_amount|transaction_date_clean|transaction_amount_int|
+--------------+----------------+------------------+----------------------+----------------------+
|T001          |2024-01-05      |45000             |2024-01-05            |45000                 |
|T002          |05/01/2024      |52000             |2024-01-05            |52000                 |
|T005          |2024/01/07      |42000             |2024-01-07            |42000                 |
+--------------+----------------+------------------+----------------------+----------------------+

Invalid Transactions:
+--------------+----------------+------------------+----------------------+----------------------+
|transaction_id|transaction_date|transaction_amount|transaction_date_clean|transaction_amount_int|
+--------------+----------------+------------------+--------------

# 6. Produce a clean transactions DataFrame

In [59]:
df_final_clean_transactions = df_valid_transactions.select("transaction_id", col("transaction_date_clean").alias("transaction_date"), col("transaction_amount_int").alias("transaction_amount"))

print("Final Clean Transactions DataFrame:")
df_final_clean_transactions.show(truncate=False)
df_final_clean_transactions.printSchema()

Final Clean Transactions DataFrame:
+--------------+----------------+------------------+
|transaction_id|transaction_date|transaction_amount|
+--------------+----------------+------------------+
|T001          |2024-01-05      |45000             |
|T002          |2024-01-05      |52000             |
|T005          |2024-01-07      |42000             |
+--------------+----------------+------------------+

root
 |-- transaction_id: string (nullable = false)
 |-- transaction_date: date (nullable = true)
 |-- transaction_amount: integer (nullable = true)

