<a href="https://colab.research.google.com/github/VictoriaUsman/Big-Data/blob/main/Schema_Enforcement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Enforce Schema").getOrCreate()

In [6]:
df = spark.read.csv("/content/first_100_customers.csv", header=True)

In [7]:
df.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     True|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    False|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    False|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    False|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    False|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     True|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

In [8]:
df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: string (nullable = true)



Stuct Method

In [2]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, BooleanType, FloatType

In [9]:
df.head()

Row(customer_id='0', name='Customer_0', city='Pune', state='Maharashtra', country='India', registration_date='2023-06-29', is_active='False')

In [14]:
schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("country", IntegerType(), True),
    StructField("registration_date", StringType(), True),
    StructField("is_active", BooleanType(), True)])

In [21]:
df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .schema(schema) \
    .load("/content/first_100_customers.csv")


In [22]:
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: integer (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: boolean (nullable = true)



In [23]:
df.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|   NULL|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|   NULL|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|   NULL|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|   NULL|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|   NULL|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|   NULL|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|   NULL|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|   NULL|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|   NULL|       2023-06-22|   

Using DDL

In [24]:
df.head()

Row(customer_id=0, name='Customer_0', city='Pune', state='Maharashtra', country=None, registration_date='2023-06-29', is_active=False)

In [28]:
ddl_schema = "customer_id INT, name BOOLEAN, city STRING,state STRING,country STRING,registration_date STRING, is_active BOOLEAN"

In [29]:
df2 = spark.read \
    .format("csv") \
    .option("header", "true") \
    .schema(ddl_schema) \
    .load("/content/first_100_customers.csv")

In [30]:
df2.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: boolean (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: boolean (nullable = true)



In [31]:
df2.show()

+-----------+----+---------+-----------+-------+-----------------+---------+
|customer_id|name|     city|      state|country|registration_date|is_active|
+-----------+----+---------+-----------+-------+-----------------+---------+
|          0|NULL|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|NULL|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|NULL|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|NULL|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|NULL|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5|NULL|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6|NULL|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7|NULL|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8|NULL|     Pune|  Karnataka|  India|       2023-06-22|     true|
|          9|NULL|   Mumbai|  Telangana|  India|       2023-01-05|     true|