In [8]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that the rest of the notebook reads through.
spark = (
    SparkSession.builder
    .appName("schema_enforcement")
    .getOrCreate()
)

##### Read data without a header, without schema enforcement (types inferred)

In [14]:
# Headerless CSV with no explicit schema: Spark infers the column types and
# falls back to positional column names (_c0, _c1, ...).
df1 = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .load("hdfs://localhost:9000/user/tkm/retail_db/orders.csv")
)

In [15]:
# Preview the first three rows of the inferred-schema DataFrame.
df1.show(n=3)

+---+-------------------+-----+---------------+
|_c0|                _c1|  _c2|            _c3|
+---+-------------------+-----+---------------+
|  1|2013-07-25 00:00:00|11599|         CLOSED|
|  2|2013-07-25 00:00:00|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:00|12111|       COMPLETE|
+---+-------------------+-----+---------------+
only showing top 3 rows



In [16]:
# Show the column names and types Spark inferred for the headerless file.
df1.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: timestamp (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)



##### Read data without a header, with schema enforcement (DDL string schema)

In [17]:
# DDL-style schema string: one "name type" pair per column, in file order.
# Adjacent literals are concatenated by Python into a single string.
orders_schema = (
    'order_id long , '
    'order_date timestamp , '
    'customer_id integer , '
    'order_status string'
)

In [18]:
# Same headerless file, but with the column names and types supplied up front
# through the DDL string instead of being inferred.
df2 = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("hdfs://localhost:9000/user/tkm/retail_db/orders.csv")
)

In [19]:
# Preview the first three rows read with the DDL-string schema.
df2.show(n=3)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 3 rows



In [20]:
# Confirm the DDL-string schema was applied (order_id is long here, whereas
# the inferred read above produced integer).
df2.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



##### Read data without a header, with schema enforcement (StructType schema)

In [21]:
# importing types
from pyspark.sql.types import *

In [23]:
# Programmatic schema: build the StructType one column at a time, in file order.
# Each added field is nullable by default, as with a plain StructField.
orders_schema_structtype = (
    StructType()
    .add("orderid", LongType())
    .add("orderdate", TimestampType())
    .add("customerid", IntegerType())
    .add("orderstatus", StringType())
)

In [25]:
# Headerless file again, this time enforcing the StructType-based schema.
df3 = (
    spark.read
    .format("csv")
    .schema(orders_schema_structtype)
    .load("hdfs://localhost:9000/user/tkm/retail_db/orders.csv")
)
            
           

In [26]:
# Preview the first three rows read with the StructType schema.
df3.show(n=3)

+-------+-------------------+----------+---------------+
|orderid|          orderdate|customerid|    orderstatus|
+-------+-------------------+----------+---------------+
|      1|2013-07-25 00:00:00|     11599|         CLOSED|
|      2|2013-07-25 00:00:00|       256|PENDING_PAYMENT|
|      3|2013-07-25 00:00:00|     12111|       COMPLETE|
+-------+-------------------+----------+---------------+
only showing top 3 rows



In [27]:
# Confirm the StructType schema's column names and types were applied.
df3.printSchema()

root
 |-- orderid: long (nullable = true)
 |-- orderdate: timestamp (nullable = true)
 |-- customerid: integer (nullable = true)
 |-- orderstatus: string (nullable = true)



##### Read data with a header, without schema enforcement (types inferred)

In [30]:
# File with a header row: column names come from the first line and the
# column types are inferred by Spark.
df4 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs://localhost:9000/user/tkm/data/order.csv")
)

In [29]:
# Preview the first three rows of the header + inferred-schema read.
df4.show(n=3)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 3 rows



In [33]:
# Column names come from the header row; the types are the ones Spark inferred.
df4.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



##### Read data with a header, with schema enforcement

In [31]:
# File with a header row, read with the explicit DDL-string schema rather
# than inferred types.
df5 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(orders_schema)
    .load("hdfs://localhost:9000/user/tkm/data/order.csv")
)

In [34]:
# Confirm the explicit schema is in effect for the file that has a header row.
df5.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [32]:
# Preview the first three rows of the header + explicit-schema read.
df5.show(n=3)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 3 rows



## Different ways of selecting columns in a DataFrame

In [35]:
# Option 1: reference the columns as attributes of the DataFrame.
df5.select(
    df5.order_id,
    df5.order_status,
).show(n=2)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
+--------+---------------+
only showing top 2 rows



In [37]:
# Option 2: pass the column names as plain strings.
df5.select(
    "order_id",
    "order_status",
).show(n=2)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
+--------+---------------+
only showing top 2 rows



In [42]:
from pyspark.sql.functions import *

In [44]:
# Option 3: build Column objects with col() from pyspark.sql.functions.
df5.select(
    col("order_id"),
    col("order_status"),
).show(n=2)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
+--------+---------------+
only showing top 2 rows



In [45]:
# Stop the Spark session now that the DataFrame examples above are done.
spark.stop()

In [49]:
# Peek at the start of the raw file via the Hadoop CLI. NOTE(review): order_date
# here looks like MM/dd/yy (e.g. 07/25/13) rather than a full timestamp — confirm
# the format before reading it with a timestamp/date schema.
! hadoop fs -head  /user/tkm/data/order_wd.csv

order_id,order_date,customer_id,order_status
1,07/25/13,11599,CLOSED
2,07/25/13,256,PENDING_PAYMENT
3,07/25/13,12111,COMPLETE
4,07/25/13,8827,CLOSED
5,07/25/13,11318,COMPLETE
6,07/25/13,7130,COMPLETE
7,07/25/13,4530,COMPLETE
8,07/25/13,2911,PROCESSING
9,07/25/13,5657,PENDING_PAYMENT
10,07/25/13,5648,PENDING_PAYMENT
11,07/25/13,918,PAYMENT_REVIEW
12,07/25/13,1837,CLOSED
13,07/25/13,9149,PENDING_PAYMENT
14,07/25/13,9842,PROCESSING
15,07/25/13,2568,COMPLETE
16,07/25/13,7276,PENDING_PAYMENT
17,07/25/13,2667,COMPLETE
18,07/25/13,1205,CLOSED
19,07/25/13,9488,PENDING_PAYMENT
20,07/25/13,9198,PROCESSING
21,07/25/13,2711,PENDING
22,07/25/13,333,COMPLETE
23,07/25/13,4367,PENDING_PAYMENT
24,07/25/13,11441,CLOSED
25,07/25/13,9503,CLOSED
26,07/25/13,7562,COMPLETE
27,07/25/13,3241,PENDING_PAYMENT
28,07/25/13,656,COMPLETE
29,07/25/13,196,PROCESSING
30,07/25/13,10039,PENDING_PAYMENT
31,07/25/13,6983,PAYMENT_REVIEW
32,07/25/13,3960,COMPLETE
33,07/25/13,5793,PENDING_PAYMENT
34,07/25/13,4189,PR
