In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Build (or reuse) the SparkSession used by every cell in this notebook.
# The warehouse directory is written as a raw string so the Windows backslashes
# are kept literally instead of being read as (invalid) escape sequences.
spark = (
    SparkSession.builder
    .appName("7 cach tao data frame")
    .config("spark.sql.warehouse.dir", r"C:\data\Thuc_hanh")
    .getOrCreate()
)

In [3]:
# Explicit schema for the orders CSV, listed as (column name, Spark type) pairs
# in column order. Each field uses StructField's default nullability.
# NOTE(review): "Postal Code" as LongType would drop any leading zeros —
# confirm the data has none, otherwise a string column would be safer.
orders_schema_struct = StructType([
    StructField(column_name, spark_type())
    for column_name, spark_type in [
        ("Row ID", LongType),
        ("Order ID", StringType),
        ("Order Date", DateType),
        ("Ship Date", DateType),
        ("Ship Mode", StringType),
        ("Customer ID", StringType),
        ("Customer Name", StringType),
        ("Segment", StringType),
        ("Country", StringType),
        ("City", StringType),
        ("State", StringType),
        ("Postal Code", LongType),
        ("Region", StringType),
        ("Product ID", StringType),
        ("Category", StringType),
        ("Sub-Category", StringType),
        ("Product Name", StringType),
        ("Sales", FloatType),
        ("Quantity", LongType),
        ("Discount", FloatType),
        ("Profit", FloatType),
    ]
])

In [4]:
# Read the semicolon-delimited CSV (first row is the header) using the
# explicit schema instead of schema inference.
# NOTE(review): the rows displayed below show NULL for "Order Date",
# "Ship Date" and "Sales" — presumably the file's date / decimal formats do not
# match the reader's defaults; confirm against the file and set e.g. the
# "dateFormat" option accordingly.
df1 = (
    spark.read
    .format("csv")
    .schema(orders_schema_struct)
    .options(delimiter=";", header="true")
    .load("c:/data/Sample_Superstore.csv")
)

In [5]:
# Method 1: the DataFrame is read directly from a file; preview its first 5 rows.
df1.show(n=5)

+------+--------------+----------+---------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+-----+--------+--------+------+
|Row ID|      Order ID|Order Date|Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|Sales|Quantity|Discount|Profit|
+------+--------------+----------+---------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+-----+--------+--------+------+
|     1|CA-2016-152156|      NULL|     NULL|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset Col...| NULL|      

In [8]:
# Method 2: start from an existing table/view and build the DataFrame with a
# SQL query, so the result can be customized through the query text.
df1.createOrReplaceTempView("orders")  # expose df1 to SQL under the name "orders"
df2 = spark.sql("select * from orders")
df2.show(n=5)

+------+--------------+----------+---------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+-----+--------+--------+------+
|Row ID|      Order ID|Order Date|Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|Sales|Quantity|Discount|Profit|
+------+--------------+----------+---------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+-----+--------+--------+------+
|     1|CA-2016-152156|      NULL|     NULL|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset Col...| NULL|      

In [10]:
# Method 3: load an existing table/view by name; less flexible than the SQL
# query used in method 2.
df3 = spark.table("orders")
df3.show(n=5)

+------+--------------+----------+---------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+-----+--------+--------+------+
|Row ID|      Order ID|Order Date|Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|Sales|Quantity|Discount|Profit|
+------+--------------+----------+---------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+-----+--------+--------+------+
|     1|CA-2016-152156|      NULL|     NULL|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset Col...| NULL|      

In [12]:
# Method 4: quickly generate a small DataFrame for testing or for demoing a function.
# spark.range(start, end, step): `end` is exclusive, so this yields ids 0..4.
df4 = spark.range(0, 5, 1)
df4.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [16]:
# Method 5: build a DataFrame from a local list of tuples (column types are
# inferred from the values), then name the columns with toDF.
list_orders = [
    (1, '2013-07-25', 123241, 'CLOSED'),
    (2, '2014-08-11', 243425, 'PENDING_PAYMENT'),
    (3, '2013-07-25', 231132, 'COMPLETE'),
]
df5 = spark.createDataFrame(list_orders).toDF('id_orders', 'date', 'Id', 'status')
df5.printSchema()
df5.show()

root
 |-- id_orders: long (nullable = true)
 |-- date: string (nullable = true)
 |-- Id: long (nullable = true)
 |-- status: string (nullable = true)

+---------+----------+------+---------------+
|id_orders|      date|    Id|         status|
+---------+----------+------+---------------+
|        1|2013-07-25|123241|         CLOSED|
|        2|2014-08-11|243425|PENDING_PAYMENT|
|        3|2013-07-25|231132|       COMPLETE|
+---------+----------+------+---------------+



In [20]:
# Method 6: build the DataFrame from list_orders with the column names and
# types given up front as a DDL-style schema string.
orders_schema = "id_orders long, date string, Id long, status string"
df6 = spark.createDataFrame(data=list_orders, schema=orders_schema)
df6.printSchema()
df6.show()

root
 |-- id_orders: long (nullable = true)
 |-- date: string (nullable = true)
 |-- Id: long (nullable = true)
 |-- status: string (nullable = true)

+---------+----------+------+---------------+
|id_orders|      date|    Id|         status|
+---------+----------+------+---------------+
|        1|2013-07-25|123241|         CLOSED|
|        2|2014-08-11|243425|PENDING_PAYMENT|
|        3|2013-07-25|231132|       COMPLETE|
+---------+----------+------+---------------+



In [33]:
# Method 7: build a DataFrame from an RDD.
def _parse_order_line(line):
    """Turn one ';'-separated text line into an (id_orders, date, Id, status) tuple.

    The line is split only once; fields 0 and 2 are converted to int, fields 1
    and 3 are kept as strings.

    Raises:
        ValueError: if field 0 or 2 is not an integer literal.
        IndexError: if the line has fewer than four fields.
    """
    fields = line.split(";")
    return int(fields[0]), fields[1], int(fields[2]), fields[3]


orders_rdd = spark.sparkContext.textFile("C:/data/orders.csv")
# Skip blank lines (e.g. a trailing newline) so they do not fail the int() casts.
rdd = orders_rdd.filter(lambda line: line.strip()).map(_parse_order_line)
orders_schema = "id_orders long, date string, Id long, status string"
# Last expression of the cell: display the first three parsed records.
rdd.take(3)


[(1, '25/07/2013', 123241, 'CLOSED'),
 (2, '11/08/2014', 243425, 'PENDING_PAYMENT'),
 (3, '25/07/2013', 231132, 'COMPLETE')]

In [35]:
# Apply the DDL schema string to the parsed RDD to obtain the DataFrame.
df7 = spark.createDataFrame(data=rdd, schema=orders_schema)
df7.show()

+---------+----------+------+---------------+
|id_orders|      date|    Id|         status|
+---------+----------+------+---------------+
|        1|25/07/2013|123241|         CLOSED|
|        2|11/08/2014|243425|PENDING_PAYMENT|
|        3|25/07/2013|231132|       COMPLETE|
+---------+----------+------+---------------+

