**Comprehensive Guide to Creating DataFrames in PySpark:**

In PySpark, there are several ways to create a DataFrame. Below are the most common methods:

-   spark.read
    -  From CSV Files   (using spark.read.csv())
    -  From JSON Files  (using spark.read.json())
    -  From Parquet Files   (using spark.read.parquet())
    -  From ORC Files   (using spark.read.orc())

- spark.sql()

-   spark.table("db_schema.tbl_name")

-   spark.createDataFrame()

-   spark.range()

-   Spark RDD
    -   rdd.toDF(list of column names)
    -   rdd.toDF(schema)

In [10]:
# findspark.init() must actually be *called* (the original only referenced the
# function object, which is a no-op) and it must run before pyspark is imported,
# because it is what makes the local Spark installation importable.
import findspark
findspark.init()
import getpass
from pyspark.sql import SparkSession

# The warehouse directory lives under the current user's Documents folder.
username = getpass.getuser()

# Build (or reuse) a local, Hive-enabled SparkSession.
spark = (
    SparkSession.builder
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", f"/Users/{username}/Documents/data/warehouse")
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/27 21:19:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# List the databases registered in the catalog.
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  default|
|   retail|
+---------+



In [12]:
# List the tables and temporary views currently visible to the session.
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   orders|       true|
+---------+---------+-----------+



In [14]:
# Preview at most 10 rows of `orders` whose order_status is CLOSED.
spark.sql("select * from orders where order_status = 'CLOSED' limit 10").show()

+--------+-------------------+-----------+------------+
|order_id|         order_date|customer_id|order_status|
+--------+-------------------+-----------+------------+
|       1|2013-07-27 00:00:00|      30265|      CLOSED|
|       2|2013-11-25 00:00:00|      20386|      CLOSED|
|       6|2014-07-20 00:00:00|      49340|      CLOSED|
|       9|2014-01-07 00:00:00|      26329|      CLOSED|
|      15|2014-06-10 00:00:00|       4869|      CLOSED|
|      19|2013-09-10 00:00:00|      46034|      CLOSED|
|      26|2014-07-19 00:00:00|      43824|      CLOSED|
|      27|2014-02-20 00:00:00|      11361|      CLOSED|
|      29|2013-08-22 00:00:00|      32688|      CLOSED|
|      30|2013-08-29 00:00:00|      17044|      CLOSED|
+--------+-------------------+-----------+------------+



In [15]:
# spark.sql() returns a DataFrame, so the query result can be kept in a variable
# instead of being displayed immediately.
new_df = spark.sql(
    sqlQuery="select * from orders where order_status = 'CLOSED' limit 10"
)

In [16]:
# Print the rows of the DataFrame returned by spark.sql().
new_df.show()

+--------+-------------------+-----------+------------+
|order_id|         order_date|customer_id|order_status|
+--------+-------------------+-----------+------------+
|       1|2013-07-27 00:00:00|      30265|      CLOSED|
|       2|2013-11-25 00:00:00|      20386|      CLOSED|
|       6|2014-07-20 00:00:00|      49340|      CLOSED|
|       9|2014-01-07 00:00:00|      26329|      CLOSED|
|      15|2014-06-10 00:00:00|       4869|      CLOSED|
|      19|2013-09-10 00:00:00|      46034|      CLOSED|
|      26|2014-07-19 00:00:00|      43824|      CLOSED|
|      27|2014-02-20 00:00:00|      11361|      CLOSED|
|      29|2013-08-22 00:00:00|      32688|      CLOSED|
|      30|2013-08-29 00:00:00|      17044|      CLOSED|
+--------+-------------------+-----------+------------+



In [20]:
# List the visible tables and temporary views again.
spark.sql("show tables").show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  default|orders_managed|      false|
|         |        orders|       true|
+---------+--------------+-----------+



In [24]:
# Load a catalog table as a DataFrame using its fully qualified name.
new_df = spark.table(tableName="spark_catalog.default.orders_managed")

In [25]:
# Show only the first 5 rows.
new_df.show(5)

+--------+-------------------+-----------+------------+
|order_id|         order_date|customer_id|order_status|
+--------+-------------------+-----------+------------+
|       1|2013-07-27 00:00:00|      30265|      CLOSED|
|       2|2013-11-25 00:00:00|      20386|      CLOSED|
|       3|2014-01-21 00:00:00|      15768|    COMPLETE|
|       4|2014-07-04 00:00:00|      27181|  PROCESSING|
|       5|2014-03-08 00:00:00|      12448|    COMPLETE|
+--------+-------------------+-----------+------------+
only showing top 5 rows



In [26]:
# In-memory sample data: one tuple per order, laid out as
# (order id, order date as text, customer id, order status).
orders_list = [
    (1, '2013-07-27 00:00:00.0', 30265, 'CLOSED'),
    (2, '2013-11-25 00:00:00.0', 20386, 'CLOSED'),
    (3, '2014-01-21 00:00:00.0', 15768, 'COMPLETE'),
]

In [27]:
# Build a DataFrame from the list of tuples without supplying any schema.
new_df = spark.createDataFrame(data=orders_list)

In [28]:
# No column names were supplied, so the output shows the default names _1 ... _4.
new_df.show()

+---+--------------------+-----+--------+
| _1|                  _2|   _3|      _4|
+---+--------------------+-----+--------+
|  1|2013-07-27 00:00:...|30265|  CLOSED|
|  2|2013-11-25 00:00:...|20386|  CLOSED|
|  3|2014-01-21 00:00:...|15768|COMPLETE|
+---+--------------------+-----+--------+



In [29]:
# Inspect the inferred types: the int fields come back as long, the str fields as string.
new_df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)
 |-- _4: string (nullable = true)



In [30]:
# Create the DataFrame first, then rename all four columns in one go with toDF().
new_df = (
    spark.createDataFrame(data=orders_list)
    .toDF('order_id', 'order_date', 'customer_id', 'order_status')
)

In [31]:
# The columns now carry the names passed to toDF().
new_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|       1|2013-07-27 00:00:...|      30265|      CLOSED|
|       2|2013-11-25 00:00:...|      20386|      CLOSED|
|       3|2014-01-21 00:00:...|      15768|    COMPLETE|
+--------+--------------------+-----------+------------+



In [32]:
# Renaming the columns did not change the inferred types (still long / string).
new_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [52]:
# Schema given as a plain list of column names (no types).
orders_schema = [
    "order_id",
    "order_date",
    "customer_id",
    "order_status",
]

In [53]:
# Pass the list of column names as the schema when creating the DataFrame.
new_df = spark.createDataFrame(data=orders_list, schema=orders_schema)

In [54]:
# The columns are named after the entries of the orders_schema list.
new_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|       1|2013-07-27 00:00:...|      30265|      CLOSED|
|       2|2013-11-25 00:00:...|      20386|      CLOSED|
|       3|2014-01-21 00:00:...|      15768|    COMPLETE|
+--------+--------------------+-----------+------------+



In [55]:
# The schema list only carried names, so the types are still inferred (long / string).
new_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [35]:
# Schema given as a DDL-style string: "<column name> <type>" pairs separated by commas.
orders_schema = (
    'orderid integer, '
    'orderdate string, '
    'customerid integer, '
    'orderstatus string'
)

In [36]:
# Create the DataFrame with both column names and types taken from the DDL string.
new_df = spark.createDataFrame(data=orders_list, schema=orders_schema)

In [37]:
# Column names now come from the DDL schema string.
new_df.show()

+-------+--------------------+----------+-----------+
|orderid|           orderdate|customerid|orderstatus|
+-------+--------------------+----------+-----------+
|      1|2013-07-27 00:00:...|     30265|     CLOSED|
|      2|2013-11-25 00:00:...|     20386|     CLOSED|
|      3|2014-01-21 00:00:...|     15768|   COMPLETE|
+-------+--------------------+----------+-----------+



In [38]:
# The DDL string also fixed the types: integer for the two ids, string for the rest.
new_df.printSchema()

root
 |-- orderid: integer (nullable = true)
 |-- orderdate: string (nullable = true)
 |-- customerid: integer (nullable = true)
 |-- orderstatus: string (nullable = true)



In [41]:
from pyspark.sql.functions import to_timestamp

In [43]:
# Replace the string column "orderdate" with its to_timestamp() conversion,
# keeping the same column name.
df = new_df.withColumn(
    colName="orderdate",
    col=to_timestamp('orderdate'),
)

In [44]:
# orderdate now prints as a full timestamp value.
df.show()

+-------+-------------------+----------+-----------+
|orderid|          orderdate|customerid|orderstatus|
+-------+-------------------+----------+-----------+
|      1|2013-07-27 00:00:00|     30265|     CLOSED|
|      2|2013-11-25 00:00:00|     20386|     CLOSED|
|      3|2014-01-21 00:00:00|     15768|   COMPLETE|
+-------+-------------------+----------+-----------+



In [45]:
# Confirm that orderdate changed from string to timestamp.
df.printSchema()

root
 |-- orderid: integer (nullable = true)
 |-- orderdate: timestamp (nullable = true)
 |-- customerid: integer (nullable = true)
 |-- orderstatus: string (nullable = true)



In [47]:
# Single argument: ids 0 through 4 (the value 5 itself is excluded).
spark.range(5).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [48]:
# Explicit start and end: ids 0 through 7.
spark.range(0,8).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
+---+



In [50]:
# Third argument is the step: ids 0, 2, 4, 6.
spark.range(0,8,2).show()

+---+
| id|
+---+
|  0|
|  2|
|  4|
|  6|
+---+



In [51]:
# Step of 3: ids 0, 3, 6.
spark.range(0,8,3).show()

+---+
| id|
+---+
|  0|
|  3|
|  6|
+---+



In [11]:
# Read orders.csv as an RDD of raw text lines. The path is built from
# `username` (set in the session-setup cell), the same way the warehouse
# directory is, rather than hard-coding one particular user's home directory.
orders_rdd = spark.sparkContext.textFile(f"/Users/{username}/Documents/data/orders.csv")

In [57]:
# Peek at the first 5 raw lines; each element is a single comma-separated string.
orders_rdd.take(5)

['1,2013-07-27 00:00:00.0,30265,CLOSED',
 '2,2013-11-25 00:00:00.0,20386,CLOSED',
 '3,2014-01-21 00:00:00.0,15768,COMPLETE',
 '4,2014-07-04 00:00:00.0,27181,PROCESSING',
 '5,2014-03-08 00:00:00.0,12448,COMPLETE']

In [12]:
def _parse_order_line(line):
    """Turn one comma-separated line of the orders file into a typed tuple.

    The line is split once (the original lambda re-split it for every field).

    Args:
        line: A text line such as '1,2013-07-27 00:00:00.0,30265,CLOSED'.

    Returns:
        A 4-tuple: field 0 as int, field 1 as str, field 2 as int, field 3 as str.

    Raises:
        IndexError: If the line has fewer than four comma-separated fields.
        ValueError: If field 0 or field 2 is not a valid integer.
    """
    fields = line.split(",")
    return (int(fields[0]), fields[1], int(fields[2]), fields[3])


# Convert every raw text line into a typed tuple.
new_orders_rdd = orders_rdd.map(_parse_order_line)

In [13]:
# Each element is now a tuple of (int, str, int, str).
new_orders_rdd.take(5)

                                                                                

[(1, '2013-07-27 00:00:00.0', 30265, 'CLOSED'),
 (2, '2013-11-25 00:00:00.0', 20386, 'CLOSED'),
 (3, '2014-01-21 00:00:00.0', 15768, 'COMPLETE'),
 (4, '2014-07-04 00:00:00.0', 27181, 'PROCESSING'),
 (5, '2014-03-08 00:00:00.0', 12448, 'COMPLETE')]

In [60]:
# DDL-style schema for the RDD-based DataFrame; note that order_id is declared
# integer while customer_id is declared long.
orders_schema = (
    'order_id integer, '
    'order_date string, '
    'customer_id long, '
    'order_status string'
)

In [61]:
# createDataFrame() also accepts an RDD of tuples as its data source.
df = spark.createDataFrame(data=new_orders_rdd, schema=orders_schema)

In [64]:
# Show the first 5 rows of the DataFrame built from the RDD.
df.show(5)

[Stage 20:>                                                         (0 + 1) / 1]

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|       1|2013-07-27 00:00:...|      30265|      CLOSED|
|       2|2013-11-25 00:00:...|      20386|      CLOSED|
|       3|2014-01-21 00:00:...|      15768|    COMPLETE|
|       4|2014-07-04 00:00:...|      27181|  PROCESSING|
|       5|2014-03-08 00:00:...|      12448|    COMPLETE|
+--------+--------------------+-----------+------------+
only showing top 5 rows



24/12/27 21:00:01 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 20 (TID 20): Attempting to kill Python Worker
                                                                                

In [63]:
# Types follow the DDL string: order_id is integer, customer_id is long.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [12]:
# Convert the RDD directly with toDF(), supplying only the column names.
df = new_orders_rdd.toDF(
    schema=['order_id', 'order_date', 'customer_id', 'order_status']
)

In [13]:
# Show the first 5 rows; the column names come from the list passed to toDF().
df.show(5)

24/12/27 21:11:07 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 3): Attempting to kill Python Worker
                                                                                

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|       1|2013-07-27 00:00:...|      30265|      CLOSED|
|       2|2013-11-25 00:00:...|      20386|      CLOSED|
|       3|2014-01-21 00:00:...|      15768|    COMPLETE|
|       4|2014-07-04 00:00:...|      27181|  PROCESSING|
|       5|2014-03-08 00:00:...|      12448|    COMPLETE|
+--------+--------------------+-----------+------------+
only showing top 5 rows



In [14]:
# Only names were given, so the types are inferred: both id columns are long.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [14]:
from pyspark.sql.types import *

In [15]:
# Programmatic schema: one StructField (name + data type) per column.
# NOTE(review): the last field is named "orders_status", whereas the other
# schemas in this notebook use "order_status" -- confirm whether this is intended.
orders_schema = StructType(
    fields=[
        StructField(name="order_id", dataType=LongType()),
        StructField(name="order_date", dataType=StringType()),
        StructField(name="customer_id", dataType=LongType()),
        StructField(name="orders_status", dataType=StringType()),
    ]
)

In [18]:
# toDF() accepts a StructType as well, which fixes both names and types.
df = new_orders_rdd.toDF(schema=orders_schema)

In [19]:
# Show the first 5 rows of the DataFrame built with the StructType schema.
df.show(5)

24/12/27 21:21:08 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 2 (TID 2): Attempting to kill Python Worker
                                                                                

+--------+--------------------+-----------+-------------+
|order_id|          order_date|customer_id|orders_status|
+--------+--------------------+-----------+-------------+
|       1|2013-07-27 00:00:...|      30265|       CLOSED|
|       2|2013-11-25 00:00:...|      20386|       CLOSED|
|       3|2014-01-21 00:00:...|      15768|     COMPLETE|
|       4|2014-07-04 00:00:...|      27181|   PROCESSING|
|       5|2014-03-08 00:00:...|      12448|     COMPLETE|
+--------+--------------------+-----------+-------------+
only showing top 5 rows



In [20]:
# Column names and types match the StructType definition.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- orders_status: string (nullable = true)

