# What is a DataFrame

API documentation:

- `pyspark.sql.DataFrame`: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
- DataFrame API reference: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/dataframe.html

In [1]:
from pyspark.sql import SparkSession

# Get (or create) the SparkSession for this notebook: master "local[*]",
# application name "Create Dataframe".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook displays the SparkSession object built above.
spark

In [None]:
# Default level of parallelism the SparkContext uses when an operation does
# not specify one.
# NOTE(review): with master("local[*]") this presumably equals the number of
# local cores — confirm by running the cell (it has no recorded output).
spark.sparkContext.defaultParallelism

In [3]:
# Sample customers. Each row is [id, name, date of birth as dd-mm-yyyy text, salary].
customer_data = [
    ["C1", "Pratap", "16-12-1979", 10000],
    ["C2", "Sruthi", "08-01-1984", 20000],
    ["C3", "Kiyanshita", "24-08-2011", 30000],
    ["C4", "Nirupama", "01-11-2022", 40000],
]

# DDL-style schema string: one "name type" pair per column, in the same order
# as the values inside each row above. Adjacent literals are concatenated into
# a single string.
customer_schema = (
    "CustomerID string, "
    "CustomerName String, "
    "CustomerDoB String, "
    "CustomerSalary Integer"
)

In [4]:
# Build a DataFrame from the in-memory rows, applying the schema string
# (column names and types) defined together with the data.
df = spark.createDataFrame(data=customer_data, schema=customer_schema)

In [6]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerDoB: string (nullable = true)
 |-- CustomerSalary: integer (nullable = true)



In [7]:
# Print the DataFrame's rows as a text table.
df.show()

+----------+------------+-----------+--------------+
|CustomerID|CustomerName|CustomerDoB|CustomerSalary|
+----------+------------+-----------+--------------+
|        C1|      Pratap| 16-12-1979|         10000|
|        C2|      Sruthi| 08-01-1984|         20000|
|        C3|  Kiyanshita| 24-08-2011|         30000|
|        C4|    Nirupama| 01-11-2022|         40000|
+----------+------------+-----------+--------------+



In [11]:
# Positional indexing: df[3] returns the Column at zero-based position 3,
# which is CustomerSalary in this schema. The result is a Column object, not data.
df[3]

Column<'CustomerSalary'>

In [12]:
# Name indexing: look the column up by its name; returns a Column object, not data.
df["CustomerSalary"]

Column<'CustomerSalary'>

In [13]:
# Attribute access: refer to a column as an attribute of the DataFrame.
df.CustomerID

Column<'CustomerID'>

In [16]:
from pyspark.sql.functions import col, expr, column
# col() builds a Column from a name alone; no DataFrame is referenced here.
col("CustomerID")

Column<'CustomerID'>

In [15]:
# expr() takes an expression string and returns a Column; a bare column name
# yields a reference to that column.
expr("CustomerID")

Column<'CustomerID'>

In [17]:
# column() is used here exactly like col(): given a name, it returns a Column.
column("CustomerID")

Column<'CustomerID'>

In [42]:
from pyspark.sql.functions import col, expr, column
# Project two columns: CustomerID as-is, and a derived column computed by the
# SQL expression "CustomerSalary * 1000", aliased to "sal".
df1 = df.select(
    column("CustomerID"),
    expr("CustomerSalary * 1000 as sal"),
)

In [45]:
# Print the physical plan Spark would run for df1 (a Project over the scan).
df1.explain()

== Physical Plan ==
*(1) Project [CustomerID#0, (CustomerSalary#3 * 1000) AS sal#170]
+- *(1) Scan ExistingRDD[CustomerID#0,CustomerName#1,CustomerDoB#2,CustomerSalary#3]




In [44]:
# Access the DataFrame's content as an RDD; the notebook displays the RDD object.
df1.rdd

MapPartitionsRDD[34] at javaToPython at NativeMethodAccessorImpl.java:0

In [15]:
# Variant 1 — col(): arithmetic on a Column produces a new Column expression;
# the result column is auto-named "(CustomerSalary * 100)".
df.select(col("CustomerSalary") * 100).show()

+----------------------+
|(CustomerSalary * 100)|
+----------------------+
|               1000000|
|               2000000|
|               3000000|
|               4000000|
+----------------------+



In [16]:
# Variant 2 — df["name"]: same computation, with the column looked up by name
# on the DataFrame.
df.select(df["CustomerSalary"] * 100).show()

+----------------------+
|(CustomerSalary * 100)|
+----------------------+
|               1000000|
|               2000000|
|               3000000|
|               4000000|
+----------------------+



In [19]:
# Variant 3 — df.name: same computation, with the column accessed as an attribute.
df.select(df.CustomerSalary * 100).show()

+----------------------+
|(CustomerSalary * 100)|
+----------------------+
|               1000000|
|               2000000|
|               3000000|
|               4000000|
+----------------------+



In [20]:
# Variant 4 — df[3]: same computation, with the column picked by zero-based
# position (index 3 is CustomerSalary in this schema).
df.select(df[3] * 100).show()

+----------------------+
|(CustomerSalary * 100)|
+----------------------+
|               1000000|
|               2000000|
|               3000000|
|               4000000|
+----------------------+

