# What is a DataFrame?

API documentation:

- `pyspark.sql.DataFrame` class reference: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
- DataFrame section of the `pyspark.sql` API reference: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/dataframe.html

In [1]:
from pyspark.sql import SparkSession

# Build the SparkSession that every later cell in this notebook uses.
# The chain is wrapped in parentheses so no backslash continuations are needed.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [2]:
# Display the session object created above as this cell's output.
spark

In [None]:
# Show the default parallelism reported by this session's SparkContext
# (the session was built with master "local[*]").
spark.sparkContext.defaultParallelism

In [3]:
# Sample rows: one list per customer, in the same column order as the schema
# below (id, name, date of birth, salary).
# Dates of birth are kept as day-first "DD-MM-YYYY" text, not as date values.
customer_data = [
    ["C1", "Pratap", "16-12-1979", 10000],
    ["C2", "Sruthi", "08-01-1984", 20000],
    ["C3", "Kiyanshita", "24-08-2011", 30000],
    ["C4", "Nirupama", "01-11-2022", 40000],
]

# Schema string, written one column per line; the adjacent literals are
# concatenated by Python into a single comma-separated string.
customer_schema = (
    "CustomerID string, "
    "CustomerName String, "
    "CustomerDoB String, "
    "CustomerSalary Integer"
)

In [4]:
# Create the DataFrame from the in-memory rows, using the schema string
# to name and type the four columns.
df = spark.createDataFrame(
    schema=customer_schema,
    data=customer_data,
)

In [17]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerDoB: string (nullable = true)
 |-- CustomerSalary: integer (nullable = true)



In [5]:
# Print the DataFrame's rows as a text table.
df.show()

+----------+------------+-----------+--------------+
|CustomerID|CustomerName|CustomerDoB|CustomerSalary|
+----------+------------+-----------+--------------+
|        C1|      Pratap| 16-12-1979|         10000|
|        C2|      Sruthi| 08-01-1984|         20000|
|        C3|  Kiyanshita| 24-08-2011|         30000|
|        C4|    Nirupama| 01-11-2022|         40000|
+----------+------------+-----------+--------------+



In [6]:
# first() returns the first row as a single Row object.
df.first()

Row(CustomerID='C1', CustomerName='Pratap', CustomerDoB='16-12-1979', CustomerSalary=10000)

In [8]:
# Keep the first Row in a variable so its fields can be inspected below.
row = df.first()

In [9]:
# Display the stored Row.
row

Row(CustomerID='C1', CustomerName='Pratap', CustomerDoB='16-12-1979', CustomerSalary=10000)

In [10]:
# A Row field can be looked up by its column name.
row["CustomerID"]

'C1'

In [11]:
# Called without an argument, head() returns one Row (not a list).
df.head()

Row(CustomerID='C1', CustomerName='Pratap', CustomerDoB='16-12-1979', CustomerSalary=10000)

In [12]:
# Called with a count, head(n) returns a Python list of the first n Rows.
df.head(3)

[Row(CustomerID='C1', CustomerName='Pratap', CustomerDoB='16-12-1979', CustomerSalary=10000),
 Row(CustomerID='C2', CustomerName='Sruthi', CustomerDoB='08-01-1984', CustomerSalary=20000),
 Row(CustomerID='C3', CustomerName='Kiyanshita', CustomerDoB='24-08-2011', CustomerSalary=30000)]

In [14]:
# head(3) is a list, so index 1 selects the second Row; the column name
# then selects one field from that Row.
df.head(3)[1]["CustomerID"]

'C2'

In [22]:
# Same lookup as the key-based form row["CustomerID"], but using attribute
# access: Row fields can also be read as attributes, giving the same value.
df.head(3)[1].CustomerID

'C2'

In [29]:
# Build the predicate first, then filter and take the first matching Row.
# The comparison is strict, so the 30000 salary is excluded and only the
# 40000 row (C4) in the sample data matches.
salary_above_30k = df["CustomerSalary"] > 30000
df.filter(salary_above_30k).head()

Row(CustomerID='C4', CustomerName='Nirupama', CustomerDoB='01-11-2022', CustomerSalary=40000)