# What is a DataFrame?

API documentation:

- https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
- https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/dataframe.html

In [1]:
from pyspark.sql import SparkSession

# Build the notebook's SparkSession (or reuse one that already exists).
# The "local[*]" master runs Spark in-process on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [3]:
# Default parallelism reported by the underlying SparkContext (8 in the recorded run).
spark.sparkContext.defaultParallelism

8

In [5]:
# Sample customer records, one inner list per customer:
# [customer id, name, date of birth (day-month-year text), salary].
# Every field, including the salary, is a plain string here.
customer_data = [
    ["C1", "Pratap", "16-12-1979", "10000"],
    ["C2", "Sruthi", "08-01-1984", "20000"],
    ["C3", "Kiyanshita", "24-08-2011", "30000"],
    ["C4", "Nirupama", "01-11-2022", "40000"],
]

In [7]:
# No schema is passed, so the columns get positional names (_1 .. _4)
# and are all typed as string, as the show()/printSchema() output confirms.
df = spark.createDataFrame(data=customer_data)

In [8]:
# Print the DataFrame's rows as a text table.
df.show()

+---+----------+----------+-----+
| _1|        _2|        _3|   _4|
+---+----------+----------+-----+
| C1|    Pratap|16-12-1979|10000|
| C2|    Sruthi|08-01-1984|20000|
| C3|Kiyanshita|24-08-2011|30000|
| C4|  Nirupama|01-11-2022|40000|
+---+----------+----------+-----+



In [13]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)



In [15]:
# Same customer records as before: [id, name, date of birth, salary],
# with every field (including the salary) kept as a string.
customer_data = [
    ["C1", "Pratap", "16-12-1979", "10000"],
    ["C2", "Sruthi", "08-01-1984", "20000"],
    ["C3", "Kiyanshita", "24-08-2011", "30000"],
    ["C4", "Nirupama", "01-11-2022", "40000"],
]

# Schema string of "<column name> <type>" pairs, one entry per column,
# joined into the single comma-separated string passed to createDataFrame.
customer_schema = ", ".join([
    "CustomerID string",
    "CustomerName String",
    "CustomerDoB String",
    "CustomerSalary String",
])

In [16]:
# Supplying the schema string gives the columns explicit names and types
# instead of the positional _1 .. _4 defaults.
df = spark.createDataFrame(
    data=customer_data,
    schema=customer_schema,
)

In [17]:
# Print the schema tree; the columns now carry the names given in customer_schema.
df.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerDoB: string (nullable = true)
 |-- CustomerSalary: string (nullable = true)



In [18]:
# Print the rows as a text table, now under the named column headers.
df.show()

+----------+------------+-----------+--------------+
|CustomerID|CustomerName|CustomerDoB|CustomerSalary|
+----------+------------+-----------+--------------+
|        C1|      Pratap| 16-12-1979|         10000|
|        C2|      Sruthi| 08-01-1984|         20000|
|        C3|  Kiyanshita| 24-08-2011|         30000|
|        C4|    Nirupama| 01-11-2022|         40000|
+----------+------------+-----------+--------------+



In [19]:
# Customer records again: [id, name, date of birth, salary].
# This time the salary is a Python int rather than a string, so it
# matches the Integer column declared in the schema below.
customer_data = [
    ["C1", "Pratap", "16-12-1979", 10000],
    ["C2", "Sruthi", "08-01-1984", 20000],
    ["C3", "Kiyanshita", "24-08-2011", 30000],
    ["C4", "Nirupama", "01-11-2022", 40000],
]

# Schema string of "<column name> <type>" pairs; only CustomerSalary
# is declared as an integer, the other columns stay strings.
customer_schema = ", ".join([
    "CustomerID string",
    "CustomerName String",
    "CustomerDoB String",
    "CustomerSalary Integer",
])

In [20]:
# Rebuild the DataFrame from the current customer_data / customer_schema pair,
# replacing the previous df.
df = spark.createDataFrame(
    data=customer_data,
    schema=customer_schema,
)

In [21]:
# Print the schema tree; in the recorded run CustomerSalary is reported as integer.
df.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerDoB: string (nullable = true)
 |-- CustomerSalary: integer (nullable = true)



In [22]:
# Print the rows as a text table.
df.show()

+----------+------------+-----------+--------------+
|CustomerID|CustomerName|CustomerDoB|CustomerSalary|
+----------+------------+-----------+--------------+
|        C1|      Pratap| 16-12-1979|         10000|
|        C2|      Sruthi| 08-01-1984|         20000|
|        C3|  Kiyanshita| 24-08-2011|         30000|
|        C4|    Nirupama| 01-11-2022|         40000|
+----------+------------+-----------+--------------+

