# What is a DataFrame?

API documentation:

- `pyspark.sql.DataFrame` class: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
- DataFrame API reference: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/dataframe.html

In [1]:
from pyspark.sql import SparkSession

# Build a SparkSession for this notebook, or reuse one that already exists.
# The master URL "local[*]" runs Spark locally instead of on a cluster.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Create Dataframe")
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [None]:
# Show the default parallelism setting of the session's underlying SparkContext.
spark.sparkContext.defaultParallelism

In [3]:
# Sample customer rows. Column order: ID, name, date of birth, salary.
# Dates of birth are kept as plain dd-mm-yyyy strings; salaries are ints.
customer_data = [
    ["C1", "Pratap", "16-12-1979", 10000],
    ["C2", "Sruthi", "08-01-1984", 20000],
    ["C3", "Kiyanshita", "24-08-2011", 30000],
    ["C4", "Nirupama", "01-11-2022", 40000],
]

# Schema written as a DDL-style string: comma-separated "<column name> <type>" pairs,
# listed in the same order as the values in each row above.
customer_schema = (
    "CustomerID string, CustomerName String, CustomerDoB String, CustomerSalary Integer"
)

In [11]:
# Create a DataFrame from the in-memory rows, applying whatever customer_schema holds.
# In top-to-bottom order that is the DDL schema string defined in the previous cell.
# NOTE(review): the execution count on this cell is higher than on the later StructType
# cells, so it may last have run with the StructType schema - re-run top to bottom to confirm.
df = spark.createDataFrame(data=customer_data, schema=customer_schema)

In [12]:
# Inspect the resulting schema; it is returned as a StructType of StructField objects.
df.schema

StructType([StructField('CustomerID', StringType(), True), StructField('CustomerName', StringType(), True), StructField('CustomerDoB', StringType(), True), StructField('CustomerSalary', IntegerType(), True)])

In [6]:
# Print the DataFrame's rows as a text table.
df.show()

+----------+------------+-----------+--------------+
|CustomerID|CustomerName|CustomerDoB|CustomerSalary|
+----------+------------+-----------+--------------+
|        C1|      Pratap| 16-12-1979|         10000|
|        C2|      Sruthi| 08-01-1984|         20000|
|        C3|  Kiyanshita| 24-08-2011|         30000|
|        C4|    Nirupama| 01-11-2022|         40000|
+----------+------------+-----------+--------------+



### Defining the schema explicitly with StructType and StructField

- StructType: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/api/pyspark.sql.types.StructType.html
- StructField: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/api/pyspark.sql.types.StructField.html
- Data types: https://spark.apache.org/docs/3.5.0/api/python/reference/pyspark.sql/data_types.html

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Same sample rows as in the DDL-string example earlier in the notebook, repeated
# here so this cell can be read on its own. Column order: ID, name, date of birth, salary.
customer_data = [
    ["C1", "Pratap", "16-12-1979", 10000],
    ["C2", "Sruthi", "08-01-1984", 20000],
    ["C3", "Kiyanshita", "24-08-2011", 30000],
    ["C4", "Nirupama", "01-11-2022", 40000],
]

# Explicit schema built from StructType/StructField objects rather than a DDL string.
# One StructField per column, in the same order as the values in each data row;
# every column is declared nullable and only the salary is an integer.
customer_schema = StructType(
    fields=[
        StructField(name="CustomerID", dataType=StringType(), nullable=True),
        StructField(name="CustomerName", dataType=StringType(), nullable=True),
        StructField(name="CustomerDoB", dataType=StringType(), nullable=True),
        StructField(name="CustomerSalary", dataType=IntegerType(), nullable=True),
    ]
)

In [8]:
# Re-create the DataFrame, this time passing the StructType schema built above.
# Note: this rebinds the name `df` that earlier cells also assign.
df = spark.createDataFrame(data=customer_data, schema=customer_schema)

In [9]:
# Print the schema as an indented tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerDoB: string (nullable = true)
 |-- CustomerSalary: integer (nullable = true)



In [10]:
# Print the rows as a text table.
df.show()

+----------+------------+-----------+--------------+
|CustomerID|CustomerName|CustomerDoB|CustomerSalary|
+----------+------------+-----------+--------------+
|        C1|      Pratap| 16-12-1979|         10000|
|        C2|      Sruthi| 08-01-1984|         20000|
|        C3|  Kiyanshita| 24-08-2011|         30000|
|        C4|    Nirupama| 01-11-2022|         40000|
+----------+------------+-----------+--------------+

