# PySpark Basic DataFrame

### Session creation

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkSession if one already exists; otherwise start a new
# one registered under the application name 'Basics_pyspark_df'.
spark = SparkSession.builder.appName('Basics_pyspark_df').getOrCreate()

### DataFrame creation

In [3]:
# header=True tells Spark to use the first line of the CSV as the column names
# instead of treating it as a data row.
df = spark.read.csv('data/student_data.csv', header=True)

In [4]:
# Without inferSchema every column is read as a string, so the numeric
# columns (Roll_No, Marks, Class) do not have the correct dtypes yet.

df.printSchema()     # print each column's name, dtype and nullability

root
 |-- Name: string (nullable = true)
 |-- Roll_No: string (nullable = true)
 |-- Marks: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Subject: string (nullable = true)
 |-- Section: string (nullable = true)



In [5]:
# inferSchema
# This option defaults to False. When it is set to True, Spark infers each
# column's type from the data instead of reading everything as a string.

df = spark.read.csv('data/student_data.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Roll_No: integer (nullable = true)
 |-- Marks: integer (nullable = true)
 |-- Class: integer (nullable = true)
 |-- Subject: string (nullable = true)
 |-- Section: string (nullable = true)



In [6]:
# Print the DataFrame as a text table (show() displays at most 20 rows by default).
df.show()

+-------+-------+-----+-----+--------+-------+
|   Name|Roll_No|Marks|Class| Subject|Section|
+-------+-------+-----+-----+--------+-------+
|  Avind|      1|   92|    9|   Maths|      A|
| Aditya|      2|   87|    9|   Maths|      A|
|   John|      3|   23|    9|   Maths|      B|
|   Mary|      4|   45|    9|   Maths|      B|
|Nicolas|      5|   67|    9| English|      A|
|  Jonny|      6|  100|    9| English|      A|
|    Tom|      7|   55|    9| English|      B|
|   Yash|      4|   32|    9| English|      B|
|Pushkar|      8|   30|    9| Science|      B|
|  Parth|      9|   76|    9| Science|      A|
| Piyush|     10|   86|    9| Science|      A|
| Zodiac|     11|   65|    9|Computer|      B|
+-------+-------+-----+-----+--------+-------+



In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come out as null for the string columns.
df.describe().show()

+-------+------+-----------------+------------------+-----+--------+-------+
|summary|  Name|          Roll_No|             Marks|Class| Subject|Section|
+-------+------+-----------------+------------------+-----+--------+-------+
|  count|    12|               12|                12|   12|      12|     12|
|   mean|  null|5.833333333333333|63.166666666666664|  9.0|    null|   null|
| stddev|  null|3.214550253664318|  26.1840802894824|  0.0|    null|   null|
|    min|Aditya|                1|                23|    9|Computer|      A|
|    max|Zodiac|               11|               100|    9| Science|      B|
+-------+------+-----------------+------------------+-----+--------+-------+



In [8]:
df.columns        # Python list of the DataFrame's column names

['Name', 'Roll_No', 'Marks', 'Class', 'Subject', 'Section']

## Creating a custom schema for the data

In [9]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [10]:
import numpy as np

# Sample rows laid out as
# (firstname, middlename, lastname, points, gender, salary).
# Empty strings stand in for missing name parts and np.nan marks a
# missing points value.
data = [
    ("John", "", "Don", 36636.334, "M", 3000),
    ("Mahesh", "Raj", "", 40288.101, "M", 4000),
    ("Robert", "Dorney", "Junior", 42114.99, "M", 4000),
    ("May", "", "Jones", 3919.00123, "F", 4000),
    ("Nick", "Mary", "Brown", np.nan, "F", -1000),
]

# Explicit schema: one nullable StructField per column, in the same order as
# the tuple positions above. FloatType is single precision, which is why
# 36636.334 is later displayed as 36636.336.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("firstname", StringType()),
        ("middlename", StringType()),
        ("lastname", StringType()),
        ("points", FloatType()),
        ("gender", StringType()),
        ("salary", IntegerType()),
    ]
])

In [11]:
# Build a DataFrame from the in-memory rows using the explicit schema, then
# print the schema and the rows without truncating long cell values.
df1 = spark.createDataFrame(data=data,schema=schema)
df1.printSchema()
df1.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- points: float (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+---------+------+------+
|firstname|middlename|lastname|points   |gender|salary|
+---------+----------+--------+---------+------+------+
|John     |          |Don     |36636.336|M     |3000  |
|Mahesh   |Raj       |        |40288.1  |M     |4000  |
|Robert   |Dorney    |Junior  |42114.99 |M     |4000  |
|May      |          |Jones   |3919.0012|F     |4000  |
|Nick     |Mary      |Brown   |NaN      |F     |-1000 |
+---------+----------+--------+---------+------+------+



### Nested columns

In [12]:
# Each record groups the three name parts into an inner tuple so that they
# can be loaded into a single struct column.
structureData = [
    (("John", "", "Don"), 36636.334, "M", 3000),
    (("Mahesh", "Raj", ""), 40288.101, "M", 4000),
    (("Robert", "Dorney", "Junior"), 42114.99, "M", 4000),
    (("May", "", "Jones"), 3919.00123, "F", 4000),
    (("Nick", "Mary", "Brown"), np.nan, "F", -1000),
]

# A StructType used as a field's data type yields a nested column: 'name'
# holds firstname / middlename / lastname as sub-fields.
# NOTE(review): the second field is named 'id' and typed as a string, yet it
# receives the float values above (including np.nan), which are then shown as
# text such as '36636.334' and 'NaN'. Confirm whether a float 'points' column
# was intended here.
structureSchema = StructType([
    StructField('name', StructType([
        StructField(part, StringType(), True)
        for part in ('firstname', 'middlename', 'lastname')
    ])),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

df2 = spark.createDataFrame(structureData, structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+------------------------+----------+------+------+
|name                    |id        |gender|salary|
+------------------------+----------+------+------+
|{John, , Don}           |36636.334 |M     |3000  |
|{Mahesh, Raj, }         |40288.101 |M     |4000  |
|{Robert, Dorney, Junior}|42114.99  |M     |4000  |
|{May, , Jones}          |3919.00123|F     |4000  |
|{Nick, Mary, Brown}     |NaN       |F     |-1000 |
+------------------------+----------+------+------+

