#### **PySpark DataFrame**
- PySpark DataFrame
- Reading the Dataset
- Checking the Datatypes of the Columns (Schema)
- Using the `describe()` Option (similar to pandas)
- Adding Columns
- Dropping Columns
- Renaming Columns

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build the Spark session used by every cell below (reuses an existing one if present)
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [9]:
## read the dataset
# header='true' takes the column names from the first line of the file.
# inferSchema=True makes Spark infer each column's type from the data;
# with inferSchema=False every column is read as a string.
spdf = (
    spark.read
    .option('header', 'true')
    .csv('people.csv', inferSchema=True)
)

In [10]:
## print the rows of the DataFrame as a text table
spdf.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Peter| 45|        15|
|Clevelend| 42|         3|
| Quagmire| 61|        40|
|      Joe| 40|         5|
+---------+---+----------+



In [11]:
## check the schema: prints each column's name, type and nullability
spdf.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [14]:
## same read as above in a single call:
## header and inferSchema are passed as keyword arguments to csv()
df = spark.read.csv(
    'people.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Peter| 45|        15|
|Clevelend| 42|         3|
| Quagmire| 61|        40|
|      Joe| 40|         5|
+---------+---+----------+



In [15]:
## check columns: the column names as a plain Python list of strings
df.columns

['Name', 'Age', 'Experience']

In [18]:
## read the first n records
df.head(3) # returns a list of Row objects (not a DataFrame)

[Row(Name='Peter', Age=45, Experience=15),
 Row(Name='Clevelend', Age=42, Experience=3),
 Row(Name='Quagmire', Age=61, Experience=40)]

In [20]:
## pick a single column: select() gives a DataFrame, so it can be displayed with show()
df.select('Name').show()

+---------+
|     Name|
+---------+
|    Peter|
|Clevelend|
| Quagmire|
|      Joe|
+---------+



In [21]:
## pick several columns at once (select accepts the names as separate arguments or as a list)
df.select('Name', 'Age').show()

+---------+---+
|     Name|Age|
+---------+---+
|    Peter| 45|
|Clevelend| 42|
| Quagmire| 61|
|      Joe| 40|
+---------+---+



In [22]:
## check datatypes: a list of (column name, type name) tuples
df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [25]:
## summary statistics (count, mean, stddev, min, max) for each column;
## describe() returns a DataFrame, so show() is needed to display it
df.describe().show()

+-------+---------+-----------------+-----------------+
|summary|     Name|              Age|       Experience|
+-------+---------+-----------------+-----------------+
|  count|        4|                4|                4|
|   mean|     NULL|             47.0|            15.75|
| stddev|     NULL|9.556847457887635|16.99754884289693|
|    min|Clevelend|               40|                3|
|    max| Quagmire|               61|               40|
+-------+---------+-----------------+-----------------+



In [30]:
## adding a column to a DataFrame
# withColumn returns a new DataFrame with the extra column; the result is kept in ndf
ndf = df.withColumn(
    'Experience after 2 years',
    df['Experience'] + 2,
)

In [32]:
## display the DataFrame that includes the newly added column
ndf.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience after 2 years|
+---------+---+----------+------------------------+
|    Peter| 45|        15|                      17|
|Clevelend| 42|         3|                       5|
| Quagmire| 61|        40|                      42|
|      Joe| 40|         5|                       7|
+---------+---+----------+------------------------+



In [36]:
## drop a column: drop() is called on ndf and its result is stored in dndf
dndf = ndf.drop('Experience after 2 years')
dndf.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Peter| 45|        15|
|Clevelend| 42|         3|
| Quagmire| 61|        40|
|      Joe| 40|         5|
+---------+---+----------+



In [38]:
## rename a column
# the renamed DataFrame is only displayed here; it is not assigned to a variable
df.withColumnRenamed('Age', 'New Age').show()

+---------+-------+----------+
|     Name|New Age|Experience|
+---------+-------+----------+
|    Peter|     45|        15|
|Clevelend|     42|         3|
| Quagmire|     61|        40|
|      Joe|     40|         5|
+---------+-------+----------+

