## Import Libraries

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session used by every later cell, or reuse one that
# already exists in this process.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Bare expression: the session object becomes this cell's displayed output.
spark

## Read the Dataset

In [4]:
# Load sample1.csv with the first line treated as column names and with
# column types inferred from the data.
df_pyspark = spark.read.csv('sample1.csv', header=True, inferSchema=True)

In [5]:
# Print each column's name, inferred type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [8]:
# Read sample1.csv again with the header and schema-inference options set on
# the reader first, then print the loaded rows.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_pyspark = csv_reader.csv('sample1.csv')
df_pyspark.show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|    jack| 31|        10| 30000|
|    alex| 30|         8| 25000|
|caroline| 29|         4| 20000|
|    paul| 24|         3| 20000|
|  sandra| 21|         1| 15000|
|casandra| 23|         2| 18000|
+--------+---+----------+------+



In [9]:
# Print the schema of the DataFrame currently bound to df_pyspark.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [10]:
# Show the Python class of the object the CSV reader returned.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

## Show Top 3 Records

In [11]:
# head(3) returns the first three rows as a list of Row objects rather than
# printing a formatted table.
df_pyspark.head(3)

[Row(Name='jack', age=31, Experience=10, Salary=30000),
 Row(Name='alex', age=30, Experience=8, Salary=25000),
 Row(Name='caroline', age=29, Experience=4, Salary=20000)]

In [12]:
# Print the DataFrame as a formatted table; no row count is passed here.
df_pyspark.show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|    jack| 31|        10| 30000|
|    alex| 30|         8| 25000|
|caroline| 29|         4| 20000|
|    paul| 24|         3| 20000|
|  sandra| 21|         1| 15000|
|casandra| 23|         2| 18000|
+--------+---+----------+------+



## Show Only Two Columns

In [13]:
# Project the DataFrame down to the two named columns and print the result.
df_pyspark.select('Name', 'Experience').show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|    jack|        10|
|    alex|         8|
|caroline|         4|
|    paul|         3|
|  sandra|         1|
|casandra|         2|
+--------+----------+



In [14]:
# Indexing by column name yields a Column object, not the column's values;
# use select(...).show() to see the data.
df_pyspark['Name']

Column<'Name'>

## Show Attributes/Columns Data Types

In [15]:
# List (column name, type name) pairs for every column.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

## Statistical Summary of Dataset

In [16]:
# describe() builds a summary DataFrame (count, mean, stddev, min, max per
# column); show() prints it.
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|  alex|                21|                1|             15000|
|    max|sandra|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



## Add New Column

In [17]:
# Derive a new column holding Experience plus two and rebind df_pyspark to
# the widened DataFrame, then print it.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 year',
    df_pyspark.Experience + 2,
)
df_pyspark.show()

+--------+---+----------+------+-----------------------+
|    Name|age|Experience|Salary|Experience After 2 year|
+--------+---+----------+------+-----------------------+
|    jack| 31|        10| 30000|                     12|
|    alex| 30|         8| 25000|                     10|
|caroline| 29|         4| 20000|                      6|
|    paul| 24|         3| 20000|                      5|
|  sandra| 21|         1| 15000|                      3|
|casandra| 23|         2| 18000|                      4|
+--------+---+----------+------+-----------------------+



## Drop Columns

In [18]:
# Remove the derived column and rebind df_pyspark to the result, then print
# it to confirm the column is gone.
df_pyspark=df_pyspark.drop('Experience After 2 year')
df_pyspark.show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|    jack| 31|        10| 30000|
|    alex| 30|         8| 25000|
|caroline| 29|         4| 20000|
|    paul| 24|         3| 20000|
|  sandra| 21|         1| 15000|
|casandra| 23|         2| 18000|
+--------+---+----------+------+



## Rename Column

In [19]:
# Print a copy with 'Name' renamed to 'Employee Name'. The result is not
# assigned, so df_pyspark itself keeps the original column name.
df_pyspark.withColumnRenamed('Name','Employee Name').show()

+-------------+---+----------+------+
|Employee Name|age|Experience|Salary|
+-------------+---+----------+------+
|         jack| 31|        10| 30000|
|         alex| 30|         8| 25000|
|     caroline| 29|         4| 20000|
|         paul| 24|         3| 20000|
|       sandra| 21|         1| 15000|
|     casandra| 23|         2| 18000|
+-------------+---+----------+------+

