## PySpark Session-2:

In [4]:
# Import SparkSession, which is used below to create the PySpark session
from pyspark.sql import SparkSession

In [6]:
# Create (or reuse) the Spark session for this notebook; the app is named "Dataframe"
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [7]:
# Read the dataset, treating the first line of the CSV as the header row
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("Test2.csv")
)

In [8]:
# Peek at the first 3 rows; head(n) returns them as a list of Row objects
df_pyspark.head(3)

[Row(Name='A', Age='30', Experience='10'),
 Row(Name='B', Age='31', Experience='11'),
 Row(Name='C', Age='32', Experience='12')]

In [9]:
# Display the DataFrame rows as a formatted table
df_pyspark.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|   A| 30|        10|
|   B| 31|        11|
|   C| 32|        12|
|   D| 33|        13|
+----+---+----------+



In [11]:
# Check the schema: without inferSchema, every column is read as a string
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [12]:
# Re-read the dataset with inferSchema enabled so that column datatypes are
# inferred from the data instead of all being read as string
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("Test2.csv", inferSchema=True)
)

In [13]:
# With inferSchema enabled, Age and Experience are now read as integers
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [16]:
# Another way to read the dataset with a header row and inferred datatypes:
# pass the options directly as keyword arguments to csv().
# The filename uses the same case as the earlier reads ("Test2.csv") so this
# cell also works on case-sensitive file systems.
df_pyspark = spark.read.csv("Test2.csv", header=True, inferSchema=True)

In [17]:
# Print the schema of the DataFrame read with keyword arguments
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [20]:
# View the column names (returned as a Python list of strings)
df_pyspark.columns

['Name', 'Age', 'Experience']

In [21]:
# Select a single column. select() returns a new DataFrame, so without
# show() only its schema is displayed, not the data.
df_pyspark.select("Name")

DataFrame[Name: string]

In [22]:
# Select a single column and display its rows
df_pyspark.select("Name").show()

+----+
|Name|
+----+
|   A|
|   B|
|   C|
|   D|
+----+



In [23]:
# Select several columns at once; the result is a new DataFrame, so only
# its schema is displayed here.
df_pyspark.select("Name", "Experience")

DataFrame[Name: string, Experience: int]

In [25]:
# Select several columns and display their rows
df_pyspark.select("Name", "Experience").show()

+----+----------+
|Name|Experience|
+----+----------+
|   A|        10|
|   B|        11|
|   C|        12|
|   D|        13|
+----+----------+



In [27]:
# Check the datatype of each column, returned as (column name, type) pairs
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [29]:
# Describe the DataFrame. describe() returns a new DataFrame of summary
# statistics, so without show() only its schema is displayed.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [30]:
# Display the summary statistics (count, mean, stddev, min, max) for each column
df_pyspark.describe().show()

+-------+----+------------------+------------------+
|summary|Name|               Age|        Experience|
+-------+----+------------------+------------------+
|  count|   4|                 4|                 4|
|   mean|null|              31.5|              11.5|
| stddev|null|1.2909944487358056|1.2909944487358056|
|    min|   A|                30|                10|
|    max|   D|                33|                13|
+-------+----+------------------+------------------+



In [31]:
# Add a new column named "Exp_after_2yr" holding Experience + 2.
# withColumn() returns a new DataFrame; the result is not assigned here,
# so df_pyspark itself is unchanged by this cell.
df_pyspark.withColumn("Exp_after_2yr", df_pyspark.Experience + 2)

DataFrame[Name: string, Age: int, Experience: int, Exp_after_2yr: int]

In [32]:
# Display the DataFrame with the derived Exp_after_2yr column (Experience + 2)
(
    df_pyspark
    .withColumn("Exp_after_2yr", df_pyspark.Experience + 2)
    .show()
)

+----+---+----------+-------------+
|Name|Age|Experience|Exp_after_2yr|
+----+---+----------+-------------+
|   A| 30|        10|           12|
|   B| 31|        11|           13|
|   C| 32|        12|           14|
|   D| 33|        13|           15|
+----+---+----------+-------------+



In [33]:
# Assign the result back so that df_pyspark keeps the Exp_after_2yr column
df_pyspark = df_pyspark.withColumn(
    "Exp_after_2yr",
    df_pyspark.Experience + 2,
)

In [34]:
# Confirm that df_pyspark now contains the Exp_after_2yr column
df_pyspark.show()

+----+---+----------+-------------+
|Name|Age|Experience|Exp_after_2yr|
+----+---+----------+-------------+
|   A| 30|        10|           12|
|   B| 31|        11|           13|
|   C| 32|        12|           14|
|   D| 33|        13|           15|
+----+---+----------+-------------+



In [35]:
# Drop a column
# drop() returns a new DataFrame without the Exp_after_2yr column.
# The name must match the column exactly: drop() does not raise on an
# unknown column name, it just returns the DataFrame unchanged.
df_pyspark.drop("Exp_after_2yr")

DataFrame[Name: string, Age: int, Experience: int]

In [45]:
# Display the DataFrame without Exp_after_2yr. The result of drop() is not
# assigned here, so df_pyspark itself still has the column after this cell.
df_pyspark.drop("Exp_after_2yr").show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|   A| 30|        10|
|   B| 31|        11|
|   C| 32|        12|
|   D| 33|        13|
+----+---+----------+



In [47]:
# Assign the result back so that the column is actually removed from df_pyspark
df_pyspark = df_pyspark.drop("Exp_after_2yr")

In [49]:
# Confirm that Exp_after_2yr is no longer part of df_pyspark
df_pyspark.show()

+----+---+----------+
|Name|Age|Experience|
+----+---+----------+
|   A| 30|        10|
|   B| 31|        11|
|   C| 32|        12|
|   D| 33|        13|
+----+---+----------+



In [50]:
# Rename a column
# withColumnRenamed(existing, new) returns a new DataFrame in which the
# Experience column is named Exp; df_pyspark is not modified by this cell.
df_pyspark.withColumnRenamed("Experience","Exp")

DataFrame[Name: string, Age: int, Exp: int]

In [52]:
# Assign the result back so that df_pyspark uses the new column name Exp
df_pyspark = df_pyspark.withColumnRenamed("Experience","Exp")

In [53]:
# Confirm that the Experience column now appears as Exp
df_pyspark.show()

+----+---+---+
|Name|Age|Exp|
+----+---+---+
|   A| 30| 10|
|   B| 31| 11|
|   C| 32| 12|
|   D| 33| 13|
+----+---+---+

