# In this notebook we will see:

- Pyspark DataFrame
    - Reading datasets
    - Checking the datatypes of columns(Schema)
    - Selecting columns and indexing
    - Summary statistics with `describe()`, similar to pandas
    - Adding, dropping, and renaming columns

In [1]:
from pyspark.sql import SparkSession


In [2]:
# Build the Spark session for this notebook; getOrCreate() hands back an
# already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the session object as the cell output.
spark

In [8]:
# Read the dataset.
# header=True      -> take column names from the first row of the file
# inferSchema=True -> let Spark infer column types (Age/Experience become integers)
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('pysparktest.csv')
)
df_pyspark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|  Shubham| 27|         5|
|   Manish| 25|         1|
|Suryakant| 58|        34|
|  Saurabh| 18|         0|
+---------+---+----------+



In [9]:
# Print the schema as a tree: one line per column with its type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [10]:
# Check the Python type of the loaded object: it is a Spark DataFrame
# (pyspark.sql.dataframe.DataFrame), not a pandas one.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [11]:
# List the column names
df_pyspark.columns

['Name', 'Age', 'Experience']

In [12]:
# head(n) returns the first n rows as a Python list of Row objects
# rather than printing a table.
df_pyspark.head(3)

[Row(Name='Shubham', Age=27, Experience=5),
 Row(Name='Manish', Age=25, Experience=1),
 Row(Name='Suryakant', Age=58, Experience=34)]

In [13]:
# Show only a particular column: select() the column by name, then
# show() prints the result as a table.
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|  Shubham|
|   Manish|
|Suryakant|
|  Saurabh|
+---------+



In [14]:
# Select several columns at once by passing each column name as its own argument.
df_pyspark.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|  Shubham|         5|
|   Manish|         1|
|Suryakant|        34|
|  Saurabh|         0|
+---------+----------+



In [15]:
# Indexing by column name yields a Column object (Column<'Name'>), not the
# column's values; use select(...).show() to see the data.
df_pyspark['Name']

Column<'Name'>

In [16]:
# Check the datatype of every column, as a list of (column name, type) pairs

df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [17]:
# Describe the DataFrame: count, mean, stddev, min and max for each column.
# For the string column Name, mean and stddev come back as null.

df_pyspark.describe().show()

+-------+---------+-----------------+-----------------+
|summary|     Name|              Age|       Experience|
+-------+---------+-----------------+-----------------+
|  count|        4|                4|                4|
|   mean|     null|             32.0|             10.0|
| stddev|     null|17.75762746915627|16.14517471774978|
|    min|   Manish|               18|                0|
|    max|Suryakant|               58|               34|
+-------+---------+-----------------+-----------------+



In [18]:
# Add a derived column holding Experience + 4.
# The result is assigned back to df_pyspark so later cells see the new column.
# Note: the column name contains two spaces before 'year'; the later drop()
# call uses the same literal.

df_pyspark = df_pyspark.withColumn(
    'Exp after 4  year',
    df_pyspark['Experience'] + 4,
)

In [19]:
# Display the DataFrame to confirm the derived column was added.
df_pyspark.show()

+---------+---+----------+-----------------+
|     Name|Age|Experience|Exp after 4  year|
+---------+---+----------+-----------------+
|  Shubham| 27|         5|                9|
|   Manish| 25|         1|                5|
|Suryakant| 58|        34|               38|
|  Saurabh| 18|         0|                4|
+---------+---+----------+-----------------+



In [20]:
# Drop the derived column and assign the result back to df_pyspark.
# The name must match the literal used when the column was added
# (including the two spaces before 'year').
df_pyspark = df_pyspark.drop('Exp after 4  year')

In [21]:
# Display the DataFrame to confirm the derived column is gone.
df_pyspark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|  Shubham| 27|         5|
|   Manish| 25|         1|
|Suryakant| 58|        34|
|  Saurabh| 18|         0|
+---------+---+----------+



In [25]:
# Rename a column: 'Name' -> 'First Name'.
# The renamed result is only displayed here, not assigned, so df_pyspark
# itself still has the original 'Name' column.
df_pyspark.withColumnRenamed('Name','First Name').show()

+----------+---+----------+
|First Name|Age|Experience|
+----------+---+----------+
|   Shubham| 27|         5|
|    Manish| 25|         1|
| Suryakant| 58|        34|
|   Saurabh| 18|         0|
+----------+---+----------+

