## ***In this tutorial we will cover:***



* PySpark DataFrame
* Reading the Dataset
* Checking the Data Types of the Columns (Schema)
* Selecting Columns and Indexing
* Using the describe() Option (similar to Pandas)
* Adding Columns
* Dropping Columns
* Renaming Columns



In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 33 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 48.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=ea190ec60d1b14a3c920341225803b26b305ece9757717e360b3da542e8f7532
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


# Starting Session

In [3]:
from pyspark.sql import SparkSession

In [5]:
# Entry point for all DataFrame work below: obtain a session registered
# under the application name 'Dataframe'.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it.
spark

# Reading the Dataset

In [36]:
# Load test1.csv, variant 1: the header flag is set through the reader's
# option() call, while inferSchema is passed straight to csv().
df_pyspark1 = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)

# print() shows only the column names and types; show() prints the rows.
print(df_pyspark1)
df_pyspark1.show()

DataFrame[name: string, age: int, experience: int]
+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
| Uttam | 21|        10|
| Karan | 23|         4|
|Krishna| 28|        50|
|  Arjun| 24|         9|
|  Bhim | 26|         7|
+-------+---+----------+



In [37]:
# Check the schema: with inferSchema enabled, age and experience are read
# as integers instead of strings.

df_pyspark1.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [38]:
# Load test1.csv, variant 2: both the header flag and schema inference are
# passed as keyword arguments to csv() in a single call.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

# print() shows only the column names and types; show() prints the rows.
print(df_pyspark)
df_pyspark.show()

DataFrame[name: string, age: int, experience: int]
+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
| Uttam | 21|        10|
| Karan | 23|         4|
|Krishna| 28|        50|
|  Arjun| 24|         9|
|  Bhim | 26|         7|
+-------+---+----------+



# Checking the Data Types of the Columns (Schema)

In [39]:
# List the DataFrame's column names.
df_pyspark.columns

['name', 'age', 'experience']

In [40]:
# Print the schema tree: each column's name, data type and nullability.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [41]:
# Fetch the first 3 rows; the result is a list of Row objects, not a DataFrame.
df_pyspark.head(3)

[Row(name='Uttam ', age=21, experience=10),
 Row(name='Karan ', age=23, experience=4),
 Row(name='Krishna', age=28, experience=50)]

# Selecting Columns and Indexing

In [42]:
# Selecting a single column: select() returns a new DataFrame, so the cell
# output is only its column/type summary; call .show() to print the rows.

df_pyspark.select('name')

DataFrame[name: string]

In [43]:
# Print the rows of the single-column selection.
df_pyspark.select('name').show()

+-------+
|   name|
+-------+
| Uttam |
| Karan |
|Krishna|
|  Arjun|
|  Bhim |
+-------+



In [44]:
# Selecting multiple columns: pass a list of column names; the result is
# again a DataFrame.

df_pyspark.select(['name','age'])

DataFrame[name: string, age: int]

In [45]:
# Print the rows of the two-column selection.
df_pyspark.select(['name','age']).show()

+-------+---+
|   name|age|
+-------+---+
| Uttam | 21|
| Karan | 23|
|Krishna| 28|
|  Arjun| 24|
|  Bhim | 26|
+-------+---+



In [46]:
# Confirm that select() yields a DataFrame even when only one column is chosen.
type(df_pyspark.select('name'))

pyspark.sql.dataframe.DataFrame

In [47]:
# Find the data type of every column: dtypes is a list of
# (column name, data type) pairs.

df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

# Check Describe option similar to Pandas

In [48]:
# Summary statistics (count, mean, stddev, min, max) for each column.
# For the string column `name`, mean and stddev come back as null.
df_pyspark.describe().show()

+-------+------+-----------------+------------------+
|summary|  name|              age|        experience|
+-------+------+-----------------+------------------+
|  count|     5|                5|                 5|
|   mean|  null|             24.4|              16.0|
| stddev|  null|2.701851217221259|19.144189719076646|
|    min| Arjun|               21|                 4|
|    max|Uttam |               28|                50|
+-------+------+-----------------+------------------+



# Adding Columns in Dataframe

In [53]:
# Derive a new column by adding 2 to every value of `experience`.
# withColumn() returns a new DataFrame, which is bound back to the same name.
df_pyspark = df_pyspark.withColumn(
    'experience after 2 years',
    df_pyspark['experience'] + 2,
)
df_pyspark.show()

+-------+---+----------+------------------------+
|   name|age|experience|experience after 2 years|
+-------+---+----------+------------------------+
| Uttam | 21|        10|                      12|
| Karan | 23|         4|                       6|
|Krishna| 28|        50|                      52|
|  Arjun| 24|         9|                      11|
|  Bhim | 26|         7|                       9|
+-------+---+----------+------------------------+



# Dropping Columns in Dataframe

In [55]:
# Remove the derived column again. drop() returns a new DataFrame, so the
# result is bound back to df_pyspark.
df_pyspark = df_pyspark.drop('experience after 2 years')
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
| Uttam | 21|        10|
| Karan | 23|         4|
|Krishna| 28|        50|
|  Arjun| 24|         9|
|  Bhim | 26|         7|
+-------+---+----------+



# Renaming Columns in Dataframe

In [56]:
# Rename the `name` column to `new_name`; the result is bound back to
# df_pyspark.
# NOTE(review): after this cell df_pyspark no longer has a `name` column, so
# the earlier select('name') cells will fail if re-run without first
# re-executing the cell that loads df_pyspark from test1.csv.
df_pyspark = df_pyspark.withColumnRenamed('name','new_name')
df_pyspark.show()

+--------+---+----------+
|new_name|age|experience|
+--------+---+----------+
|  Uttam | 21|        10|
|  Karan | 23|         4|
| Krishna| 28|        50|
|   Arjun| 24|         9|
|   Bhim | 26|         7|
+--------+---+----------+

