In [51]:
'''

@Author: Vighnesh Harish Bilgi
@Date: 2022-12-14
@Last Modified by: Vighnesh Harish Bilgi
@Last Modified time: 2022-12-14
@Title : 1 - Pyspark basic dataframe functions

'''

'\n\n@Author: Vighnesh Harish Bilgi\n@Date: 2022-12-14\n@Last Modified by: Vighnesh Harish Bilgi\n@Last Modified time: 2022-12-14\n@Title : 1 - Pyspark basic dataframe functions\n\n'

In [52]:
# findspark.init() runs here, ahead of the pyspark imports in the next cell --
# presumably so that pyspark can be located; keep this cell before those imports.
import findspark
findspark.init()

In [53]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

## Performing Pyspark basic dataframe functions

### Creating spark session

In [54]:
# Create the session named 'Dataframe', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [55]:
# Evaluate the session as the cell's last expression to inspect it.
spark

### Reading .csv file as spark dataframe

In [56]:
# Without a header the first CSV line is treated as data and the columns
# get the generated names _c0, _c1, _c2.
df_pyspark = spark.read.csv('test1.csv', header=False)
df_pyspark.show()

+--------+---+----------+
|     _c0|_c1|       _c2|
+--------+---+----------+
|    Name|age|Experience|
|Vighnesh| 27|         2|
|   Anoop| 25|         5|
|  Nikhil| 26|         6|
+--------+---+----------+



### Reading .csv file as spark dataframe with header as True

In [57]:
# Treat the first CSV line as the column names.
df_pyspark = spark.read.csv('test1.csv', header=True)
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|Vighnesh| 27|         2|
|   Anoop| 25|         5|
|  Nikhil| 26|         6|
+--------+---+----------+



### Viewing Type of spark Dataframe

In [58]:
# Confirm that the reader returned a Spark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

### Using head() on spark dataframe

In [59]:
# head() with no argument returns a single Row: the first record.
df_pyspark.head()

Row(Name='Vighnesh', age='27', Experience='2')

In [60]:
# head(n) returns a Python list holding the first n Row objects.
df_pyspark.head(3)

[Row(Name='Vighnesh', age='27', Experience='2'),
 Row(Name='Anoop', age='25', Experience='5'),
 Row(Name='Nikhil', age='26', Experience='6')]

### Viewing schema of spark dataframe

#### Here we see that 'age' and 'Experience' are read as string data type when they should be integer

In [61]:
# Print each column's name, data type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



### Correcting schema of the dataframe using the 'inferSchema' parameter

In [62]:
# Reload the file, this time letting Spark infer each column's type
# instead of reading every column as a string.
df_pyspark = spark.read.csv('test1.csv', header=True, inferSchema=True)
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|Vighnesh| 27|         2|
|   Anoop| 25|         5|
|  Nikhil| 26|         6|
+--------+---+----------+



In [63]:
# Re-check the schema after reloading with inferSchema.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



### Viewing one column of spark dataframe

In [64]:
# select() with one column name gives a one-column DataFrame; show() prints it.
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|Vighnesh|
|   Anoop|
|  Nikhil|
+--------+



In [65]:
# The result of select() is itself a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

### Viewing multiple columns of spark dataframe

In [66]:
# Project just the two columns of interest and print them.
df_pyspark.select('Name', 'Experience').show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|Vighnesh|         2|
|   Anoop|         5|
|  Nikhil|         6|
+--------+----------+



In [67]:
# Indexing by name yields a Column object rather than a DataFrame
# (contrast with select(), which returns a DataFrame).
df_pyspark['Name']

Column<b'Name'>

### Viewing datatypes of the columns

In [68]:
# dtypes is a list of (column name, type string) pairs.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

### Using describe() on spark dataframe

In [69]:
# describe() returns a DataFrame of summary statistics
# (count, mean, stddev, min, max), so show() is needed to print it.
df_pyspark.describe().show()

+-------+--------+----+------------------+
|summary|    Name| age|        Experience|
+-------+--------+----+------------------+
|  count|       3|   3|                 3|
|   mean|    null|26.0| 4.333333333333333|
| stddev|    null| 1.0|2.0816659994661326|
|    min|   Anoop|  25|                 2|
|    max|Vighnesh|  27|                 6|
+-------+--------+----+------------------+



### Creating new column in spark dataframe

In [70]:
# Derive a new column from 'Experience'; withColumn returns a new DataFrame,
# so the result is assigned back to df_pyspark.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)
df_pyspark.show()

+--------+---+----------+------------------------+
|    Name|age|Experience|Experience After 2 years|
+--------+---+----------+------------------------+
|Vighnesh| 27|         2|                       4|
|   Anoop| 25|         5|                       7|
|  Nikhil| 26|         6|                       8|
+--------+---+----------+------------------------+



### Dropping new column in spark dataframe

In [71]:
# drop() returns a new DataFrame without the named column; assign it back
# so the derived column is removed again.
df_pyspark = df_pyspark.drop(
    'Experience After 2 years'
)
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|Vighnesh| 27|         2|
|   Anoop| 25|         5|
|  Nikhil| 26|         6|
+--------+---+----------+



### Adding new row to the dataframe

In [74]:
# Build the one-row DataFrame with df_pyspark's own schema. Passing only the
# column names would make Spark infer the Python ints as long (bigint), and
# union() would then widen 'age' and 'Experience' from integer to bigint.
newRow = spark.createDataFrame([('Divyansh', 29, 7)], schema=df_pyspark.schema)
# union() matches columns by position, so the tuple order must follow the schema.
df_pyspark = df_pyspark.union(newRow)
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|Vighnesh| 27|         2|
|   Anoop| 25|         5|
|  Nikhil| 26|         6|
|Divyansh| 29|         7|
+--------+---+----------+



### Renaming a column in spark dataframe

In [75]:
# The renamed DataFrame is only displayed, not assigned, so df_pyspark
# itself still has the column 'Name' after this cell.
df_pyspark.withColumnRenamed('Name','New Name').show()

+--------+---+----------+
|New Name|age|Experience|
+--------+---+----------+
|Vighnesh| 27|         2|
|   Anoop| 25|         5|
|  Nikhil| 26|         6|
|Divyansh| 29|         7|
+--------+---+----------+

