In [48]:
from pyspark.sql import SparkSession

In [49]:
# Create the Spark session (or reuse an existing one) that the rest of the
# notebook reads data through.
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [50]:
# Bare expression so the notebook displays the session object.
spark

#### Reading the Dataset


In [51]:
# Load the CSV using its first row as column names. No type inference is
# requested here, so every column is read as a string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv("test_1.csv")
)

#### Checking the Schema


In [52]:
# Print each column's name, type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [53]:
# Re-read the same file, this time asking Spark to infer column types from
# the data instead of defaulting everything to string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv("test_1.csv", inferSchema=True)
)

# Age and Experience should now be reported as integers.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [54]:
# Equivalent load written with keyword arguments only.
df_pyspark = spark.read.csv(
    "test_1.csv",
    header=True,
    inferSchema=True,
)
# Bare expression: the repr lists the column names and types.
df_pyspark

DataFrame[Name: string, Age: int, Experience: int]

In [55]:
# Confirm that the reader returned a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

#### Selecting the columns

In [56]:
# List the column names as plain Python strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [57]:
# Print the rows as a text table.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|SIddhant| 24|        10|
|Niranjan| 22|        15|
| Shekhar| 28|        20|
+--------+---+----------+



In [58]:
# select() returns a new DataFrame; the repr shows only its schema, not rows.
df_pyspark.select('Name')

DataFrame[Name: string]

In [59]:
# Chain show() onto the selection to print the rows of the Name column.
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|SIddhant|
|Niranjan|
| Shekhar|
+--------+



In [60]:
# Selecting a single column still yields a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [61]:
# Pass a list of names to select several columns at once.
df_pyspark.select(["Name", "Experience"])

DataFrame[Name: string, Experience: int]

In [62]:
# Print the rows of the two selected columns.
df_pyspark.select(["Name", "Experience"]).show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|SIddhant|        10|
|Niranjan|        15|
| Shekhar|        20|
+--------+----------+



In [63]:
# A multi-column selection is also a DataFrame.
type(df_pyspark.select(["Name", "Experience"]))

pyspark.sql.dataframe.DataFrame

#### Checking Datatypes

In [64]:
# List (column name, type name) pairs for every column.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

#### Describe option in PySpark

In [65]:
# describe() returns another DataFrame of summary statistics; the repr shows
# only its schema.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [66]:
# Print count, mean, stddev, min and max for each column.
df_pyspark.describe().show()

+-------+--------+------------------+----------+
|summary|    Name|               Age|Experience|
+-------+--------+------------------+----------+
|  count|       3|                 3|         3|
|   mean|    NULL|24.666666666666668|      15.0|
| stddev|    NULL| 3.055050463303893|       5.0|
|    min|Niranjan|                22|        10|
|    max| Shekhar|                28|        20|
+-------+--------+------------------+----------+



#### Adding and Dropping Columns

In [95]:
# Append a derived column holding the row-wise sum of Experience and Age.
df_pyspark = df_pyspark.withColumn(
    "Experience and Age combined",
    df_pyspark['Experience'] + df_pyspark['Age'],
)

In [96]:
# Print the frame to check the newly added combined column.
df_pyspark.show()

+--------+---+----------+---------------------------+
|    Name|Age|Experience|Experience and Age combined|
+--------+---+----------+---------------------------+
|SIddhant| 24|        10|                         34|
|Niranjan| 22|        15|                         37|
| Shekhar| 28|        20|                         48|
+--------+---+----------+---------------------------+



In [98]:
# Remove the derived column again; drop() returns a new DataFrame, so the
# result is reassigned to df_pyspark.
df_pyspark=df_pyspark.drop('Experience and Age combined')

In [100]:
# Print the frame to check that only Name, Age and Experience remain.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|SIddhant| 24|        10|
|Niranjan| 22|        15|
| Shekhar| 28|        20|
+--------+---+----------+



#### Renaming the columns

In [101]:
# withColumnRenamed() returns a renamed copy. The result is not assigned here,
# so df_pyspark itself keeps its 'Experience' column.
df_pyspark.withColumnRenamed('Experience', 'Exp')

DataFrame[Name: string, Age: int, Exp: int]

In [102]:
# Print the renamed copy; df_pyspark is still not modified.
df_pyspark.withColumnRenamed('Experience', 'Exp').show()

+--------+---+---+
|    Name|Age|Exp|
+--------+---+---+
|SIddhant| 24| 10|
|Niranjan| 22| 15|
| Shekhar| 28| 20|
+--------+---+---+



In [107]:
# Inspect the Name values; note the miscapitalised "SIddhant" entry.
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|SIddhant|
|Niranjan|
| Shekhar|
+--------+



In [108]:
from pyspark.sql.functions import when

In [111]:
# Overwrite the Name column: replace the exact value "SIddhant" with
# "Siddhant" and leave every other name unchanged.
df_pyspark = df_pyspark.withColumn(
    "Name",
    when(df_pyspark['Name'] == "SIddhant", "Siddhant")
    .otherwise(df_pyspark["Name"]),
)

In [112]:
# Print the frame to check the corrected name.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Siddhant| 24|        10|
|Niranjan| 22|        15|
| Shekhar| 28|        20|
+--------+---+----------+

