In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the SparkSession named 'tutorial2'; later cells read data through it.
spark = (
    SparkSession.builder
    .appName('tutorial2')
    .getOrCreate()
)

In [3]:
# Bare expression as the cell's last line: the notebook displays the SparkSession object.
spark

In [4]:
# one way of reading the dataset: set the header option on the reader,
# then load the CSV and let Spark infer the column types
df_pyspark = (
    spark.read
    .option('header', 'True')
    .csv('test.csv', inferSchema=True)
)

In [5]:
# Print each column's name, inferred type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [7]:
# another way of reading the dataset: pass every option as a keyword argument to csv()
df_pyspark = spark.read.csv(
    'test.csv',
    header=True,
    inferSchema=True,
)
# print the loaded rows as a text table
df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| Ravi| 35|        15|
|Krish| 28|         7|
| Hari| 21|         1|
|Sunny| 30|         8|
| Siva| 25|         3|
| Giri| 32|        10|
+-----+---+----------+



In [8]:
# Check which class the reader returned (a PySpark DataFrame).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [9]:
# List the column names as a plain Python list.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [11]:
# getting a single column: select() returns a new DataFrame holding only 'Name',
# and show() prints it
df_pyspark.select('Name').show()

+-----+
| Name|
+-----+
| Ravi|
|Krish|
| Hari|
|Sunny|
| Siva|
| Giri|
+-----+



In [12]:
# getting two columns: the names are passed as separate arguments to select()
df_pyspark.select('Name', 'Experience').show()

+-----+----------+
| Name|Experience|
+-----+----------+
| Ravi|        15|
|Krish|         7|
| Hari|         1|
|Sunny|         8|
| Siva|         3|
| Giri|        10|
+-----+----------+



In [14]:
# getting data types: dtypes is a list of (column name, type name) pairs
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [15]:
# describe() on its own only returns a DataFrame of summary statistics; the notebook
# shows that DataFrame's schema repr, not the values. Call .show() to print them.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [16]:
# Print the summary statistics (count, mean, stddev, min, max) for every column.
df_pyspark.describe().show()

+-------+-----+------------------+-----------------+
|summary| Name|               Age|       Experience|
+-------+-----+------------------+-----------------+
|  count|    6|                 6|                6|
|   mean| NULL|              28.5|7.333333333333333|
| stddev| NULL|5.0099900199501395| 5.00666222813829|
|    min| Giri|                21|                1|
|    max|Sunny|                35|               15|
+-------+-----+------------------+-----------------+



In [19]:
# adding columns: derive 'Experience after 2 years' from 'Experience' and rebind
# df_pyspark to the resulting DataFrame
df_pyspark = df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark['Experience'] + 2,
)

In [20]:
# NOTE(review): the previous cell already added this column and rebound df_pyspark.
# withColumn with an existing name replaces that column, so this prints the same table
# as a plain df_pyspark.show(); the result is not assigned.
df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark['Experience'] + 2,
).show()

+-----+---+----------+------------------------+
| Name|Age|Experience|Experience after 2 years|
+-----+---+----------+------------------------+
| Ravi| 35|        15|                      17|
|Krish| 28|         7|                       9|
| Hari| 21|         1|                       3|
|Sunny| 30|         8|                      10|
| Siva| 25|         3|                       5|
| Giri| 32|        10|                      12|
+-----+---+----------+------------------------+



In [21]:
# Show the DataFrame currently bound to df_pyspark, including the added column.
df_pyspark.show()

+-----+---+----------+------------------------+
| Name|Age|Experience|Experience after 2 years|
+-----+---+----------+------------------------+
| Ravi| 35|        15|                      17|
|Krish| 28|         7|                       9|
| Hari| 21|         1|                       3|
|Sunny| 30|         8|                      10|
| Siva| 25|         3|                       5|
| Giri| 32|        10|                      12|
+-----+---+----------+------------------------+



In [22]:
# dropping columns: drop() returns a new DataFrame without the named column,
# so the result is assigned back to df_pyspark
df_pyspark = df_pyspark.drop('Experience after 2 years')

In [23]:
# Show the DataFrame after the drop; the derived column no longer appears.
df_pyspark.show()


+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| Ravi| 35|        15|
|Krish| 28|         7|
| Hari| 21|         1|
|Sunny| 30|         8|
| Siva| 25|         3|
| Giri| 32|        10|
+-----+---+----------+



In [26]:
# renaming columns: withColumnRenamed(existing, new) returns a new DataFrame with
# 'Name' renamed to 'Names'; the result is assigned back to df_pyspark
df_pyspark = df_pyspark.withColumnRenamed('Name', 'Names')

In [27]:
# Show the DataFrame to confirm the renamed column header.
df_pyspark.show()

+-----+---+----------+
|Names|Age|Experience|
+-----+---+----------+
| Ravi| 35|        15|
|Krish| 28|         7|
| Hari| 21|         1|
|Sunny| 30|         8|
| Siva| 25|         3|
| Giri| 32|        10|
+-----+---+----------+

