In [2]:
# Testing the PySpark installation
import findspark

# Raw string: in a regular literal "\S" is an invalid escape sequence, which
# newer Python versions warn about. The resulting path value is unchanged.
findspark.init(r'C:\Spark')

import pyspark

# Last expression of the cell, so the notebook displays the Spark location
# that findspark reports.
findspark.find()

'C:\\Spark'

In [3]:
import pyspark
from pyspark.sql import SparkSession

# Build a session (or reuse an existing one) against the "local[1]" master.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Bare expression so the notebook renders the session object.
spark

### Create DataFrame

In [9]:
# Read the Iris CSV with the "header" option enabled, then preview 5 rows.
df = (
    spark.read
    .option("header", True)
    .csv("data/IRIS.csv")
)
df.show(n=5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|          3|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|           5|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [15]:
# Show the first 2 rows without truncating any cell value
df.show(n=2, truncate=False)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|species    |
+------------+-----------+------------+-----------+-----------+
|5.1         |3.5        |1.4         |0.2        |Iris-setosa|
|4.9         |3          |1.4         |0.2        |Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 2 rows



In [16]:
# Show the first 2 rows, limiting each cell value to 25 characters
df.show(n=2, truncate=25)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|          3|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 2 rows



In [17]:
# Print the first 3 rows vertically: one "column | value" line per field
df.show(
    n=3,
    truncate=25,
    vertical=True,
)

-RECORD 0-------------------
 sepal_length | 5.1         
 sepal_width  | 3.5         
 petal_length | 1.4         
 petal_width  | 0.2         
 species      | Iris-setosa 
-RECORD 1-------------------
 sepal_length | 4.9         
 sepal_width  | 3           
 petal_length | 1.4         
 petal_width  | 0.2         
 species      | Iris-setosa 
-RECORD 2-------------------
 sepal_length | 4.7         
 sepal_width  | 3.2         
 petal_length | 1.3         
 petal_width  | 0.2         
 species      | Iris-setosa 
only showing top 3 rows



In [18]:
# Print each column's name, type and nullability for the loaded frame.
df.printSchema()

root
 |-- sepal_length: string (nullable = true)
 |-- sepal_width: string (nullable = true)
 |-- petal_length: string (nullable = true)
 |-- petal_width: string (nullable = true)
 |-- species: string (nullable = true)



In [20]:
# Check the data type of the "petal_width" column.

df.select("petal_width").dtypes

[('petal_width', 'string')]

In [21]:
# Change a column's type with Column.cast
# NOTE(review): the previews above show fractional petal_width values such as
# 0.2, which an integer type cannot represent — confirm that "int" is really
# the intended target here rather than "double".
df_datatype = df.withColumn(
    "petal_width",
    df.petal_width.cast("int"),
)
df_datatype.printSchema()

root
 |-- sepal_length: string (nullable = true)
 |-- sepal_width: string (nullable = true)
 |-- petal_length: string (nullable = true)
 |-- petal_width: integer (nullable = true)
 |-- species: string (nullable = true)



In [23]:
# Change column types with SQL cast expressions passed to selectExpr

# NOTE(review): nothing in this cell uses a name from pyspark.sql.types; the
# wildcard import is kept only in case other cells rely on it.
from pyspark.sql.types import *

# Each expression casts one column to Double and keeps its original name.
df_new = df.selectExpr(
    "cast(petal_width as Double ) petal_width",
    "cast(sepal_length as Double) sepal_length",
)

df_new.printSchema()
df_new.show(n=5)

root
 |-- petal_width: double (nullable = true)
 |-- sepal_length: double (nullable = true)

+-----------+------------+
|petal_width|sepal_length|
+-----------+------------+
|        0.2|         5.1|
|        0.2|         4.9|
|        0.2|         4.7|
|        0.2|         4.6|
|        0.2|         5.0|
+-----------+------------+
only showing top 5 rows



In [25]:
# Change column types with a Spark SQL query

# Register the DataFrame as a temporary view named "Table" so SQL can see it.
df.createOrReplaceTempView("Table")

df_sql = spark.sql(
    "SELECT STRING(species),Float(sepal_length) from Table"
)
df_sql.printSchema()
df_sql.show(n=5)

root
 |-- species: string (nullable = true)
 |-- sepal_length: float (nullable = true)

+-----------+------------+
|    species|sepal_length|
+-----------+------------+
|Iris-setosa|         5.1|
|Iris-setosa|         4.9|
|Iris-setosa|         4.7|
|Iris-setosa|         4.6|
|Iris-setosa|         5.0|
+-----------+------------+
only showing top 5 rows



### Change the value of an existing column

In [10]:
from pyspark.sql.functions import col

# Replace petal_length with ten times its current value.
df_value = df.withColumn(
    "petal_length",
    col("petal_length") * 10,
)
df_value.show(n=5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|        14.0|        0.2|Iris-setosa|
|         4.9|          3|        14.0|        0.2|Iris-setosa|
|         4.7|        3.2|        13.0|        0.2|Iris-setosa|
|         4.6|        3.1|        15.0|        0.2|Iris-setosa|
|           5|        3.6|        14.0|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



### Derive column from existing column

In [11]:
# NOTE(review): this writes back to "petal_length" itself, and multiplying by
# 100 then dividing by 100 cancels out, so no new column is derived — confirm
# whether a differently named target column and formula were intended.
df_new = df.withColumn(
    "petal_length",
    (col("petal_length") * 100) / 100,
)
df_new.show(n=5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|          3|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|           5|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



### Rename a DataFrame Column

In [12]:
# Same data, with the petal_width column exposed as petal_width_1.
re_df = df.withColumnRenamed(
    "petal_width",
    "petal_width_1",
)
re_df.show(n=5)

+------------+-----------+------------+-------------+-----------+
|sepal_length|sepal_width|petal_length|petal_width_1|    species|
+------------+-----------+------------+-------------+-----------+
|         5.1|        3.5|         1.4|          0.2|Iris-setosa|
|         4.9|          3|         1.4|          0.2|Iris-setosa|
|         4.7|        3.2|         1.3|          0.2|Iris-setosa|
|         4.6|        3.1|         1.5|          0.2|Iris-setosa|
|           5|        3.6|         1.4|          0.2|Iris-setosa|
+------------+-----------+------------+-------------+-----------+
only showing top 5 rows



### Add a new column with constant value

In [13]:
# lit() supplies the constant value used for every row of the new column
from pyspark.sql.functions import lit

new_col = df.withColumn(
    "COllege",
    lit("MITRC"),
)
new_col.show(n=5)

+------------+-----------+------------+-----------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|    species|COllege|
+------------+-----------+------------+-----------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|  MITRC|
|         4.9|          3|         1.4|        0.2|Iris-setosa|  MITRC|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|  MITRC|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|  MITRC|
|           5|        3.6|         1.4|        0.2|Iris-setosa|  MITRC|
+------------+-----------+------------+-----------+-----------+-------+
only showing top 5 rows



In [14]:
# Remove the "COllege" column from new_col and preview the result
drop_df = new_col.drop("COllege")
drop_df.show(n=5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|          3|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|           5|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [None]:
# greatest: row-wise maximum across several columns

from pyspark.sql.functions import greatest, col

# The iris frame has no level1..level4 columns, so the original expression
# could not be resolved. Compare the four measurement columns instead. They
# were read from the CSV as strings (see the printSchema output), so each is
# cast to double to make the comparison numeric rather than lexicographic.
measurement_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

df1 = df.withColumn(
    "large",
    greatest(*[col(name).cast("double") for name in measurement_cols]),
)
df1.show(5)

In [None]:
# least: row-wise minimum across several columns

from pyspark.sql.functions import least, col

# The iris frame has no level1..level4 columns, so the original expression
# could not be resolved. Compare the four measurement columns instead. They
# were read from the CSV as strings (see the printSchema output), so each is
# cast to double to make the comparison numeric rather than lexicographic.
measurement_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

df2 = df.withColumn(
    "Small",
    least(*[col(name).cast("double") for name in measurement_cols]),
)
df2.show(5)