In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('pyspark-examples') \
                    .getOrCreate()

In [3]:

from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType

data = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

schema = StructType([ 
    StructField("firstname",StringType(),True), 
    StructField("middlename",StringType(),True), 
    StructField("lastname",StringType(),True), 
    StructField("id", StringType(), True), 
    StructField("gender", StringType(), True), 
    StructField("salary", IntegerType(), True) 
  ])

In [4]:
df = spark.createDataFrame(data= data, schema= schema)

In [5]:
df.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [6]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [7]:
structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]
structureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('id', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

In [8]:
df2 = spark.createDataFrame(data=structureData,schema=structureSchema)
df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [9]:
df2.show()

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3100|
|   {Michael, Rose, }|40288|     M|  4300|
|{Robert, , Williams}|42114|     M|  1400|
|{Maria, Anne, Jones}|39192|     F|  5500|
|  {Jen, Mary, Brown}|     |     F|    -1|
+--------------------+-----+------+------+



In [10]:
df2.filter(df2.salary > 3000).show()

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3100|
|   {Michael, Rose, }|40288|     M|  4300|
|{Maria, Anne, Jones}|39192|     F|  5500|
+--------------------+-----+------+------+



In [11]:
df2.filter(df2.name.firstname == "James").show()

+----------------+-----+------+------+
|            name|   id|gender|salary|
+----------------+-----+------+------+
|{James, , Smith}|36636|     M|  3100|
+----------------+-----+------+------+



In [12]:
df2.filter(df2.name.middlename == "").show()

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3100|
|{Robert, , Williams}|42114|     M|  1400|
+--------------------+-----+------+------+



In [13]:
df2.filter(df2.name.lastname.like("%s%")).show()

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|{Robert, , Williams}|42114|     M|  1400|
|{Maria, Anne, Jones}|39192|     F|  5500|
+--------------------+-----+------+------+



In [14]:
from pyspark.sql.functions import lower

df2.filter(lower(df2.name.lastname).like("%s%")).show()

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3100|
|{Robert, , Williams}|42114|     M|  1400|
|{Maria, Anne, Jones}|39192|     F|  5500|
+--------------------+-----+------+------+



In [15]:
from pyspark.sql.functions import concat_ws
df2.select(concat_ws(" ",df2.name.firstname,df2.name.middlename,df2.name.lastname).alias("fullName")).show()

+----------------+
|        fullName|
+----------------+
|    James  Smith|
|   Michael Rose |
|Robert  Williams|
|Maria Anne Jones|
|  Jen Mary Brown|
+----------------+



In [16]:
df3 = df2.withColumn("fullName",concat_ws(" ",df2.name.firstname,df2.name.middlename,df2.name.lastname)).show()

+--------------------+-----+------+------+----------------+
|                name|   id|gender|salary|        fullName|
+--------------------+-----+------+------+----------------+
|    {James, , Smith}|36636|     M|  3100|    James  Smith|
|   {Michael, Rose, }|40288|     M|  4300|   Michael Rose |
|{Robert, , Williams}|42114|     M|  1400|Robert  Williams|
|{Maria, Anne, Jones}|39192|     F|  5500|Maria Anne Jones|
|  {Jen, Mary, Brown}|     |     F|    -1|  Jen Mary Brown|
+--------------------+-----+------+------+----------------+



In [17]:
df4 =df2.withColumnRenamed('name','fname').show()

+--------------------+-----+------+------+
|               fname|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3100|
|   {Michael, Rose, }|40288|     M|  4300|
|{Robert, , Williams}|42114|     M|  1400|
|{Maria, Anne, Jones}|39192|     F|  5500|
|  {Jen, Mary, Brown}|     |     F|    -1|
+--------------------+-----+------+------+



In [18]:
from pyspark.sql.functions import *

In [19]:
df2.drop('name').show()

+-----+------+------+
|   id|gender|salary|
+-----+------+------+
|36636|     M|  3100|
|40288|     M|  4300|
|42114|     M|  1400|
|39192|     F|  5500|
|     |     F|    -1|
+-----+------+------+



In [47]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType
from pyspark.sql.functions import col,struct,when

spark = SparkSession.builder.master("local[1]") \
                    .appName('pyspark-examples') \
                    .getOrCreate()

data = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

schema = StructType([ 
    StructField("firstname",StringType(),True), 
    StructField("middlename",StringType(),True), 
    StructField("lastname",StringType(),True), 
    StructField("id", StringType(), True), 
    StructField("gender", StringType(), True), 
    StructField("salary", IntegerType(), True) 
  ])
 
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show(truncate=False)

structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]
structureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('id', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df2 = spark.createDataFrame(data=structureData,schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)


updatedDF = df2.withColumn("OtherInfo", 
    struct(col("id").alias("identifier"),
    col("gender").alias("gender"),
    col("salary").alias("salary"),
    when(col("salary").cast(IntegerType()) < 2000,"Low")
      .when(col("salary").cast(IntegerType()) < 4000,"Medium")
      .otherwise("High").alias("Salary_Grade")
  )).drop("id","gender","salary")

updatedDF.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [48]:
updatedDF.show()

+--------------------+--------------------+
|                name|           OtherInfo|
+--------------------+--------------------+
|    {James, , Smith}|{36636, M, 3100, ...|
|   {Michael, Rose, }|{40288, M, 4300, ...|
|{Robert, , Williams}|{42114, M, 1400, ...|
|{Maria, Anne, Jones}|{39192, F, 5500, ...|
|  {Jen, Mary, Brown}|      {, F, -1, Low}|
+--------------------+--------------------+



In [49]:
data = [("James","","Smith","36636","M",60000),
        ("Michael","Rose","","40288","M",70000),
        ("Robert","","Williams","42114","",400000),
        ("Maria","Anne","Jones","39192","F",500000),
        ("Jen","Mary","Brown","","F",0)]

columns = ["first_name","middle_name","last_name","dob","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|dob  |gender|salary|
+----------+-----------+---------+-----+------+------+
|James     |           |Smith    |36636|M     |60000 |
|Michael   |Rose       |         |40288|M     |70000 |
|Robert    |           |Williams |42114|      |400000|
|Maria     |Anne       |Jones    |39192|F     |500000|
|Jen       |Mary       |Brown    |     |F     |0     |
+----------+-----------+---------+-----+------+------+



In [50]:
from pyspark.sql.functions import col, when
df2 = df.withColumn("new_gender", when(col("gender") == "M","Male")
                                 .when(col("gender") == "F","Female")
                                 .otherwise("Unknown"))
df2.show(truncate=False)

+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|dob  |gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|James     |           |Smith    |36636|M     |60000 |Male      |
|Michael   |Rose       |         |40288|M     |70000 |Male      |
|Robert    |           |Williams |42114|      |400000|Unknown   |
|Maria     |Anne       |Jones    |39192|F     |500000|Female    |
|Jen       |Mary       |Brown    |     |F     |0     |Female    |
+----------+-----------+---------+-----+------+------+----------+



In [51]:
from pyspark.sql.functions import expr
df3 = df.withColumn("new_gender", expr("case when gender = 'M' then 'Male' " + 
                       "when gender = 'F' then 'Female' " +
                       "else 'Unknown' end"))
df3.show(truncate=False)

+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|dob  |gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|James     |           |Smith    |36636|M     |60000 |Male      |
|Michael   |Rose       |         |40288|M     |70000 |Male      |
|Robert    |           |Williams |42114|      |400000|Unknown   |
|Maria     |Anne       |Jones    |39192|F     |500000|Female    |
|Jen       |Mary       |Brown    |     |F     |0     |Female    |
+----------+-----------+---------+-----+------+------+----------+



In [52]:
data2 = [(66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4")]
df5 = spark.createDataFrame(data = data2, schema = ["id", "code", "amt"])
df5.show()         

+---+----+---+
| id|code|amt|
+---+----+---+
| 66|   a|  4|
| 67|   a|  0|
| 70|   b|  4|
| 71|   d|  4|
+---+----+---+



In [53]:
df5.withColumn("new_column", when((col("code") == "a") | (col("code") == "d"), "A")
      .when((col("code") == "b") & (col("amt") == "4"), "B")
      .otherwise("A1")).show()

+---+----+---+----------+
| id|code|amt|new_column|
+---+----+---+----------+
| 66|   a|  4|         A|
| 67|   a|  0|         A|
| 70|   b|  4|         B|
| 71|   d|  4|         A|
+---+----+---+----------+



In [54]:
columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), \
      ("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
      ("Robert,,Williams",["CSharp","VB"],"NV")]

df = spark.createDataFrame(data=data,schema=columns)
  
df.select(df.languagesAtSchool).rdd.flatMap(lambda x : x).collect()
df.select(df.languagesAtSchool).rdd.flatMap(lambda x : x).flatMap(lambda x : x).collect() # multiple flatMaps are used in single dataframe

['Java', 'Scala', 'C++', 'Spark', 'Java', 'C++', 'CSharp', 'VB']

In [55]:
from pyspark.sql.functions import explode
df2 = df.select(df.name, explode(df.languagesAtSchool))

In [56]:
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)



In [57]:
df2.show()

+----------------+------+
|            name|   col|
+----------------+------+
|    James,,Smith|  Java|
|    James,,Smith| Scala|
|    James,,Smith|   C++|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



In [58]:
df.show()

+----------------+------------------+------------+
|            name| languagesAtSchool|currentState|
+----------------+------------------+------------+
|    James,,Smith|[Java, Scala, C++]|          CA|
|   Michael,Rose,|[Spark, Java, C++]|          NJ|
|Robert,,Williams|      [CSharp, VB]|          NV|
+----------------+------------------+------------+



In [59]:
arrayStructureData = [
        (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
        (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
        (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
        (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
        (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
        (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
        ]
        
arrayStructureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('languages', ArrayType(StringType()), True),
         StructField('state', StringType(), True),
         StructField('gender', StringType(), True)
         ])


df = spark.createDataFrame(data = arrayStructureData, schema = arrayStructureSchema)
df.printSchema()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [60]:
df.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [61]:
df.select(explode(df.languages).alias("lang")).show()

+------+
|  lang|
+------+
|  Java|
| Scala|
|   C++|
| Spark|
|  Java|
|   C++|
|CSharp|
|    VB|
|CSharp|
|    VB|
|CSharp|
|    VB|
|Python|
|    VB|
+------+



In [62]:
df.select(explode(df.languages)).filter((df.state == "OH") & (df.languages[0] == "Java")).show()

+-----+
|  col|
+-----+
| Java|
|Scala|
|  C++|
+-----+



In [63]:
df.select(col('*'), explode(df.languages).alias("lang")).show()

+--------------------+------------------+-----+------+------+
|                name|         languages|state|gender|  lang|
+--------------------+------------------+-----+------+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|  Java|
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M| Scala|
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|   C++|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F| Spark|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|  Java|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|   C++|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|CSharp|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|    VB|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|CSharp|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|    VB|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|CSharp|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|    VB|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|Python|
|{Mike, 

In [64]:
df.select(col('*'), explode(df.languages).alias("lang")).filter((df.state == 'OH') & (array_contains(df.languages, 'Java'))).show()

+----------------+------------------+-----+------+-----+
|            name|         languages|state|gender| lang|
+----------------+------------------+-----+------+-----+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M| Java|
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|Scala|
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|  C++|
+----------------+------------------+-----+------+-----+



In [65]:
from pyspark.sql.functions import array_contains
df.filter((df.state == 'OH') & (array_contains(df.languages, 'Java'))).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
+----------------+------------------+-----+------+



In [66]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType

arrayStructureData = [
        (("James","","Smith"),["Java","Scala","C++"],"OH","M",{"home":"City Center", "work": "City Tech Park"}),
        (("Anna","Rose",""),["Spark","Java","C++"],"NY","F",{"home":"Manhattan", "work": "Wall Street"}),
        (("Julia","","Williams"),["CSharp","VB"],"OH","F",{"home":"City Center", "work": "City Tech Park"}),
        (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M",{"home":"Manhattan", "work": "Wall Street"}),
        (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M",{}),
        (("Mike","Mary","Williams"),["Python","VB"],"OH","M",{})
        ]
        
arrayStructureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('languages', ArrayType(StringType()), True),
         StructField('state', StringType(), True),
         StructField('gender', StringType(), True),
    	 StructField('address',MapType(StringType(), StringType()), True)
         ])


df = spark.createDataFrame(data = arrayStructureData, schema = arrayStructureSchema)
df.printSchema()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- address: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [67]:
df.show()

+--------------------+------------------+-----+------+--------------------+
|                name|         languages|state|gender|             address|
+--------------------+------------------+-----+------+--------------------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|{work -> City Tec...|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|{work -> Wall Str...|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|{work -> City Tec...|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|{work -> Wall Str...|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|                  {}|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|                  {}|
+--------------------+------------------+-----+------+--------------------+



In [68]:
df.filter(df.address.home == "City Center").show()

+-------------------+------------------+-----+------+--------------------+
|               name|         languages|state|gender|             address|
+-------------------+------------------+-----+------+--------------------+
|   {James, , Smith}|[Java, Scala, C++]|   OH|     M|{work -> City Tec...|
|{Julia, , Williams}|      [CSharp, VB]|   OH|     F|{work -> City Tec...|
+-------------------+------------------+-----+------+--------------------+



In [69]:
df.filter(df.address['home'] == "City Center").show()

+-------------------+------------------+-----+------+--------------------+
|               name|         languages|state|gender|             address|
+-------------------+------------------+-----+------+--------------------+
|   {James, , Smith}|[Java, Scala, C++]|   OH|     M|{work -> City Tec...|
|{Julia, , Williams}|      [CSharp, VB]|   OH|     F|{work -> City Tec...|
+-------------------+------------------+-----+------+--------------------+



In [70]:
df.filter((df.address['work'] == "Wall Street") & (array_contains(df.languages, 'Java')) & (df.name.firstname == "Anna")).show()

+--------------+------------------+-----+------+--------------------+
|          name|         languages|state|gender|             address|
+--------------+------------------+-----+------+--------------------+
|{Anna, Rose, }|[Spark, Java, C++]|   NY|     F|{work -> Wall Str...|
+--------------+------------------+-----+------+--------------------+



In [71]:
df.filter((df.address['work'] == "Wall Street") & (array_contains(df.languages, 'Java')) & ()).show()

Py4JError: An error occurred while calling o655.and. Trace:
py4j.Py4JException: Method and([class java.util.ArrayList]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)



In [72]:
df.select(concat_ws(" ",df.name.firstname, df.name.middlename, df.name.lastname).alias("name"))

DataFrame[name: string]

In [73]:
df = df.withColumn("name", concat_ws(" ",df.name.firstname, df.name.middlename, df.name.lastname))

In [74]:
df.show()

+------------------+------------------+-----+------+--------------------+
|              name|         languages|state|gender|             address|
+------------------+------------------+-----+------+--------------------+
|      James  Smith|[Java, Scala, C++]|   OH|     M|{work -> City Tec...|
|        Anna Rose |[Spark, Java, C++]|   NY|     F|{work -> Wall Str...|
|   Julia  Williams|      [CSharp, VB]|   OH|     F|{work -> City Tec...|
|  Maria Anne Jones|      [CSharp, VB]|   NY|     M|{work -> Wall Str...|
|    Jen Mary Brown|      [CSharp, VB]|   NY|     M|                  {}|
|Mike Mary Williams|      [Python, VB]|   OH|     M|                  {}|
+------------------+------------------+-----+------+--------------------+



In [75]:
df.filter((df.address['work'] == "Wall Street") & (array_contains(df.languages, 'Java')) & (df.name.like("%Jen%"))).show()

+----+---------+-----+------+-------+
|name|languages|state|gender|address|
+----+---------+-----+------+-------+
+----+---------+-----+------+-------+



In [76]:
from pyspark.sql.functions import to_csv
df.filter((df.address['work'] == "Wall Street") & (array_contains(df.languages, 'Java')) & (to_csv(df.name).like("%Ann%"))).show()

AnalysisException: cannot resolve 'to_csv(name)' due to data type mismatch: argument 1 requires struct type, however, 'name' is of string type.;
'Filter (((address#1079[work] = Wall Street) AND array_contains(languages#1076, Java)) AND to_csv(name#1171, Some(Asia/Calcutta)) LIKE %Ann%)
+- Project [concat_ws( , name#1075.firstname, name#1075.middlename, name#1075.lastname) AS name#1171, languages#1076, state#1077, gender#1078, address#1079]
   +- LogicalRDD [name#1075, languages#1076, state#1077, gender#1078, address#1079], false


In [77]:
df.show()

+------------------+------------------+-----+------+--------------------+
|              name|         languages|state|gender|             address|
+------------------+------------------+-----+------+--------------------+
|      James  Smith|[Java, Scala, C++]|   OH|     M|{work -> City Tec...|
|        Anna Rose |[Spark, Java, C++]|   NY|     F|{work -> Wall Str...|
|   Julia  Williams|      [CSharp, VB]|   OH|     F|{work -> City Tec...|
|  Maria Anne Jones|      [CSharp, VB]|   NY|     M|{work -> Wall Str...|
|    Jen Mary Brown|      [CSharp, VB]|   NY|     M|                  {}|
|Mike Mary Williams|      [Python, VB]|   OH|     M|                  {}|
+------------------+------------------+-----+------+--------------------+



In [78]:
simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]

schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [79]:
from pyspark.sql.functions import *

In [80]:
df.groupby("department").agg(avg("salary").alias("avg_salary")).filter(col("avg_salary") > 80000).show()

+----------+-----------------+
|department|       avg_salary|
+----------+-----------------+
|     Sales|85666.66666666667|
|   Finance|          87750.0|
| Marketing|          85500.0|
+----------+-----------------+



In [81]:
df.select(col('*')).groupby('department', 'state').sum('salary').where(col('sum(salary)') > 80000).show()

+----------+-----+-----------+
|department|state|sum(salary)|
+----------+-----+-----------+
|   Finance|   NY|     162000|
| Marketing|   NY|      91000|
|     Sales|   CA|      81000|
|   Finance|   CA|     189000|
|     Sales|   NY|     176000|
+----------+-----+-----------+



In [82]:
df.distinct().count()

9

In [83]:
df.count()

9

In [84]:
df.distinct().show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



In [85]:
df.distinct().show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



In [86]:
df.select('salary').distinct().count()

8

In [87]:
df.select(count_distinct(df.salary)).show()

+----------------------+
|count(DISTINCT salary)|
+----------------------+
|                     8|
+----------------------+



In [88]:
df.select(approx_count_distinct(df.salary)).show()

+-----------------------------+
|approx_count_distinct(salary)|
+-----------------------------+
|                            8|
+-----------------------------+



In [89]:
df.select(collect_list('salary')).show()

+--------------------+
|collect_list(salary)|
+--------------------+
|[90000, 86000, 81...|
+--------------------+



In [90]:
df.select(collect_set('salary')).show(truncate = False)

+--------------------------------------------------------+
|collect_set(salary)                                     |
+--------------------------------------------------------+
|[79000, 83000, 91000, 99000, 90000, 80000, 86000, 81000]|
+--------------------------------------------------------+



In [91]:
df.select(mean('salary')).show()

+-----------------+
|      avg(salary)|
+-----------------+
|86555.55555555556|
+-----------------+



In [92]:
df.select(percentile_approx('salary',0.5)).show()

+-------------------------------------+
|percentile_approx(salary, 0.5, 10000)|
+-------------------------------------+
|                                86000|
+-------------------------------------+



In [93]:
df.orderBy('salary').show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
+-------------+----------+-----+------+---+-----+



In [94]:
a = df.groupby().avg('salary').rdd.collect()[0][0]
df.select('salary', a).show()
type(lit(a).cast(IntegerType()))

TypeError: Invalid argument, not a string or column: 86555.55555555556 of type <class 'float'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [95]:
df.select(col('*')).groupby('department', 'state').sum('salary').where(col('sum(salary)') > 80000).show()

+----------+-----+-----------+
|department|state|sum(salary)|
+----------+-----+-----------+
|   Finance|   NY|     162000|
| Marketing|   NY|      91000|
|     Sales|   CA|      81000|
|   Finance|   CA|     189000|
|     Sales|   NY|     176000|
+----------+-----+-----------+

