In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col

# Build (or reuse, if one already exists) a local SparkSession running on a
# single worker thread, registered under the application name below.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [2]:
import findspark
# NOTE(review): this cell runs after the first cell has already imported
# pyspark and created the SparkSession. findspark.init() is normally called
# before importing pyspark — confirm whether this call is still needed here
# or should move to the top of the notebook.
findspark.init()

In [3]:
import pyspark

In [4]:
# Load the absenteeism CSV: the first row is treated as the header and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("Absenteeism_new_data.csv")
)

# [PySpark When Otherwise | SQL Case When Usage](https://sparkbyexamples.com/pyspark/pyspark-when-otherwise/)

In [5]:
from pyspark.sql.functions import *

In [14]:
# Add a "new" column that labels each row by its pet count using a chained
# when()/otherwise() (the DataFrame equivalent of SQL CASE WHEN).
# The fallback branch is cast to string so that every branch yields the same
# type; mixing string labels with the integer Pets column relies on implicit
# coercion, which can fail under ANSI SQL mode.
(
    df.withColumn(
        "new",
        when(df.Pets == 0, "nopets")
        .when(df.Pets == 1, "only one pet")
        .when(df.Pets > 1, "many pets")
        .otherwise(df.Pets.cast("string")),
    )
    .select(["pets", "new"])
    .show(10)
)

# Same labelling, but built directly inside select() and exposed under the
# alias "new_pets". The fallback branch is cast to string so all branches of
# the conditional share one type instead of depending on implicit
# string/integer coercion.
(
    df.select(
        df.Pets,
        when(df.Pets == 0, "nopets")
        .when(df.Pets == 1, "only one pet")
        .when(df.Pets > 1, "many pets")
        .otherwise(df.Pets.cast("string"))
        .alias("new_pets"),
    ).show(10)
)


# Using the SQL expr() function: the labelling is written as a SQL CASE
# expression. Each fragment ends with a space so the concatenated string has
# proper token separation, and the ELSE branch is cast to STRING so every
# branch of the CASE returns the same type.
df.withColumn(
    "new_pet",
    expr(
        "CASE WHEN Pets = 0 THEN 'no_pets' "
        "WHEN Pets = 1 THEN 'One_pet' "
        "WHEN Pets > 1 THEN 'More_pets' "
        "ELSE CAST(Pets AS STRING) END"
    ),
).select(["Pets", "new_pet"]).show(5)

# Using the SQL expr() function inside select(): the CASE expression is
# aliased to "new_pets" and selected next to the original Pets column.
# Fragments are space-separated so the concatenated SQL tokenizes cleanly,
# the ELSE branch is cast to STRING so all branches share one type, and the
# redundant second select() of the same two columns is dropped.
df.select(
    col("Pets"),
    expr(
        "CASE WHEN Pets = 0 THEN 'no_pets' "
        "WHEN Pets = 1 THEN 'One_pet' "
        "WHEN Pets > 1 THEN 'More_pets' "
        "ELSE CAST(Pets AS STRING) END"
    ).alias("new_pets"),
).show(5)

# Using a SQL query against a temporary view: register the DataFrame as
# "EMP" and run the same CASE labelling through spark.sql().
# Every fragment ends with a space so keywords are not glued to the
# preceding string literal, and the ELSE branch is cast to STRING so the
# CASE result has a single, explicit type.
df.createOrReplaceTempView("EMP")
spark.sql(
    "SELECT Pets, "
    "CASE WHEN Pets = 0 THEN 'No_pets' "
    "WHEN Pets = 1 THEN 'One_pet' "
    "WHEN Pets > 1 THEN 'more_pets' "
    "ELSE CAST(Pets AS STRING) END AS new_pets "
    "FROM EMP"
).show(10)


+----+---------+
|pets|      new|
+----+---------+
|   0|   nopets|
|   4|many pets|
|   0|   nopets|
|   0|   nopets|
|   0|   nopets|
|   2|many pets|
|   0|   nopets|
|   0|   nopets|
|   0|   nopets|
|   0|   nopets|
+----+---------+
only showing top 10 rows

+----+---------+
|Pets| new_pets|
+----+---------+
|   0|   nopets|
|   4|many pets|
|   0|   nopets|
|   0|   nopets|
|   0|   nopets|
|   2|many pets|
|   0|   nopets|
|   0|   nopets|
|   0|   nopets|
|   0|   nopets|
+----+---------+
only showing top 10 rows

+----+---------+
|Pets|  new_pet|
+----+---------+
|   0|  no_pets|
|   4|More_pets|
|   0|  no_pets|
|   0|  no_pets|
|   0|  no_pets|
+----+---------+
only showing top 5 rows

+----+---------+
|Pets| new_pets|
+----+---------+
|   0|  no_pets|
|   4|More_pets|
|   0|  no_pets|
|   0|  no_pets|
|   0|  no_pets|
+----+---------+
only showing top 5 rows

+----+---------+
|Pets| new_pets|
+----+---------+
|   0|  No_pets|
|   4|more_pets|
|   0|  No_pets|
|   0|  No_pet

# [PySpark SQL expr() (Expression ) Function](https://sparkbyexamples.com/pyspark/pyspark-sql-expr-expression-function/)

`expr()` is a SQL function that executes SQL-like expressions.

It also lets you use an existing DataFrame column value as an expression argument to PySpark built-in functions.

In [15]:
# given in word document 

In [21]:
# Join the Pets and Education values into one comma-separated column using
# the SQL || concatenation operator inside expr().
(
    df.withColumn("new_col", expr("Pets||','||Education"))
    .select("pets", "Education", "new_col")
    .show(5)
)

#using sql when - refer window functions

#adding increment to date function

#df.select(df.date,df.increment,
 #    expr("add_months(date,increment)")
  #.alias("inc_date")).show()

#arithmetic func

#df.select("Pets",expr("Education+1 as new_edu")).show(10)




+----+---------+-------+
|pets|Education|new_col|
+----+---------+-------+
|   0|        3|    0,3|
|   4|        1|    4,1|
|   0|        1|    0,1|
|   0|        2|    0,2|
|   0|        1|    0,1|
+----+---------+-------+
only showing top 5 rows



# [PySpark lit() – Add Literal or Constant to DataFrame](https://sparkbyexamples.com/pyspark/pyspark-lit-add-literal-constant/)

`lit()` is used to add a new column to a DataFrame by assigning a literal or constant value.


In [23]:
# Attach the same constant value to every row: lit() wraps the Python
# string as a literal column, exposed here under the name "city".
(
    df.select(
        "Pets",
        "Education",
        lit("newyork").alias("city"),
    ).show(10)
)

+----+---------+-------+
|Pets|Education|   city|
+----+---------+-------+
|   0|        3|newyork|
|   4|        1|newyork|
|   0|        1|newyork|
|   0|        2|newyork|
|   0|        1|newyork|
|   2|        1|newyork|
|   0|        1|newyork|
|   0|        3|newyork|
|   0|        1|newyork|
|   0|        1|newyork|
+----+---------+-------+
only showing top 10 rows



In [27]:
# Combine lit() with when()/otherwise(): rows with fewer than three pets get
# the literal "less_pets", all other rows get "More_pets".
(
    df.withColumn(
        "li_value",
        when(df.Pets < 3, lit("less_pets")).otherwise(lit("More_pets")),
    )
    .select("Pets", "li_value")
    .show(5)
)



+----+---------+
|Pets| li_value|
+----+---------+
|   0|less_pets|
|   4|More_pets|
|   0|less_pets|
|   0|less_pets|
|   0|less_pets|
+----+---------+
only showing top 5 rows



# [ Convert String to Array Column](https://sparkbyexamples.com/pyspark/pyspark-convert-string-to-array-column/)

In [28]:
# df2 = df.select(split(col("name"),",").alias("NameArray")) \
#     .drop("name")
# df2.printSchema()
# df2.show()

# df.createOrReplaceTempView("PERSON")
# spark.sql("select SPLIT(name,',') as NameArray from PERSON") \
#     .show()

# splits at ,


# [Concat_ws](https://sparkbyexamples.com/pyspark/pyspark-convert-array-column-to-string-column/)

In [29]:
# Convert an array column to a string

# [java, c, python] --> java,c,python

# concat_ws(separator, column): joins the array elements using the separator (e.g. ",")

# The same can also be done using SQL functions

In [31]:
# from pyspark.sql.functions import col, concat_ws
# df2 = df.withColumn("languagesAtSchool",
#    concat_ws(",",col("languagesAtSchool")))


# df.createOrReplaceTempView("ARRAY_STRING")
# spark.sql("select name, concat_ws(',',languagesAtSchool) as languagesAtSchool," + \
#     " currentState from ARRAY_STRING") \
#     .show(truncate=False)


# [Get substring() from a column](https://sparkbyexamples.com/pyspark/pyspark-substring-from-a-column/)

In [32]:
#  extract the substring from a DataFrame string column
# by providing the 
# position and length of the string 
# substring(str, pos, len)


# df.select('date', substring('date', 1,4).alias('year'), \
#                   substring('date', 5,2).alias('month'), \
#                   substring('date', 7,2).alias('day')) 

In [None]:

#"Using with selectExpr"
# df2=df.selectExpr('date', 'substring(date, 1,4) as year', \
#                   'substring(date, 5,2) as month', \
#                   'substring(date, 7,2) as day')

# #Using substr from Column type
# df3=df.withColumn('year', col('date').substr(1, 4))\
#   .withColumn('month',col('date').substr(5, 2))\
#   .withColumn('day', col('date').substr(7, 2))

# [Translate](https://sparkbyexamples.com/pyspark/pyspark-replace-column-values/#translate-replace-character-by-character)

In [None]:
# PySpark: replace column values in a DataFrame.
# The examples below are kept commented out; the only statements that run in
# this cell are the print() calls, which emit separator lines between the
# three example sections.

# Example 1: regexp_replace

# df.withColumn('address', regexp_replace('address', 'Rd', 'Road')) \
#   .show(truncate=False)

print("*****************************************************************************************")

# Example 2: replace values using a Python dictionary lookup

# stateDic={'CA':'California','NY':'New York','DE':'Delaware'}
# df2=df.rdd.map(lambda x: 
#     (x.id,x.address,stateDic[x.state]) 
#     ).toDF(["id","address","state"])
# df2.show()

print("********************************************************************************************")

# Example 3: translate, which replaces character by character

# from pyspark.sql.functions import translate
# df.withColumn('address', translate('address', '123', 'ABC')) \
#   .show(truncate=False)

print('********************************************************************************************')

