In [1]:
import os, sys
# Environment for the Hadoop/YARN client and PySpark, applied in one call:
# both config directories point at /etc/hadoop/conf, both PySpark interpreter
# variables select python3.9, and the Hadoop user name is fixed for this run.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
# Assemble the Spark configuration one setting at a time: application name,
# YARN as the master, and adaptive query execution switched off.
conf = SparkConf()
conf.setAppName('Casting_Types')
conf.setMaster('yarn')
conf.set('spark.sql.adaptive.enabled', False)

# Obtain the session for this notebook (an already-active one is reused).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
# ANSI mode has not been switched on yet (it is disabled by default here), so
# casting a non-numeric string to int produces NULL rather than an error.
ansi_flag = spark.conf.get("spark.sql.ansi.enabled")
print("spark.sql.ansi.enabled =", ansi_flag)

df = spark.sql(" select cast('abc' as int) as casted_value ")
df.show()

spark.sql.ansi.enabled = false


[Stage 0:>                                                          (0 + 1) / 1]

+------------+
|casted_value|
+------------+
|        null|
+------------+



                                                                                

In [4]:
from pyspark.errors import NumberFormatException

# Switch ANSI mode on: a malformed cast now raises instead of returning NULL.
spark.conf.set("spark.sql.ansi.enabled", True)

# The failure itself is what this cell demonstrates, so it is caught and
# printed; an uncaught exception would abort "Restart Kernel -> Run All".
try:
    df = spark.sql(" select cast('abc' as int) as value ")
    df.show()
except NumberFormatException as exc:
    print(f"{type(exc).__name__}: {exc}")

NumberFormatException: [CAST_INVALID_INPUT] The value 'abc' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error.
== SQL(line 1, position 9) ==
 select cast('abc' as int) as value 
        ^^^^^^^^^^^^^^^^^^


In [5]:
# Make sure ANSI mode is enabled, then read the setting back to confirm it.
spark.conf.set("spark.sql.ansi.enabled", True)
ansi_flag = spark.conf.get("spark.sql.ansi.enabled")
print("spark.sql.ansi.enabled =", ansi_flag)

# A well-formed numeric string casts to int without error under ANSI mode.
df = spark.sql(" select cast('123' as int) as casted_value ")
df.show()

spark.sql.ansi.enabled = true


[Stage 1:>                                                          (0 + 1) / 1]

+------------+
|casted_value|
+------------+
|         123|
+------------+



                                                                                

In [6]:
# Disabled example: every line below is commented out, so this cell runs nothing.
# It sketches building a small (name, age) DataFrame and querying it via
# spark.sql() by passing it as the `df` keyword argument, referenced as {df}
# inside the SQL text.
# NOTE(review): either enable this example or remove the cell - confirm intent.
# data = [('Anna', 24), ('Robert', 22), ('Lily', 19)]
# mydf = spark.createDataFrame(data, "name: string, age: int")

# spark.sql(" select * from {df} ", df=mydf).show()

In [7]:
# Three inline rows with an integer column (NULL in the first row), a numeric
# string column and a date-formatted string column; used as input for the
# casting examples that follow.
sql = """ select f_int, f_str, f_str_date
        from values
          (null,    '0', '2025-03-01'), 
          (11,     '11', '2025-03-02'), 
          (22,     '22', '2025-03-03') 
       as (f_int, f_str,   f_str_date) """

# Register the rows as temp view `tbl`, then show the schema Spark inferred.
inline_rows = spark.sql(sql)
inline_rows.createOrReplaceTempView('tbl')
spark.table('tbl').printSchema()

root
 |-- f_int: integer (nullable = true)
 |-- f_str: string (nullable = false)
 |-- f_str_date: string (nullable = false)



In [8]:
 sql = """ select
             cast(f_int as boolean)  f_int_to_boolean,
             cast(f_int as string)   f_int_to_string,
             cast(f_str as short)    f_str_to_short,
             cast(f_str as byte)     f_str_to_byte,
             cast(f_str as int)      f_str_to_int,
             cast(f_str as integer)  f_str_to_integer,
             cast(f_str as bigint)   f_str_to_bigint,
             cast(f_str as long)     f_str_to_long,
             cast(f_str as decimal)  f_str_to_decimal,
             cast(f_str as float)    f_str_to_float,
             cast(f_str as double)   f_str_to_double,            
             cast(f_str_date as date) f_sdate_to_date
        from tbl """ 
df = spark.sql(sql)
for col in df.dtypes:
    print(f"{col[0]:<20} {col[1]}")

f_int_to_boolean     boolean
f_int_to_string      string
f_str_to_short       smallint
f_str_to_byte        tinyint
f_str_to_int         int
f_str_to_integer     int
f_str_to_bigint      bigint
f_str_to_long        bigint
f_str_to_decimal     decimal(10,0)
f_str_to_float       float
f_str_to_double      double
f_sdate_to_date      date


In [9]:
 sql = """ select
             boolean(f_int)  f_int_to_boolean,
             string(f_int)   f_int_to_string,
             --short(f_str)    f_str_to_short,
             --byte(f_str)     f_str_to_byte,
             int(f_str)      f_str_to_int,
             --integer(f_str)  f_str_to_integer,
             bigint(f_str)   f_str_to_bigint,
             --long(f_str)     f_str_to_long,
             decimal(f_str)  f_str_to_decimal,
             float(f_str)    f_str_to_float,
             double(f_str)   f_str_to_double,            
             date(f_str_date) f_sdate_to_date
        from tbl """ 
df = spark.sql(sql)
for col in df.dtypes:
    print(f"{col[0]:<20} {col[1]}")

f_int_to_boolean     boolean
f_int_to_string      string
f_str_to_int         int
f_str_to_bigint      bigint
f_str_to_decimal     decimal(10,0)
f_str_to_float       float
f_str_to_double      double
f_sdate_to_date      date


In [10]:
# The same casts expressed as SQL snippets passed to DataFrame.selectExpr;
# both the CAST(... AS ...) form and the function-style form are accepted.
df = spark.table('tbl').selectExpr(
    " cast(f_int as boolean) as f_int_to_boolean ",
    " cast(f_int as string)  as f_int_to_string ",
    " int(f_str)             as f_str_to_int ",
    " date(f_str_date)       as f_sdate_to_date ",
)

# Print each result column next to its resulting Spark type. The pair is
# unpacked into `name, dtype` so the loop never rebinds `col`, which this
# notebook also imports from pyspark.sql.functions.
for name, dtype in df.dtypes:
    print(f"{name:<20} {dtype}")

f_int_to_boolean     boolean
f_int_to_string      string
f_str_to_int         int
f_sdate_to_date      date


In [11]:
from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType, StringType, IntegerType, DateType

# Casts through the Column API, shown two equivalent ways: first with DataType
# objects, then with type-name strings.
# Both groups reuse the same aliases, so the result has duplicate column names.
# That is fine for listing dtypes below, but selecting one of these columns by
# name would be ambiguous.
df = spark.table('tbl').select(
    # DataType objects
    col('f_int').cast(BooleanType()).alias('f_int_to_boolean'),
    col('f_int').cast(StringType()).alias('f_int_to_string'),
    col('f_str').cast(IntegerType()).alias('f_str_to_int'),
    col('f_str_date').cast(DateType()).alias('f_sdate_to_date'),

    # type-name strings
    col('f_int').cast('boolean').alias('f_int_to_boolean'),
    col('f_int').cast('string').alias('f_int_to_string'),
    col('f_str').cast('int').alias('f_str_to_int'),
    col('f_str_date').cast('date').alias('f_sdate_to_date'),
)

# Unpack into `name, dtype` rather than looping with `col`: rebinding `col`
# would replace the imported pyspark.sql.functions.col with a tuple and break
# any later col(...) call.
for name, dtype in df.dtypes:
    print(f"{name:<16} {dtype}")

f_int_to_boolean boolean
f_int_to_string  string
f_str_to_int     int
f_sdate_to_date  date
f_int_to_boolean boolean
f_int_to_string  string
f_str_to_int     int
f_sdate_to_date  date


In [12]:
# Shut down the SparkSession used throughout this notebook.
spark.stop()