In [2]:
# Importando bibliotecas
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import os
from warnings import filterwarnings
filterwarnings("ignore")

# Criando sessão Spark
spark = (
    SparkSession
    .builder
    .appName("art11")
    .getOrCreate()
)

# Definindo variáveis de diretório
home_path = os.path.expanduser('~')
data_path = os.path.join(home_path, 'dev/panini-tech-lab/data/flights-data/summary-data/csv/2015-summary.csv')

# Definindo schema para o arquivo CSV a ser lido
data_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), nullable=True, metadata={"description": "País de destino dos vôos contabilizados"}),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True, metadata={"description": "País de origem dos vôos contabilizados"}),
    StructField("count", IntegerType(), nullable=True, metadata={"description": "Contagem total de vôos entre os países de origem e de destino do registro"})
])

# Realizando a leitura dos dados
df = (
    spark.read.format("csv")
    .schema(data_schema)
    .option("header", "true")
    .load(data_path)
)

# Criando tabela temporária
df.createOrReplaceTempView("tbl_flights")

# Verificando amostra dos dados
df.printSchema()
df.show(5)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [22]:
# Importando funções
from pyspark.sql.functions import lit, col, expr

# Criando literais em consultas
df_lit = df.select(
    col("*"),
    lit(1).alias("lit_one"),
    expr("2 as lit_two"),
    lit("str").alias("lit_str"),
    expr("'str_expr' AS lit_str_expr")
)

# Verificando schema
df_lit.printSchema()
df_lit.show(5)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)
 |-- lit_one: integer (nullable = false)
 |-- lit_two: integer (nullable = false)
 |-- lit_str: string (nullable = false)
 |-- lit_str_expr: string (nullable = false)

+-----------------+-------------------+-----+-------+-------+-------+------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|lit_one|lit_two|lit_str|lit_str_expr|
+-----------------+-------------------+-----+-------+-------+-------+------------+
|    United States|            Romania|   15|      1|      2|    str|    str_expr|
|    United States|            Croatia|    1|      1|      2|    str|    str_expr|
|    United States|            Ireland|  344|      1|      2|    str|    str_expr|
|            Egypt|      United States|   15|      1|      2|    str|    str_expr|
|    United States|              India|   62|      1|      2|    str|    str_expr|
+-----------------+------

In [35]:
# Criando literais e alterando tipo primitivo
df_casting1 = df.select(
    lit(1).alias("lit_int"),
    lit(1).cast("string").alias("lit_str_cast")
)

# Visualizando schema e amostra
df_casting1.printSchema()
df_casting1.show(5)

root
 |-- lit_int: integer (nullable = false)
 |-- lit_str_cast: boolean (nullable = false)

+-------+------------+
|lit_int|lit_str_cast|
+-------+------------+
|      1|        true|
|      1|        true|
|      1|        true|
|      1|        true|
|      1|        true|
+-------+------------+
only showing top 5 rows



In [31]:
# Coletando uma linha
df_casting1.take(1)

[Row(lit_int=1, lit_str_cast='1')]

In [38]:
# Importando tipos primitivos
from tokenize import Double
from pyspark.sql.types import StringType, DoubleType, BooleanType

# Criando literais e alterando tipo primitivo
df_casting2 = df.select(
    lit(1).alias("int"),
    lit(1).cast(StringType()).alias("str"),
    lit(1).cast(DoubleType()).alias("double"),
    lit(1).cast(BooleanType()).alias("bool_true"),
    lit(0).cast(BooleanType()).alias("bool_false")
)

# Visualizando schema e amostra
df_casting2.printSchema()
df_casting2.show(5)

# Coletando registro no driver
row = df_casting2.take(1)
print(row)

root
 |-- int: integer (nullable = false)
 |-- str: string (nullable = false)
 |-- double: double (nullable = false)
 |-- bool_true: boolean (nullable = false)
 |-- bool_false: boolean (nullable = false)

+---+---+------+---------+----------+
|int|str|double|bool_true|bool_false|
+---+---+------+---------+----------+
|  1|  1|   1.0|     true|     false|
|  1|  1|   1.0|     true|     false|
|  1|  1|   1.0|     true|     false|
|  1|  1|   1.0|     true|     false|
|  1|  1|   1.0|     true|     false|
+---+---+------+---------+----------+
only showing top 5 rows

[Row(int=1, str='1', double=1.0, bool_true=True, bool_false=False)]


In [42]:
# Gerando erros de conversão
df_casting3 = df.select(
    "DEST_COUNTRY_NAME",
    col("DEST_COUNTRY_NAME").cast(DoubleType()).alias("dest_country_double")
)

# Verificando tipo primitivo e amostra
df_casting3.printSchema()
df_casting3.show(5)

# Coletando registro no driver
row = df_casting3.take(1)
print(row)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- dest_country_double: double (nullable = true)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|dest_country_double|
+-----------------+-------------------+
|    United States|               null|
|    United States|               null|
|    United States|               null|
|            Egypt|               null|
|    United States|               null|
+-----------------+-------------------+
only showing top 5 rows

[Row(DEST_COUNTRY_NAME='United States', dest_country_double=None)]


In [50]:
# Convertendo via SparkSQL
df_casting4 = spark.sql("""
    SELECT
        1 AS integer,
        cast(1 AS string) AS string,
        cast(1 AS double) AS double,
        cast(1 AS boolean) AS bool_true,
        cast(1 AS boolean) AS bool_false

    FROM tbl_flights
""")

# Verificando schema e amostra
df_casting4.printSchema()
df_casting4.show(5)

# Coletando amostra para o driver
print(df_casting4.take(1))

root
 |-- integer: integer (nullable = false)
 |-- string: string (nullable = false)
 |-- double: double (nullable = false)
 |-- bool_true: boolean (nullable = false)
 |-- bool_false: boolean (nullable = false)

+-------+------+------+---------+----------+
|integer|string|double|bool_true|bool_false|
+-------+------+------+---------+----------+
|      1|     1|   1.0|     true|      true|
|      1|     1|   1.0|     true|      true|
|      1|     1|   1.0|     true|      true|
|      1|     1|   1.0|     true|      true|
|      1|     1|   1.0|     true|      true|
+-------+------+------+---------+----------+
only showing top 5 rows

[Row(integer=1, string='1', double=1.0, bool_true=True, bool_false=True)]


___

In [14]:
# Importando funções
from pyspark.sql.functions import lit, col, expr, when

# Criando literais
df.select(
    "*",
    lit(1).alias("flag"),
    when((col("count") > 30), lit(1)).otherwise(lit(0)).alias("flag2")
).show(5)

+-----------------+-------------------+-----+----+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|flag|flag2|
+-----------------+-------------------+-----+----+-----+
|    United States|            Romania|   15|   1|    0|
|    United States|            Croatia|    1|   1|    0|
|    United States|            Ireland|  344|   1|    1|
|            Egypt|      United States|   15|   1|    0|
|    United States|              India|   62|   1|    1|
+-----------------+-------------------+-----+----+-----+
only showing top 5 rows

