In [8]:
# Importando bibliotecas
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import os

# Criando objeto de sessão
spark = (
    SparkSession
    .builder
    .appName("art10-colunas-expressoes")
    .getOrCreate()
)

# Definindo variáveis de diretório
home_path = os.path.expanduser('~')
data_path = os.path.join(home_path, 'dev/panini-tech-lab/data/flights-data/summary-data/csv/2015-summary.csv')

# Definindo schema para o arquivo CSV a ser lido
data_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), nullable=True, metadata={"description": "País de destino dos vôos contabilizados"}),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True, metadata={"description": "País de origem dos vôos contabilizados"}),
    StructField("count", IntegerType(), nullable=True, metadata={"description": "Contagem total de vôos entre os países de origem e de destino do registro"})
])

# Realizando a leitura dos dados
df = (
    spark.read.format("csv")
    .schema(data_schema)
    .option("header", "true")
    .load(data_path)
)

# Verificando amostra dos dados
df.printSchema()
df.show(5)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



## Registros (Row)

In [9]:
# Visualizando uma linha do DataFrame
row = df.first()

In [10]:
# Visualizando objeto 
print(row)
print(type(row))

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)
<class 'pyspark.sql.types.Row'>


In [11]:
# Extraindo informações de um registro
print(f'País de origem: {row.ORIGIN_COUNTRY_NAME}')
print(f'País de destino: {row.DEST_COUNTRY_NAME}')
print(f'Contagem de vôos: {row[-1]}')

País de origem: Romania
País de destino: United States
Contagem de vôos: 15


In [18]:
# Coletando as primeiras linhas de um DataFrame
n_rows = df.take(3)
print(f'Três primeiros registros: \n{n_rows}')

# Coletando todas as linhas de um DataFrame
all_rows = df.collect()
print(f'\nAlguns registros específicos: \n{all_rows[5:10]}')

Três primeiros registros: 
[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15), Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1), Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

Alguns registros específicos: 
[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1), Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count=62), Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count=588), Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count=40), Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]


In [24]:
# Retornando coluna de um DataFrame
print(df.DEST_COUNTRY_NAME)
print(df["DEST_COUNTRY_NAME"])

# Visualizando tipo primitivo
print(type(df["DEST_COUNTRY_NAME"]))

Column<'DEST_COUNTRY_NAME'>
Column<'DEST_COUNTRY_NAME'>
<class 'pyspark.sql.column.Column'>


In [29]:
# Visualizando colunas
print(f'Lista de colunas do DataFrame: {df.columns}')

# O que ocorre quando referenciamos colunas?
print(f'\nIndexando colunas: {df["DEST_COUNTRY_NAME"]}')

# Tipo primitivo
print(f'\nTipo primitivo da coluna: {type(df["DEST_COUNTRY_NAME"])}')

Lista de colunas do DatFrame: ['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

Indexando colunas: Column<'DEST_COUNTRY_NAME'>

Tipo primitivo da coluna: <class 'pyspark.sql.column.Column'>


In [30]:
# Importando funções de referenciamento de colunas
from pyspark.sql.functions import col, column, expr

# Construindo expressões via col
print(col("valor_A") + col("valor_B"))

# Construindo expressões via column
print(column("valor_A") + col("valor_B"))

# Construindo expressões via expr
print(expr("valor_A + valor_B"))

Column<'(valor_A + valor_B)'>
Column<'(valor_A + valor_B)'>
Column<'(valor_A + valor_B)'>


In [33]:
# Criando expressão
(((col("some_col") + 5) * 200) - 6) < col("other_col")

# Modo alternativo
expr("(((some_col + 5) * 200) * 6) < other_col")

Column<'((((some_col + 5) * 200) * 6) < other_col)'>

In [34]:
# Criando expressão
(((col("some_col") + 5) * 200) - 6) < col("other_col")

Column<'((((some_col + 5) * 200) - 6) < other_col)'>

In [35]:
# Selecionando coluna
df.select("DEST_COUNTRY_NAME").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [40]:
# Diferentes formas aplicar uma consulta
df.select(
    "DEST_COUNTRY_NAME",
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME"),
    expr("DEST_COUNTRY_NAME")
).show(2)

+-----------------+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|    United States|
|    United States|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+-----------------+
only showing top 2 rows



In [47]:
# Exemplificando operações
df.select(
    "count",
    col("count") * 2,
    expr("count * 2")
).show(2)

+-----+-----------+-----------+
|count|(count * 2)|(count * 2)|
+-----+-----------+-----------+
|   15|         30|         30|
|    1|          2|          2|
+-----+-----------+-----------+
only showing top 2 rows



In [49]:
# Adicionando alias às expressões
df.select(
    col("ORIGIN_COUNTRY_NAME").alias("pais_origem"),
    col("DEST_COUNTRY_NAME").alias("pais_destino"),
    col("count").alias("qtd_voos")
).show(5)

+-------------+-------------+--------+
|  pais_origem| pais_destino|qtd_voos|
+-------------+-------------+--------+
|      Romania|United States|      15|
|      Croatia|United States|       1|
|      Ireland|United States|     344|
|United States|        Egypt|      15|
|        India|United States|      62|
+-------------+-------------+--------+
only showing top 5 rows



In [50]:
# Criando consulta com nomes modificados
df.select(
    expr("ORIGIN_COUNTRY_NAME AS pais_origem_expr"),
    expr("DEST_COUNTRY_NAME AS pais_destino_expr"),
    expr("count AS qtd_voos_expr")
).show(5)

+----------------+-----------------+-------------+
|pais_origem_expr|pais_destino_expr|qtd_voos_expr|
+----------------+-----------------+-------------+
|         Romania|    United States|           15|
|         Croatia|    United States|            1|
|         Ireland|    United States|          344|
|   United States|            Egypt|           15|
|           India|    United States|           62|
+----------------+-----------------+-------------+
only showing top 5 rows



In [60]:
# Criando consulta com nomes modificados
df.select(
    expr("concat(ORIGIN_COUNTRY_NAME, ' > ', DEST_COUNTRY_NAME) AS origem_para_destino"),
    expr("count AS qtd_voos"),
    expr("count * 2 AS qtd_voos_dobro")
).show(5, truncate=False)

+-----------------------+--------+--------------+
|origem_para_destino    |qtd_voos|qtd_voos_dobro|
+-----------------------+--------+--------------+
|Romania > United States|15      |30            |
|Croatia > United States|1       |2             |
|Ireland > United States|344     |688           |
|United States > Egypt  |15      |30            |
|India > United States  |62      |124           |
+-----------------------+--------+--------------+
only showing top 5 rows



In [61]:
# Reconstruindo consulta anterior com selectExpr()
df.selectExpr(
    "concat(ORIGIN_COUNTRY_NAME, ' > ', DEST_COUNTRY_NAME) AS origem_para_destino",
    "count AS qtd_voos",
    "count * 2 AS qtd_voos_dobro"
).show(5, truncate=False)

+-----------------------+--------+--------------+
|origem_para_destino    |qtd_voos|qtd_voos_dobro|
+-----------------------+--------+--------------+
|Romania > United States|15      |30            |
|Croatia > United States|1       |2             |
|Ireland > United States|344     |688           |
|United States > Egypt  |15      |30            |
|India > United States  |62      |124           |
+-----------------------+--------+--------------+
only showing top 5 rows



In [63]:
# Mais um exemplo de consulta
df.selectExpr(
    "*",
    "upper(DEST_COUNTRY_NAME) AS upper_dest",
    "power(count, 2) AS power2_count"
).show(5)

+-----------------+-------------------+-----+-------------+------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|   upper_dest|power2_count|
+-----------------+-------------------+-----+-------------+------------+
|    United States|            Romania|   15|UNITED STATES|       225.0|
|    United States|            Croatia|    1|UNITED STATES|         1.0|
|    United States|            Ireland|  344|UNITED STATES|    118336.0|
|            Egypt|      United States|   15|        EGYPT|       225.0|
|    United States|              India|   62|UNITED STATES|      3844.0|
+-----------------+-------------------+-----+-------------+------------+
only showing top 5 rows



In [62]:
# Consultando coluna inexistente
df.selectExpr(
    "ERROR_DEST_COUNTRY_NAME"
).show(5)

AnalysisException: Column 'ERROR_DEST_COUNTRY_NAME' does not exist. Did you mean one of the following? [DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count]; line 1 pos 0;
'Project ['ERROR_DEST_COUNTRY_NAME]
+- Relation [DEST_COUNTRY_NAME#32,ORIGIN_COUNTRY_NAME#33,count#34] csv


In [48]:
col("count")aaaaaaa

Column<'count'>

In [28]:
# Importando objeto do tipo Row
from pyspark.sql import Row

# Criando linhas de um DataFrame
my_rows = [
    Row("Brazil", "Brazil", 100),
    Row("Brazil", "Argentina", 50)
]

# Criando um DataFrame
my_df = spark.createDataFrame(my_rows, data_schema)
my_df.show()

                                                                                

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|           Brazil|             Brazil|  100|
|           Brazil|          Argentina|   50|
+-----------------+-------------------+-----+

