**_Objetivo:_** Neste notebook, serão consolidados códigos para explorações práticas envolvendo o conteúdo presente no capítulo 5 do livro Spark - The Definitive Guide: Basic Structured Operations. No cenário proposto, exemplificações sobre a utilização de DataFrames e transformações serão fornecidas de modo a garantir uma maior experiência prática na leitura e no tratamento de dados utilizando Spark.

In [1]:
# Import libraries
import os
from pyspark.sql import SparkSession

# Directory settings.
# The dataset root can be overridden with the SPARK_GUIDE_DATA_PATH environment
# variable, so the notebook also runs on machines other than the author's; when
# the variable is not set, the original local path is used as before.
GITHUB_PATH = os.environ.get(
    'SPARK_GUIDE_DATA_PATH',
    r'D:\Users\thiagoPanini\OneDrive\Desenvolvimento\estudos\big data\spark\livros\spark-the-definitive-guide\book-github-resources\Spark-The-Definitive-Guide-master\data'
)

# Relative location of the flight-data JSON file, built from its components so
# that the separator matches the operating system running the notebook
FILE_PATH = os.path.join('flight-data', 'json', '2015-summary.json')
DATA_PATH = os.path.join(GITHUB_PATH, FILE_PATH)

# Start (or reuse) the Spark session; the bare name displays it as cell output
spark = SparkSession.builder.getOrCreate()
spark

In [2]:
# Load the JSON file into a DataFrame (no explicit schema is supplied here)
df = spark.read.json(DATA_PATH)

# Print the resulting schema as a tree
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



# Schema

In [3]:
# The `schema` attribute exposes the DataFrame's schema object
# (shown as the cell output because it is the last expression)
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [4]:
# Inspect the individual field definitions held by the schema
df.schema.fields

[StructField(DEST_COUNTRY_NAME,StringType,true),
 StructField(ORIGIN_COUNTRY_NAME,StringType,true),
 StructField(count,LongType,true)]

In [5]:
# Spark types needed to describe a schema by hand
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Manually declared schema: two string columns plus a long "count" column
# declared as non-nullable and carrying custom metadata
DF_SCHEMA = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
    StructField("count", LongType(), nullable=False, metadata={"hello": "world"}),
])

# Read the same JSON file again, this time enforcing the explicit schema
df = (
    spark.read.format("json")
    .schema(DF_SCHEMA)
    .load(DATA_PATH)
)

# Colunas e Expressões

In [6]:
# Import the column-reference helpers
from pyspark.sql.functions import col, column

# Two ways of referring to a column by name. Only the value of the last
# expression of the cell is displayed, so just the `column(...)` result is shown
col('algum_nome_de_coluna')
column('algum_nome_de_coluna')

Column<'algum_nome_de_coluna'>

In [7]:
# Import functions
from pyspark.sql.functions import col, expr

# Transformation built from `col` and Python operators (its value is not
# displayed, since it is not the last expression of the cell)
(((col("column_A") + 5) * 200) - 6) < col("column_B")

# The same transformation written as a string expression via `expr`
expr("(((column_A + 5) * 200) - 6) < column_B")

Column<'((((column_A + 5) * 200) - 6) < column_B)'>

In [8]:
# Fetch the first row a single time: every `df.first()` call is a separate
# Spark action, so reusing the result avoids re-running the job for each print
first_row = df.first()
print(first_row)

# Access the row values by position (first, second and last field)
print()
print(f'Dest: {first_row[0]}')
print(f'Orig: {first_row[1]}')
print(f'Count: {first_row[-1]}')

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

Dest: United States
Orig: Romania
Count: 15


In [9]:
# Creating a row manually
from pyspark.sql import Row

# Values are passed positionally; no field names are given to this Row
row = Row('Brasil', 'United States', 10)

# Transformações em DataFrames

In [10]:
# Read the dataset again (JSON source) and preview its first 5 rows
df = spark.read.load(DATA_PATH, format='json')
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [11]:
# Modules needed to build a DataFrame by hand
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Hand-written schema: two nullable string columns and one non-nullable long
MY_SCHEMA = StructType([
    StructField("col_1", StringType(), nullable=True),
    StructField("col_2", StringType(), nullable=True),
    StructField("col_3", LongType(), nullable=False),
])

# Single hand-written record whose values follow the schema order
MY_ROW = Row("Hello", "World", 1)

# Build a one-row DataFrame from the record and the schema, then display it
df = spark.createDataFrame(data=[MY_ROW], schema=MY_SCHEMA)
df.show()

+-----+-----+-----+
|col_1|col_2|col_3|
+-----+-----+-----+
|Hello|World|    1|
+-----+-----+-----+



## select e selectExpr

In [12]:
# Reload the flight dataset from the JSON file
df = (
    spark.read
    .format('json')
    .load(DATA_PATH)
)

# Preview the first 5 rows
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [13]:
# Select a single column and show its first 2 rows
df.select('DEST_COUNTRY_NAME').show(2)

# Analogous to:
# SELECT DEST_COUNTRY_NAME FROM df LIMIT 2

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [14]:
# Select multiple columns by name and show the first 2 rows
df.select('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME').show(2)

# Analogous to:
# SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME FROM df LIMIT 2

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



In [15]:
# Import functions
from pyspark.sql.functions import col, column, expr

# Reference the same column in three different ways (col, column and expr);
# the output therefore shows the column three times
df.select(
    col('DEST_COUNTRY_NAME'),
    column('DEST_COUNTRY_NAME'),
    expr('DEST_COUNTRY_NAME')
).show(2)

+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|
|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows



In [16]:
# Rename a column with a SQL-style "AS" alias inside expr()
df.select(expr('DEST_COUNTRY_NAME AS destination')).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [17]:
# Rename a column with col() plus the Column.alias() method
df.select(col('DEST_COUNTRY_NAME').alias('destination')).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [18]:
# Select (and rename) several columns by combining select with expr
(
    df
    .select(
        expr('DEST_COUNTRY_NAME AS destination'),
        expr('ORIGIN_COUNTRY_NAME AS origin'),
    )
    .show(2)
)

+-------------+-------+
|  destination| origin|
+-------------+-------+
|United States|Romania|
|United States|Croatia|
+-------------+-------+
only showing top 2 rows



In [19]:
# Select (and rename) several columns with selectExpr, which takes the
# SQL expressions directly as strings
df.selectExpr(
    'DEST_COUNTRY_NAME AS destination',
    'ORIGIN_COUNTRY_NAME AS origin',
).show(2)

+-------------+-------+
|  destination| origin|
+-------------+-------+
|United States|Romania|
|United States|Croatia|
+-------------+-------+
only showing top 2 rows



In [20]:
# Keep every existing column ("*") and add a boolean column telling whether
# destination and origin are the same country
(
    df
    .selectExpr("*", "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS within_country")
    .show(2)
)

# SQL equivalent:
# SELECT *, (DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS within_country
# FROM df LIMIT 2

+-----------------+-------------------+-----+--------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|within_country|
+-----------------+-------------------+-----+--------------+
|    United States|            Romania|   15|         false|
|    United States|            Croatia|    1|         false|
+-----------------+-------------------+-----+--------------+
only showing top 2 rows



In [21]:
# Aggregates over the whole DataFrame (no GROUP BY): a count over
# DEST_COUNTRY_NAME and the sum of the "count" column
df.selectExpr(
    "count(DEST_COUNTRY_NAME) AS total_rows",
    "sum(count) AS sum_count",
).show()

+----------+---------+
|total_rows|sum_count|
+----------+---------+
|       256|   453316|
+----------+---------+



## Literais

In [22]:
# Import function
from pyspark.sql.functions import lit

# Add a literal (constant) column named "One" next to all existing columns (select)
df.select(expr("*"), lit(1).alias("One")).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [23]:
# Add the same literal column, written as a SQL expression (selectExpr)
df.selectExpr("*", "1 AS One").show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



## Adição de Colunas

In [24]:
# Add a literal column with withColumn(name, expression)
df.withColumn("One", lit(1)).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [25]:
# Add a boolean flag column: whether destination and origin are the same country
df.withColumn("within_country", expr("(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME)")).show(2)

+-----------------+-------------------+-----+--------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|within_country|
+-----------------+-------------------+-----+--------------+
|    United States|            Romania|   15|         false|
|    United States|            Croatia|    1|         false|
+-----------------+-------------------+-----+--------------+
only showing top 2 rows



## Renomeando Colunas

In [26]:
# Rename a column: (existing name, new name)
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").show(2)

+-------------+-------------------+-----+
|         dest|ORIGIN_COUNTRY_NAME|count|
+-------------+-------------------+-----+
|United States|            Romania|   15|
|United States|            Croatia|    1|
+-------------+-------------------+-----+
only showing top 2 rows



## Removendo Colunas

In [27]:
# Drop a single column
df.drop("DEST_COUNTRY_NAME").show(2)

+-------------------+-----+
|ORIGIN_COUNTRY_NAME|count|
+-------------------+-----+
|            Romania|   15|
|            Croatia|    1|
+-------------------+-----+
only showing top 2 rows



In [28]:
# Drop multiple columns in one call
df.drop("ORIGIN_COUNTRY_NAME", "count").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



## Casting

In [29]:
# Derive a new column holding "count" cast to int (the original column is kept)
df_cast = df.withColumn("count_2", col("count").cast("int"))

# Compare the schema before and after adding the cast column
print('Schema original:')
df.printSchema()
print('Schema após casting:')
df_cast.printSchema()

Schema original:
root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)

Schema após casting:
root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)
 |-- count_2: integer (nullable = true)



In [30]:
# Add a string-typed copy of "count" using withColumn plus a SQL cast inside expr()
df_cast2 = df_cast.withColumn("count_3", expr("cast(count AS STRING)"))
df_cast2.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)
 |-- count_2: integer (nullable = true)
 |-- count_3: string (nullable = true)



In [31]:
# Boolean flag column built by comparing two Column objects with ==
df.withColumn("within_country", col("DEST_COUNTRY_NAME") == col("ORIGIN_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+--------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|within_country|
+-----------------+-------------------+-----+--------------+
|    United States|            Romania|   15|         false|
|    United States|            Croatia|    1|         false|
+-----------------+-------------------+-----+--------------+
only showing top 2 rows



In [32]:
# Same flag column, with the comparison written as a SQL string inside expr()
df.withColumn("within_country", expr('DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME')).show(2)

+-----------------+-------------------+-----+--------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|within_country|
+-----------------+-------------------+-----+--------------+
|    United States|            Romania|   15|         false|
|    United States|            Croatia|    1|         false|
+-----------------+-------------------+-----+--------------+
only showing top 2 rows



## Filtrando Registros

In [33]:
# Filter rows using a Column-based condition
df.where(col("count") < 2).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [34]:
# Filter rows using a condition written as a string expression
df.where("count < 2").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [35]:
# Apply multiple filters in a single expression joined by AND (the less usual form)
df.where("count < 2 AND ORIGIN_COUNTRY_NAME != 'Croatia'").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [36]:
# Apply multiple filters by chaining one where() per condition (the usual form)
(
    df
    .where("count < 2")
    .where("ORIGIN_COUNTRY_NAME != 'Croatia'")
    .show(2)
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



## Removendo Duplicatas


In [37]:
# Count the distinct values of a single column
df.select('DEST_COUNTRY_NAME').distinct().count()

132

In [38]:
# Count the distinct combinations of two columns
df.select('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME').distinct().count()

256

## Amostragem Aleatória

In [39]:
# Sample the DataFrame without replacement, with fraction 0.5 and a fixed seed,
# then count the sampled rows
df.sample(withReplacement=False, fraction=0.5, seed=42).count()

132

In [40]:
# Split the DataFrame into two blocks using weights 0.25 / 0.75 and a fixed seed
dataframes = df.randomSplit(weights=[0.25, 0.75], seed=42)

# Report the row count of the original DataFrame and of each block
print(f'DF Original: {df.count()}')
for idx in (0, 1):
    print(f'DF[{idx}]: {dataframes[idx].count()}')

DF Original: 256
DF[0]: 63
DF[1]: 193


Como a função `randomSplit()` retorna uma lista composta por DataFrames de acordo com os pesos passados, seu retorno pode já ser associado a novos DataFrames, evitando assim a necessidade de realizar indexação.

In [41]:
# Split and unpack the two resulting DataFrames directly into names.
# NOTE(review): with weights [0.25, 0.75], `train` receives the 0.25 share and
# `test` the 0.75 share — confirm this naming is intended
train, test = df.randomSplit(weights=[0.25, 0.75], seed=42)
print(f'train: {train.count()}')
print(f'test: {test.count()}')

train: 63
test: 193


## Unindo DataFrames


In [42]:
from pyspark.sql import Row

# Reuse the schema of the existing DataFrame so both sides of the union match
originalSchema = df.schema

# Hand-written records, with values in the same order as the schema fields
newRows = [
    Row('Brazil', 'Canada', 5),
    Row('Brazil', 'Argentina', 1),
]

# Distribute the records as an RDD and build a new DataFrame from it
paralellizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(paralellizedRows, originalSchema)

# Append the new rows to the original data and query the Brazil destinations
(
    df.union(newDF)
    .where(col("DEST_COUNTRY_NAME") == "Brazil")
    .show()
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|           Brazil|      United States|  853|
|           Brazil|             Canada|    5|
|           Brazil|          Argentina|    1|
+-----------------+-------------------+-----+



## Ordenando DataFrames

In [43]:
# Import the ordering helpers
from pyspark.sql.functions import asc, desc

# Sort rows by "count" in descending order and show the top 5
df.sort(desc("count")).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
+-----------------+-------------------+------+
only showing top 5 rows



In [44]:
# Sort rows by multiple columns, both ascending.
# The direction is set with Column.asc() rather than expr("count asc"): inside
# expr() the trailing word is parsed as a column alias ("count AS asc"), not as
# a sort direction, so that form only appeared to work because ascending is the
# default (expr("count desc") would silently still sort ascending).
df.sort(col("count").asc(), col("DEST_COUNTRY_NAME").asc()).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



## Limitando Registros

In [45]:
# Keep only the 3 rows with the highest "count" values
(
    df.sort(col("count").desc())
    .limit(3)
    .show()
)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
+-----------------+-------------------+------+



## Reparticionamento

In [51]:
# Repartition into a fixed number of partitions
df_rep5 = df.repartition(5)

# Repartition into a fixed number of partitions based on a column of the dataset
df_base5 = df.repartition(5, col("DEST_COUNTRY_NAME"))

# Report the partition count of the original DataFrame, then of each
# repartitioned one (same label on every line)
for frame in (df, df_rep5, df_base5):
    print(f'Partições do DataFrame: {frame.rdd.getNumPartitions()}')

Partições do DataFrame: 1
Partições do DataFrame: 5
Partições do DataFrame: 5


## Coletando Registros no Driver

In [67]:
# Prepare a reduced DataFrame limited to 5 rows
df_collect = df.limit(5)

# Bring the first 3 rows to the driver as a list of Row objects
df_collect.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [68]:
# Bring every row of the reduced DataFrame to the driver as a list of Row objects
df_collect.collect()

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [69]:
# Walk through the DataFrame on the driver via a local iterator, which
# yields one Row at a time
for record in df_collect.toLocalIterator():
    print(record)

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)
Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1)
Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)
Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15)
Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)
