In [1]:
# Importando bibliotecas
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import os
from warnings import filterwarnings
filterwarnings("ignore")

# Obtain the Spark session for this notebook (getOrCreate), using the app name "art11"
spark = SparkSession.builder.appName("art11").getOrCreate()

# Resolve the dataset location relative to the current user's home directory
home_path = os.path.expanduser("~")
data_path = os.path.join(
    home_path,
    "dev/panini-tech-lab/data/flights-data/summary-data/csv/2015-summary.csv",
)

# Explicit schema for the CSV file: two nullable string columns with the
# destination and origin country names, plus a nullable integer flight count.
# Each field carries a human-readable description in its metadata.
data_schema = StructType([
    StructField(
        "DEST_COUNTRY_NAME",
        StringType(),
        nullable=True,
        metadata={"description": "País de destino dos vôos contabilizados"},
    ),
    StructField(
        "ORIGIN_COUNTRY_NAME",
        StringType(),
        nullable=True,
        metadata={"description": "País de origem dos vôos contabilizados"},
    ),
    StructField(
        "count",
        IntegerType(),
        nullable=True,
        metadata={"description": "Contagem total de vôos entre os países de origem e de destino do registro"},
    ),
])

# Load the CSV file using the explicit data_schema; the "header" option tells
# the reader that the first line of the file holds the column names.
df = (
    spark.read
    .option("header", "true")
    .schema(data_schema)
    .csv(data_path)
)

# Expose the DataFrame to Spark SQL under the temporary view name "tbl_flights"
df.createOrReplaceTempView("tbl_flights")

# Sanity check: print the column types, then preview the first 5 rows
df.printSchema()
df.show(n=5)

22/08/03 20:51:49 WARN Utils: Your hostname, panini-ubuntu resolves to a loopback address: 127.0.1.1; using 10.0.0.110 instead (on interface enp3s0)
22/08/03 20:51:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/03 20:52:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

In [None]:
# Importando funções adicionais
from pyspark.sql.functions import col, expr

# Derive a new column holding twice the flight count, written as a SQL expression
df_double_count = df.withColumn(
    "double_count",
    expr("count * 2"),
)

# Preview the first 5 rows of the result
df_double_count.show(n=5)

+-----------------+-------------------+-----+------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|double_count|
+-----------------+-------------------+-----+------------+
|    United States|            Romania|   15|          30|
|    United States|            Croatia|    1|           2|
|    United States|            Ireland|  344|         688|
|            Egypt|      United States|   15|          30|
|    United States|              India|   62|         124|
+-----------------+-------------------+-----+------------+
only showing top 5 rows



In [None]:
# Add a tripled count, then project the columns. Both country columns are
# renamed in the select: one through Column.alias, the other through a SQL
# "AS" expression. The result is displayed without truncating cell values.
(
    df
    .withColumn("triple_count", col("count") * 3)
    .select(
        col("ORIGIN_COUNTRY_NAME").alias("pais_origem"),
        expr("DEST_COUNTRY_NAME AS pais_destino"),
        "count",
        "triple_count",
    )
    .show(5, truncate=False)
)

+-------------+-------------+-----+------------+
|pais_origem  |pais_destino |count|triple_count|
+-------------+-------------+-----+------------+
|Romania      |United States|15   |45          |
|Croatia      |United States|1    |3           |
|Ireland      |United States|344  |1032        |
|United States|Egypt        |15   |45          |
|India        |United States|62   |186         |
+-------------+-------------+-----+------------+
only showing top 5 rows



In [None]:
# Rename all three columns, one withColumnRenamed call per column
df_renamed = (
    df
    .withColumnRenamed("DEST_COUNTRY_NAME", "pais_destino")
    .withColumnRenamed("ORIGIN_COUNTRY_NAME", "pais_origem")
    .withColumnRenamed("count", "qtd_voos")
)

# Preview the renamed DataFrame
df_renamed.show(n=5)

+-------------+-------------+--------+
| pais_destino|  pais_origem|qtd_voos|
+-------------+-------------+--------+
|United States|      Romania|      15|
|United States|      Croatia|       1|
|United States|      Ireland|     344|
|        Egypt|United States|      15|
|United States|        India|      62|
+-------------+-------------+--------+
only showing top 5 rows



In [None]:
# Rename columns as part of a projection: Column.alias for the first column,
# SQL "AS" expressions for the other two.
df_renamed_select = df.select(
    col("DEST_COUNTRY_NAME").alias("pais_destino"),
    expr("ORIGIN_COUNTRY_NAME AS pais_origem"),
    expr("count AS contagem"),
)

# Preview the first 5 rows of the result
df_renamed_select.show(n=5)

+-------------+-------------+--------+
| pais_destino|  pais_origem|contagem|
+-------------+-------------+--------+
|United States|      Romania|      15|
|United States|      Croatia|       1|
|United States|      Ireland|     344|
|        Egypt|United States|      15|
|United States|        India|      62|
+-------------+-------------+--------+
only showing top 5 rows



In [None]:
# Rename columns with a Spark SQL query against the "tbl_flights" temporary view,
# using "AS" aliases in the SELECT list.
df_renamed_sql = spark.sql("""
    SELECT
        DEST_COUNTRY_NAME AS pais_destino,
        ORIGIN_COUNTRY_NAME AS pais_origem,
        count AS qtd_viagens

    FROM tbl_flights
""")

# Preview the first 5 rows of the query result
df_renamed_sql.show(n=5)

+-------------+-------------+-----------+
| pais_destino|  pais_origem|qtd_viagens|
+-------------+-------------+-----------+
|United States|      Romania|         15|
|United States|      Croatia|          1|
|United States|      Ireland|        344|
|        Egypt|United States|         15|
|United States|        India|         62|
+-------------+-------------+-----------+
only showing top 5 rows



In [None]:
# Remove a single column by name; drop() returns a new DataFrame without "count"
df_dropped = df.drop("count")

# Preview the remaining columns
df_dropped.show(n=5)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
|    United States|            Ireland|
|            Egypt|      United States|
|    United States|              India|
+-----------------+-------------------+
only showing top 5 rows



In [None]:
# Drop both country-name columns in one call by unpacking a list of column
# names, leaving only the flight count, and preview the first 5 rows.
(
    df
    .drop(*["ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME"])
    .show(5)
)

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
|   15|
|   62|
+-----+
only showing top 5 rows



In [None]:
# Drop several columns in one call by passing each name as its own argument
df.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").show(n=5)

# Equivalent approach: keep the names in a list and unpack it into drop()
to_drop = ["ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME"]
df.drop(*to_drop).show(n=5)

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
|   15|
|   62|
+-----+
only showing top 5 rows

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
|   15|
|   62|
+-----+
only showing top 5 rows

