In [1]:
# Importando bibliotecas
from pyspark.sql import SparkSession
import os

# Start the Spark session for this notebook, reusing an existing one if present
session_builder = SparkSession.builder.appName("art05-leitura-e-escrita")
spark = session_builder.getOrCreate()

# Directory variables used to locate the input files.
# `parent_dir` climbs three directory levels ("../" repeated three times);
# the flights summary data is resolved relative to that location.
parent_dir = (os.path.pardir + "/") * 3
flights_dir = os.path.join(parent_dir, 'data/flights-data/summary-data')

# One input path per file format read later in the notebook.
# The ORC, Avro and Parquet entries point at a single part file inside
# their respective dataset directories.
csv_data_path = os.path.join(flights_dir, 'csv/2015-summary.csv')
json_data_path = os.path.join(flights_dir, 'json/2015-summary.json')
orc_data_path = os.path.join(flights_dir, 'orc/2010-summary.orc/part-r-00000-2c4f7d96-e703-4de3-af1b-1441d172c80f.snappy.orc')
avro_data_path = os.path.join(flights_dir, 'avro/part-00000-tid-7128780539805330008-467d814d-6f80-4951-a951-f9f7fb8e3930-1434-1-c000.avro')
parquet_data_path = os.path.join(flights_dir, 'parquet/2010-summary.parquet/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet')
# Bare last expression: the notebook displays the session object as the cell output
spark

22/07/19 20:47:53 WARN Utils: Your hostname, panini-ubuntu resolves to a loopback address: 127.0.1.1; using 10.0.0.106 instead (on interface enp3s0)
22/07/19 20:47:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/19 20:47:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/07/19 20:48:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Lendo Arquivo CSV

In [None]:
# Read the CSV file, treating the first row as the header and letting Spark
# infer the column types from the data
csv_reader = spark.read.format("csv").options(inferSchema="true", header="true")
df_csv = csv_reader.load(csv_data_path)

# Preview the first 10 rows
df_csv.show(10)

In [None]:
# Inspect the schema of the CSV DataFrame (read with inferSchema enabled)
df_csv.printSchema()

## Leitura de arquivo JSON

In [None]:
# Import the Spark SQL types needed to declare a schema by hand
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the flights summary data: two string columns and an
# integer count, all nullable
flights_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
    StructField("count", IntegerType(), nullable=True)
])

# Read the JSON file using the explicit schema instead of schema inference
df_json = (spark.read.format("json")
    .schema(flights_schema)
    .load(json_data_path))

# Inspect the schema. printSchema() writes the schema tree to stdout itself
# and returns None, so wrapping it in print() would emit an extra "None" line.
df_json.printSchema()

# Preview the first 10 rows
df_json.show(10)

## Lendo arquivos ORC e PARQUET

In [None]:
# Read the ORC part file through the format-specific reader shortcut
df_orc = spark.read.orc(orc_data_path)

# Read the Parquet part file the same way
df_parquet = spark.read.parquet(parquet_data_path)

In [None]:
# Check whether the ORC and Parquet DataFrames expose the same schema
schemas_match = df_orc.schema == df_parquet.schema
print(f'Schemas iguais? {schemas_match}\n')

# Show a 5-row sample of each DataFrame: ORC first, then Parquet
for sample_df in (df_orc, df_parquet):
    sample_df.show(5)

## Bônus: Leitura de S3

In [2]:
# Configure the session's Hadoop settings so Spark can read from S3 via s3a.
# Credentials are taken from the environment so that no secret is stored in
# the notebook; a missing variable raises KeyError before any read is attempted.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed in the notebook and should be rotated.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
hadoop_conf.set("fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
#hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")

# Read a CSV object from the bucket, with a header row and inferred column types
df_s3 = (spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3a://aws-training-toolkit-152329264931-us-east-1/data/flights-data/summary-data/csv/2012-summary.csv"))

# Preview the first 5 rows
df_s3.show(5)

22/07/19 20:48:20 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


[Stage 2:>                                                          (0 + 1) / 1]

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|            Ireland|  252|
|            Egypt|      United States|   13|
|    United States|              India|   62|
|    United States|          Singapore|   25|
+-----------------+-------------------+-----+
only showing top 5 rows



                                                                                

In [3]:
# Show the first 5 rows of the DataFrame read from S3 again
df_s3.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|            Ireland|  252|
|            Egypt|      United States|   13|
|    United States|              India|   62|
|    United States|          Singapore|   25|
+-----------------+-------------------+-----+
only showing top 5 rows



                                                                                