In [2]:
# Importando bibliotecas
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, \
    StringType, IntegerType, DoubleType, LongType
from pyspark.sql.functions import col, expr
import os 
from warnings import filterwarnings
filterwarnings("ignore")

# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("agregacoes")
    .getOrCreate()
)

# Build the path to the IoT readings file under the user's home directory
home_path = os.path.expanduser('~')
data_path = os.path.join(home_path, 'dev/panini-tech-lab/data')
iot_path = os.path.join(data_path, 'iot-devices/iot_devices.json')

# Explicit schema applied when reading the JSON file.
# Only device_id and timestamp are declared non-nullable; the remaining
# fields use StructField's default of nullable=True.
# NOTE(review): the printSchema() output recorded for this cell reports
# every column as "nullable = true", so the non-nullable flags do not seem
# to be enforced by the reader -- confirm before relying on them.
# NOTE(review): battery_level is read as a string although the previewed
# values look numeric -- confirm the intended type.
iot_schema = StructType([
    StructField("device_id", IntegerType(), False),
    StructField("device_name", StringType()),
    StructField("ip", StringType()),
    StructField("cca2", StringType()),
    StructField("cca3", StringType()),
    StructField("cn", StringType()),
    StructField("latitude", DoubleType()),
    StructField("longitude", DoubleType()),
    StructField("scale", StringType()),
    StructField("temp", IntegerType()),
    StructField("humidity", IntegerType()),
    StructField("battery_level", StringType()),
    StructField("c02_level", IntegerType()),
    StructField("lcd", StringType()),
    StructField("timestamp", LongType(), False),
])

# Load the readings with the schema above
df_iot = (
    spark.read
    .format("json")
    .schema(iot_schema)
    .load(iot_path)
)

# Expose the DataFrame to Spark SQL under the name tbl_iot
df_iot.createOrReplaceTempView("tbl_iot")

# Inspect the resulting schema and the first five rows
df_iot.printSchema()
df_iot.show(5)

root
 |-- device_id: integer (nullable = true)
 |-- device_name: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- cca2: string (nullable = true)
 |-- cca3: string (nullable = true)
 |-- cn: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- scale: string (nullable = true)
 |-- temp: integer (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- battery_level: string (nullable = true)
 |-- c02_level: integer (nullable = true)
 |-- lcd: string (nullable = true)
 |-- timestamp: long (nullable = true)

+---------+--------------------+-------------+----+----+-------------+--------+---------+-------+----+--------+-------------+---------+------+-------------+
|device_id|         device_name|           ip|cca2|cca3|           cn|latitude|longitude|  scale|temp|humidity|battery_level|c02_level|   lcd|    timestamp|
+---------+--------------------+-------------+----+----+-------------+--------+---------+-------+-

## count

In [5]:
# Importando função
from pyspark.sql.functions import count

# Count how many readings were taken (via the device_id column)
df_iot.select(count(col("device_id"))).show()

[Stage 5:>                                                          (0 + 4) / 4]

+----------------+
|count(device_id)|
+----------------+
|          198164|
+----------------+



                                                                                

## countDistinct

In [11]:
# Importando função
from pyspark.sql.functions import countDistinct

# Count the distinct country codes (cca3) present in the readings
df_iot.select(countDistinct(col("cca3"))).show()

+--------------------+
|count(DISTINCT cca3)|
+--------------------+
|                 205|
+--------------------+



## first e last

In [12]:
# Importando funções
from pyspark.sql.functions import first, last

# First and last temperature values as seen by Spark.
# No ordering is applied before aggregating, so "first" and "last" follow
# whatever row order the DataFrame happens to have -- the result is not
# guaranteed to be chronological.
df_iot.select(
    first(col("temp")).alias("primeira_temperatura"),
    last(col("temp")).alias("ultima_temperatura"),
).show()

+--------------------+------------------+
|primeira_temperatura|ultima_temperatura|
+--------------------+------------------+
|                  34|                12|
+--------------------+------------------+



## min e max

In [15]:
# Importando funções
from pyspark.sql.functions import min, max

# Lowest and highest temperature among readings reported in Celsius.
# min/max here are the pyspark.sql.functions versions imported above,
# not the Python builtins.
(
    df_iot
    .where(col("scale") == "Celsius")
    .select(
        min(col("temp")).alias("min_temp"),
        max(col("temp")).alias("max_temp"),
    )
    .show()
)

+--------+--------+
|min_temp|max_temp|
+--------+--------+
|      10|      34|
+--------+--------+



## sum

In [17]:
# Importando função
from pyspark.sql.functions import sum

# Total of the c02_level column (CO2 level) across all readings.
# sum here is the pyspark.sql.functions version, not the Python builtin.
df_iot.select(sum(col("c02_level"))).show()

+--------------+
|sum(c02_level)|
+--------------+
|     237750022|
+--------------+



## avg

In [18]:
# Importando função
from pyspark.sql.functions import avg

# Mean humidity over the whole dataset
df_iot.select(avg(col("humidity"))).show()

+-----------------+
|    avg(humidity)|
+-----------------+
|61.99212773258513|
+-----------------+



## var e stddev


In [20]:
# Importando funções
from pyspark.sql.functions import var_pop, var_samp, \
    stddev_pop, stddev_samp

# Variance and standard deviation of the CO2 level, in both their
# population (_pop) and sample (_samp) forms, side by side for comparison
df_iot.select(
    var_pop(col("c02_level")).alias("var_pop_co2"),
    var_samp(col("c02_level")).alias("var_samp_co2"),
    stddev_pop(col("c02_level")).alias("stddev_pop_co2"),
    stddev_samp(col("c02_level")).alias("stddev_samp_co2"),
).show()



+------------------+------------------+------------------+-----------------+
|       var_pop_co2|      var_samp_co2|    stddev_pop_co2|  stddev_samp_co2|
+------------------+------------------+------------------+-----------------+
|53388.466026752736|53388.735443677324|231.05944262624874|231.0600256290069|
+------------------+------------------+------------------+-----------------+



                                                                                

## Grouping

In [34]:
# Importando função
from pyspark.sql.functions import desc

# Average temperature per country code, highest averages first.
# GroupedData.avg("temp") names its output column "avg(temp)", which is
# why the sort refers to that literal name.
df_high_temp = (
    df_iot
    .groupBy("cca3")
    .avg("temp")
    .orderBy(col("avg(temp)").desc())
)

# Show the ten countries at the top of the ranking
df_high_temp.show(10)

+----+------------------+
|cca3|         avg(temp)|
+----+------------------+
| AIA|31.142857142857142|
| GRL|              29.5|
| GAB|              28.0|
| VUT|              27.3|
| LCA|              27.0|
| MWI|26.666666666666668|
| TKM|26.666666666666668|
| IRQ|26.428571428571427|
| LAO|26.285714285714285|
| IOT|              26.0|
+----+------------------+
only showing top 10 rows



In [40]:
# Per-country summary: number of readings, mean temperature and the
# humidity extremes, sorted by the highest humidity peak.
# In the recorded output every displayed row has max_humid = 99, so the
# relative order of those rows is not determined by this sort key.
(
    df_iot
    .groupBy("cca3")
    .agg(
        count(col("device_name")).alias("qtd_medicoes"),
        avg(col("temp")).alias("temperatura_media"),
        max(col("humidity")).alias("max_humid"),
        min(col("humidity")).alias("min_humid"),
    )
    .orderBy(col("max_humid").desc())
    .show(10)
)



+----+------------+------------------+---------+---------+
|cca3|qtd_medicoes| temperatura_media|max_humid|min_humid|
+----+------------+------------------+---------+---------+
| VIR|          51|20.392156862745097|       99|       25|
| PNG|          16|             23.25|       99|       30|
| HRV|         193|22.854922279792746|       99|       25|
| POL|        2744|21.983965014577258|       99|       25|
| GBR|        6486|22.085877274128894|       99|       25|
| JAM|          44|22.113636363636363|       99|       26|
| ARE|         123|22.211382113821138|       99|       25|
| JOR|          46|21.065217391304348|       99|       27|
| CRI|         116| 22.45689655172414|       99|       25|
| URY|         117|21.572649572649574|       99|       25|
+----+------------+------------------+---------+---------+
only showing top 10 rows



                                                                                

In [41]:
# Preview the first five readings again before the next aggregation
df_iot.show(n=5)

+---------+--------------------+-------------+----+----+-------------+--------+---------+-------+----+--------+-------------+---------+------+-------------+
|device_id|         device_name|           ip|cca2|cca3|           cn|latitude|longitude|  scale|temp|humidity|battery_level|c02_level|   lcd|    timestamp|
+---------+--------------------+-------------+----+----+-------------+--------+---------+-------+----+--------+-------------+---------+------+-------------+
|        1|meter-gauge-1xbYRYcj| 68.161.225.1|  US| USA|United States|    38.0|    -97.0|Celsius|  34|      51|            8|      868| green|1458444054093|
|        2|   sensor-pad-2n2Pea|213.161.254.1|  NO| NOR|       Norway|   62.47|     6.15|Celsius|  11|      70|            7|     1473|   red|1458444054119|
|        3| device-mac-36TWSKiT|    88.36.5.1|  IT| ITA|        Italy|   42.83|    12.83|Celsius|  19|      44|            2|     1556|   red|1458444054120|
|        4|   sensor-pad-4mzWkz|66.39.173.154|  US| USA|Un

In [49]:
# Group readings by country name and LCD status and summarise each group
# using SQL expressions, then list the ten largest groups.
#
# battery_level is declared as StringType in iot_schema, so it is cast to
# DOUBLE explicitly instead of relying on implicit string-to-number
# coercion inside stddev_pop.
# temp is an integer column, so max(temp)/min(temp) are already whole
# numbers and are not wrapped in round(..., 1).
df_iot.groupBy("cn", "lcd").agg(
    expr("count(1) AS qtd_medicoes"),
    expr("round(avg(c02_level), 1) AS co2_medio"),
    expr("round(stddev_pop(CAST(battery_level AS DOUBLE)), 1) AS stddev_battery"),
    expr("max(temp) AS max_temp"),
    expr("min(temp) AS min_temp"),
).orderBy(desc("qtd_medicoes")).show(10, truncate=False)



+-----------------+------+------------+---------+--------------+--------+--------+
|cn               |lcd   |qtd_medicoes|co2_medio|stddev_battery|max_temp|min_temp|
+-----------------+------+------------+---------+--------------+--------+--------+
|United States    |yellow|34329       |1200.9   |2.9           |34      |10      |
|United States    |green |17166       |900.6    |2.9           |34      |10      |
|United States    |red   |17050       |1500.5   |2.9           |34      |10      |
|China            |yellow|7229        |1200.1   |2.9           |34      |10      |
|Japan            |yellow|6087        |1201.9   |2.9           |34      |10      |
|Republic of Korea|yellow|5902        |1197.8   |2.9           |34      |10      |
|Germany          |yellow|4002        |1198.7   |2.9           |34      |10      |
|China            |red   |3616        |1500.1   |2.9           |34      |10      |
|China            |green |3610        |900.2    |2.9           |34      |10      |
|Uni

                                                                                