In [29]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import Window
import pandas as pd

### Leitura de dados

In [4]:
# Relative location of the example linkage CSV that the next cell loads.
linkage_path = "tmp/exemplo_linkage_cases-brazil-cities-time.csv"

In [7]:
# Load the linkage CSV without consuming a header row; with no schema given,
# Spark assigns positional column names and reads every column as a string.
linkage = spark.read.option('header', False).csv(linkage_path)

In [8]:
# Inspect the schema right after loading: the columns carry Spark's default
# positional names (_c0, _c1, ...) and are all typed as string.
linkage.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)



In [13]:
# Rename the positional columns (_c0, _c1, ...) to meaningful names.
# The names are listed in the same order as the columns appear in the file.
header = [
    "id1", "id2",
    "city1", "city2",
    "regiao_saude1", "regiao_saude2",
    "codmun1", "codmun2",
    "date1", "date2",
    "epi_week1", "epi_week2",
    "score",
]
# toDF renames every column in a single projection and raises if the number of
# names does not match the number of columns, instead of silently leaving
# trailing columns unrenamed the way a zip-based withColumnRenamed loop would.
linkage = linkage.toDF(*header)

In [14]:
# Confirm the rename: same columns, now with descriptive names (still all strings).
linkage.printSchema()

root
 |-- id1: string (nullable = true)
 |-- id2: string (nullable = true)
 |-- city1: string (nullable = true)
 |-- city2: string (nullable = true)
 |-- regiao_saude1: string (nullable = true)
 |-- regiao_saude2: string (nullable = true)
 |-- codmun1: string (nullable = true)
 |-- codmun2: string (nullable = true)
 |-- date1: string (nullable = true)
 |-- date2: string (nullable = true)
 |-- epi_week1: string (nullable = true)
 |-- epi_week2: string (nullable = true)
 |-- score: string (nullable = true)



### Filtrando dados

In [17]:
# Preview a small sample: limit(10) caps the rows on the Spark side before
# toPandas() collects them to the driver for rich display.
linkage.limit(10).toPandas()

Unnamed: 0,id1,id2,city1,city2,regiao_saude1,regiao_saude2,codmun1,codmun2,date1,date2,epi_week1,epi_week2,score
0,0,0,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200225,20200225,9,9,1.0
1,2,2,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200226,20200226,9,9,1.0
2,4,4,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200227,20200227,9,9,1.0
3,6,6,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200228,20200228,9,9,1.0
4,1,1,TOTAL,TOTAL,,,0,0,20200225,20200225,9,9,0.7237569060773481
5,5,5,TOTAL,TOTAL,,,0,0,20200227,20200227,9,9,0.7237569060773481
6,3,3,TOTAL,TOTAL,,,0,0,20200226,20200226,9,9,0.7237569060773481
7,8,8,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200229,20200229,9,9,1.0
8,7,7,TOTAL,TOTAL,,,0,0,20200228,20200228,9,9,0.7237569060773481
9,10,10,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200301,20200301,10,10,1.0


In [24]:
# Optional down-sampling for faster experimentation (a 25% random sample or
# the first 100 rows, cached). Both alternatives are disabled here.
# NOTE(review): the saved outputs further down total 100 rows, which suggests
# the limit(100) variant was active when they were produced - confirm and re-run.
# linkage = linkage.sample(0.25).cache()
# linkage = linkage.limit(100).cache()

In [30]:
# Convert 'score' from string to its proper numeric type (double).
linkage = linkage.withColumn('score', F.col('score').cast('double'))

In [31]:
# Verify the cast: 'score' should now be reported as double, the rest as string.
linkage.printSchema()

root
 |-- id1: string (nullable = true)
 |-- id2: string (nullable = true)
 |-- city1: string (nullable = true)
 |-- city2: string (nullable = true)
 |-- regiao_saude1: string (nullable = true)
 |-- regiao_saude2: string (nullable = true)
 |-- codmun1: string (nullable = true)
 |-- codmun2: string (nullable = true)
 |-- date1: string (nullable = true)
 |-- date2: string (nullable = true)
 |-- epi_week1: string (nullable = true)
 |-- epi_week2: string (nullable = true)
 |-- score: double (nullable = true)



In [32]:
# Descriptive statistics of 'score' (count, mean, stddev, min, quartiles, max),
# collected to pandas and indexed by statistic name. Append .T to transpose.
(
    linkage
    .select('score')
    .summary()
    .toPandas()
    .set_index('summary')
)

Unnamed: 0_level_0,score
summary,Unnamed: 1_level_1
count,97.0
mean,0.957281995785157
stddev,0.1003974082159323
min,0.7237569060773481
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [34]:
# Distribution of the data: number of rows per score rounded to one decimal.
# Rows whose score is null end up in a group of their own.
(
    linkage
    .groupBy(F.round('score', 1))
    .count()
    .show()
)

+---------------+-----+
|round(score, 1)|count|
+---------------+-----+
|            1.0|   82|
|            0.7|   15|
|           null|    3|
+---------------+-----+



### Extraindo dados de colunas
* Dia, mês, ano
* Extraindo o estado do município

In [36]:
# Preview the first rows again before deriving new columns from them.
# NOTE(review): the saved output below lists row indices up to 98, more than
# limit(10) can return - presumably stale output; re-run the cell to refresh it.
linkage.limit(10).toPandas()

Unnamed: 0,id1,id2,city1,city2,regiao_saude1,regiao_saude2,codmun1,codmun2,date1,date2,epi_week1,epi_week2,score
0,2,2,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200226,20200226,9,9,1.000000
1,7,7,TOTAL,TOTAL,,,0,0,20200228,20200228,9,9,0.723757
2,10,10,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200301,20200301,10,10,1.000000
3,14,14,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200303,20200303,10,10,1.000000
4,13,13,TOTAL,TOTAL,,,0,0,20200302,20200302,10,10,0.723757
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,400,400,Belo Jardim/PE,Belo Jardim/PE,Caruaru,Caruaru,260170,260170,20200319,20200319,12,12,1.000000
96,410,410,Campo Grande/MS,Campo Grande/MS,Campo Grande,Campo Grande,500270,500270,20200319,20200319,12,12,1.000000
97,413,413,Caxias do Sul/RS,Caxias do Sul/RS,Região 23 - Caxias e Hortênsias,Região 23 - Caxias e Hortênsias,430510,430510,20200319,20200319,12,12,1.000000
98,416,416,Cotia/SP,Cotia/SP,Mananciais,Mananciais,351300,351300,20200319,20200319,12,12,1.000000


In [45]:
# 'date1' is a yyyyMMdd string (e.g. '20200226'); slice it by position to get
# day, month and year. F.substring is 1-based, and the results stay strings.
linkage = (
    linkage
    .withColumn('dia', F.substring('date1', 7, 2))  # day: characters 7-8
    .withColumn('mes', F.substring('date1', 5, 2))  # month: characters 5-6
    .withColumn('ano', F.substring('date1', 1, 4))  # year: characters 1-4
)

In [46]:
# Show the original date next to the extracted parts to check the slicing.
(
    linkage
    .select('date1', 'ano', 'mes', 'dia')
    .limit(20)
    .toPandas()
)

Unnamed: 0,date1,ano,mes,dia
0,20200226,2020,2,26
1,20200228,2020,2,28
2,20200301,2020,3,1
3,20200303,2020,3,3
4,20200302,2020,3,2
5,20200301,2020,3,1
6,20200305,2020,3,5
7,20200306,2020,3,6
8,20200307,2020,3,7
9,20200307,2020,3,7


In [None]:
# Exercise
# Extract the state from the municipality code

### Operações entre colunas
* `+`, `-`, `*`, `/`

In [48]:
# Adding columns. 'dia' and 'mes' hold strings, so Spark casts them implicitly
# for the arithmetic and the sum comes back as a double (e.g. 28.0).
linkage.select(
    'dia',
    'mes',
    (F.col('dia') + F.col('mes')).alias('soma'),
).show()

+---+---+----+
|dia|mes|soma|
+---+---+----+
| 26| 02|28.0|
| 28| 02|30.0|
| 01| 03| 4.0|
| 03| 03| 6.0|
| 02| 03| 5.0|
| 01| 03| 4.0|
| 05| 03| 8.0|
| 06| 03| 9.0|
| 07| 03|10.0|
| 07| 03|10.0|
| 06| 03| 9.0|
| 06| 03| 9.0|
| 07| 03|10.0|
| 07| 03|10.0|
| 06| 03| 9.0|
| 08| 03|11.0|
| 08| 03|11.0|
| 10| 03|13.0|
| 10| 03|13.0|
| 10| 03|13.0|
+---+---+----+
only showing top 20 rows



In [50]:
# Multiplying and adding columns: day + (month - 1) * 30.
# Presumably a rough day-of-year figure that treats every month as 30 days.
linkage.select(
    'dia',
    'mes',
    (F.col('dia') + (F.col('mes') - 1) * 30).alias('soma'),
).show()

+---+---+----+
|dia|mes|soma|
+---+---+----+
| 26| 02|56.0|
| 28| 02|58.0|
| 01| 03|61.0|
| 03| 03|63.0|
| 02| 03|62.0|
| 01| 03|61.0|
| 05| 03|65.0|
| 06| 03|66.0|
| 07| 03|67.0|
| 07| 03|67.0|
| 06| 03|66.0|
| 06| 03|66.0|
| 07| 03|67.0|
| 07| 03|67.0|
| 06| 03|66.0|
| 08| 03|68.0|
| 08| 03|68.0|
| 10| 03|70.0|
| 10| 03|70.0|
| 10| 03|70.0|
+---+---+----+
only showing top 20 rows



In [54]:
# Division: dividing by 10000 and truncating to an integer keeps the two
# leading digits of a 6-digit code (355030 -> 35); shorter codes such as
# '32' or '0' yield 0.
linkage.select(
    'codmun1',
    (F.col('codmun1') / 10000).cast('int').alias('xx'),
).show()

+-------+---+
|codmun1| xx|
+-------+---+
| 355030| 35|
|      0|  0|
| 355030| 35|
| 355030| 35|
|      0|  0|
|      0|  0|
| 330040| 33|
| 291080| 29|
| 330040| 33|
| 530010| 53|
|     32|  0|
|      0|  0|
| 330455| 33|
| 355030| 35|
|     35|  0|
| 312230| 31|
|     32|  0|
|     32|  0|
| 355030| 35|
|      0|  0|
+-------+---+
only showing top 20 rows



### Comparando colunas
* Registros que têm TODAS as colunas de linkage iguais (menos os ids) e o score diferente de 1
* Registros que têm ALGUMA coluna do linkage diferente (menos os ids) e o score igual a 1
  * DICA: Usar `.filter` com as devidas condições