In [1]:
import os
import random

import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import *

In [2]:
# Location of the linkage CSV. The default is the original local path; set the
# LINKAGE_CSV_PATH environment variable to point the notebook at another copy
# of the file without editing this cell.
read_path = os.environ.get(
    'LINKAGE_CSV_PATH',
    '/home/tatiane/Downloads/exemplo_linkage_cases-brazil-cities-time.csv/tmp/exemplo_linkage_cases-brazil-cities-time.csv',
)

In [3]:
# Read the linkage CSV into a Spark DataFrame using a comma separator and
# multi-line parsing. No header option is passed to the reader here.
df = spark.read.csv(
    read_path,
    sep=",",
    multiLine=True,
)

In [4]:
# Rename the columns: each current column is paired, by position, with the
# name at the same index in `header`.
header = [
    "id1", "id2",
    "city1", "city2",
    "regiao_saude1", "regiao_saude2",
    "codmun1", "codmun2",
    "date1", "date2",
    "epi_week1", "epi_week2",
    "score",
]

for current_name, target_name in zip(df.columns, header):
    df = df.withColumnRenamed(current_name, target_name)

In [5]:
# Convert the score column to double precision so it can be compared and
# aggregated numerically.
df = df.withColumn('score', F.col('score').cast('double'))

In [6]:
# Summary statistics of the score column, converted to pandas and indexed by
# the statistic name. Append .T to transpose the table (statistics become
# columns) if that is easier to read.
(
    df.select('score')
    .summary()
    .toPandas()
    .set_index('summary')
)

Unnamed: 0_level_0,score
summary,Unnamed: 1_level_1
count,2243800.0
mean,0.9604942705214136
stddev,0.0738695774735633
min,0.5165745856353592
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [7]:
# Preview the first three characters of each score (e.g. "0.7", "1.0").
df.select(
    F.substring('score', 1, 3).alias('faixa_score')
).show()

+-----------+
|faixa_score|
+-----------+
|        1.0|
|        1.0|
|        1.0|
|        1.0|
|        0.7|
|        0.7|
|        0.7|
|        1.0|
|        0.7|
|        1.0|
|        1.0|
|        1.0|
|        0.7|
|        0.7|
|        0.7|
|        0.7|
|        1.0|
|        1.0|
|        0.7|
|        0.7|
+-----------+
only showing top 20 rows



In [8]:
# Count rows per four-character score prefix (e.g. "0.82").
(
    df.groupBy(F.substring('score', 1, 4).alias('faixa_score'))
    .count()
    .show()
)

+-----------+-------+
|faixa_score|  count|
+-----------+-------+
|        1.0|1727048|
|       0.55|   2288|
|       0.68|     52|
|       0.82| 489104|
|       0.72|   8606|
|       null|   5177|
|       0.96|  11116|
|       0.51|      2|
|       0.65|     26|
|       0.93|   5558|
+-----------+-------+



In [9]:
# Check whether the data really contains null scores by counting rows per
# distinct score value.
(
    df.groupBy('score')
    .count()
    .show()
)

+------------------+-------+
|             score|  count|
+------------------+-------+
|0.9654696132596685|  11116|
|0.5511049723756907|   2288|
|0.6892265193370166|     52|
|              null|   5177|
|               1.0|1727048|
|0.7237569060773481|   8606|
| 0.930939226519337|   5558|
|0.8273480662983426| 489104|
|0.6546961325966851|     26|
|0.5165745856353592|      2|
+------------------+-------+



In [10]:
# Distribution of the data: row count per score rounded to one decimal place.
(
    df.groupBy(F.round('score', 1))
    .count()
    .show()
)

+---------------+-------+
|round(score, 1)|  count|
+---------------+-------+
|            0.7|   8684|
|           null|   5177|
|            1.0|1738164|
|            0.6|   2288|
|            0.8| 489104|
|            0.5|      2|
|            0.9|   5558|
+---------------+-------+



### Atividade: criar faixas de score
ex:

        
* `>= 0.90` = 1
* `>= 0.80` e `< 0.90` = 2
* `< 0.80` = 3

In [11]:
# Assign each row a score band:
#   1 -> score >= 0.90
#   2 -> 0.80 <= score < 0.90
#   3 -> score < 0.80
# There is no otherwise() branch, so a row whose score is null matches none of
# the conditions and gets a null band.
df = df.withColumn(
    'faixa_score',
    F.when(F.col('score') >= 0.90, 1)
     .when((F.col('score') >= 0.80) & (F.col('score') < 0.90), 2)
     .when(F.col('score') < 0.80, 3),
)

In [12]:
# Preview ten rows as a pandas DataFrame; limit(10) keeps the conversion to
# pandas small.
df.limit(10).toPandas()

Unnamed: 0,id1,id2,city1,city2,regiao_saude1,regiao_saude2,codmun1,codmun2,date1,date2,epi_week1,epi_week2,score,faixa_score
0,0,0,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200225,20200225,9,9,1.0,1
1,2,2,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200226,20200226,9,9,1.0,1
2,4,4,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200227,20200227,9,9,1.0,1
3,6,6,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200228,20200228,9,9,1.0,1
4,1,1,TOTAL,TOTAL,,,0,0,20200225,20200225,9,9,0.723757,3
5,5,5,TOTAL,TOTAL,,,0,0,20200227,20200227,9,9,0.723757,3
6,3,3,TOTAL,TOTAL,,,0,0,20200226,20200226,9,9,0.723757,3
7,8,8,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200229,20200229,9,9,1.0,1
8,7,7,TOTAL,TOTAL,,,0,0,20200228,20200228,9,9,0.723757,3
9,10,10,São Paulo/SP,São Paulo/SP,São Paulo,São Paulo,355030,355030,20200301,20200301,10,10,1.0,1


In [13]:
# Distribution of the data: row count per score band. faixa_score is already a
# discrete integer label (1, 2, 3 or null), so it is grouped on directly;
# rounding it would change nothing except the column label in the output.
df.groupBy('faixa_score').count().show()

+---------------------+-------+
|round(faixa_score, 1)|  count|
+---------------------+-------+
|                 null|   5177|
|                    1|1743722|
|                    3|  10974|
|                    2| 489104|
+---------------------+-------+



### Links úteis para pesquisa:

* https://www.w3schools.com/python/ref_string_split.asp
* https://docs.microsoft.com/pt-br/azure/databricks/spark/latest/dataframes-datasets/introduction-to-dataframes-python
* https://www.delftstack.com/pt/howto/python-pandas/how-to-add-new-column-to-existing-dataframe-in-python-pandas/#:~:text=Voc%C3%AA%20pode%20utilizar%20a%20fun%C3%A7%C3%A3o,inser%C3%A7%C3%A3o%20a%20partir%20de%20zero.
* https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html