# DataFrames

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

### Data Set

In [2]:
# Load the Iris data set bundled with scikit-learn and assemble a single
# DataFrame: one column per feature plus the target, stored as "species".
iris = datasets.load_iris()
df = pd.DataFrame(
    np.column_stack((iris["data"], iris["target"])),
    columns=[*iris["feature_names"], "species"],
)

df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


### Select

In [3]:
# Coluna como vetor
# Não pode selecionar duas ou mais
# df["sepal length (cm)"] 

# Coluna como dataframe
df[ ["petal width (cm)", "species"] ]


Unnamed: 0,petal width (cm),species
0,0.2,0.0
1,0.2,0.0
2,0.2,0.0
3,0.2,0.0
4,0.2,0.0
...,...,...
145,2.3,2.0
146,1.9,2.0
147,2.0,2.0
148,2.3,2.0


### Transformação de uma coluna

In [4]:
# Replace the numeric class codes in "species" with their names.
species = {0: "setosa", 1: "versicolor", 2: "virginica"}
df["species"] = df["species"].map(lambda code: species[code])

# Keep an independent copy of the frame at this stage; the group-by cells
# further down read from `original` rather than from `df`.
original = df.copy()
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


### Mutate

In [5]:
# Add a derived column: element-wise product of sepal length and sepal width.
df["sepal_area"] = df["sepal length (cm)"].mul(df["sepal width (cm)"])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,sepal_area
0,5.1,3.5,1.4,0.2,setosa,17.85
1,4.9,3.0,1.4,0.2,setosa,14.7
2,4.7,3.2,1.3,0.2,setosa,15.04
3,4.6,3.1,1.5,0.2,setosa,14.26
4,5.0,3.6,1.4,0.2,setosa,18.0


### Filter

In [6]:
# Keep only the rows whose species is "setosa".
# Equivalent boolean-mask form: df.loc[df["species"] == "setosa"]
#
# The result is rebound to `df` instead of using `inplace=True`: the
# in-place form returns None, cannot be chained, and offers no benefit here.
# `df` still refers to the filtered frame after this cell, as before.
df = df.query("species == 'setosa'")
df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,sepal_area
45,4.8,3.0,1.4,0.3,setosa,14.4
46,5.1,3.8,1.6,0.2,setosa,19.38
47,4.6,3.2,1.4,0.2,setosa,14.72
48,5.3,3.7,1.5,0.2,setosa,19.61
49,5.0,3.3,1.4,0.2,setosa,16.5


### Group by + Count

In [7]:
# Number of rows per species, returned as a flat table with the count in "n".
(
    original
    .groupby("species")
    .size()
    .rename("n")
    .reset_index()
)

Unnamed: 0,species,n
0,setosa,50
1,versicolor,50
2,virginica,50


In [8]:
# Mean of every remaining column within each species (species becomes the index).
original.groupby("species").agg("mean")

Unnamed: 0_level_0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [9]:
# Median sepal length per species, computed only over the rows whose
# sepal length is below 6.0; species is returned as a regular column.
(
    original[["sepal length (cm)", "species"]]
    .loc[lambda frame: frame["sepal length (cm)"] < 6.0]
    .groupby("species", as_index=False)
    .median()
)

Unnamed: 0,species,sepal length (cm)
0,setosa,5.0
1,versicolor,5.6
2,virginica,5.8


# PySpark

In [5]:
from pyspark.sql import SparkSession, Row

# Obtain a Spark session through the builder's getOrCreate().
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

### Leitura do banco

In [17]:
# Read the ";"-separated CSV, taking column names from its header line,
# then register the result as the temporary view "t1" for the SQL queries.
# Note: this rebinds `df`, which earlier cells used for the pandas frame.
df = spark.read.options(header=True, sep=";").csv(
    "../Listas/Lista_1/dados/AC-Parte_1.csv"
)
df.createOrReplaceTempView("t1")

### Queries em SQL

In [18]:
# The queries below are plain SQL strings composed by interpolation: each one
# embeds the previous query as a subquery / CTE, so only the final string
# needs to be submitted to Spark.

# Rows of t1 whose dose description is a second dose (or second-dose
# revaccination).
# NOTE(review): LIMIT 1 keeps a single row, so every downstream count is
# computed over one record. This looks like a leftover used to make test runs
# fast -- confirm before relying on the results.
seg_dose = """
    SELECT * FROM t1
    WHERE (
        vacina_descricao_dose='2ª Dose' OR
        vacina_descricao_dose='2ª Dose Revacinação')
    LIMIT 1
"""

# Attach the health region to each selected row by matching the municipality
# code against the `codigos` table.
# NOTE(review): `codigos` is not registered as a view in this section; it must
# exist in the Spark catalog before this query is executed.
fast_join = f"""
    SELECT name_health_region as regiao_saude
    FROM ({seg_dose}) as seg_dose
    LEFT JOIN codigos
    ON seg_dose.estabelecimento_municipio_codigo
    = codigos.est_mun_codigo
"""

# Number of rows (N) per health region.
qnt_vax = f"""
    SELECT regiao_saude, COUNT(*) AS N
    FROM ({fast_join})
    GROUP BY regiao_saude
"""

# Label each region 'Alto' when its N is above the approximate median of N
# (percentile_approx at 0.5), otherwise 'Baixo'.
faixa = f"""
    WITH qntVax AS ({qnt_vax})
    SELECT regiao_saude, N,
        CASE WHEN N > (
            SELECT percentile_approx(N, 0.5)
            FROM qntVax)
        THEN 'Alto'
        ELSE 'Baixo'
        END AS Faixa
        FROM qntVax
"""

# Bottom five regions inside each Faixa. The rank is computed in ASCENDING
# order of N so that positions 1..5 are the smallest counts; ranking in
# descending order would return the top five instead, contradicting the name.
# dense_rank gives tied regions the same position, so more than five rows can
# be returned for a Faixa when there are ties.
bot5 = f"""
    WITH tabFaixa AS ({faixa})
    SELECT regiao_saude, N, Faixa
    FROM (
        SELECT *, dense_rank()
        OVER (PARTITION BY Faixa ORDER BY N ASC) as posicao
        FROM tabFaixa
    )
    WHERE posicao <= 5
"""