[![pythonista](img/pythonista.png)](https://www.pythonista.io)

# Distribución y agrupamiento.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Intro a UNION").getOrCreate()
ct = spark.sparkContext
%load_ext sparksql_magic

In [2]:
(spark.read.option("inferSchema", "true").
 option("header", "true").csv('data/IRIS.csv').
 createOrReplaceTempView("IRIS"))

In [6]:
%%sparksql
SELECT *
FROM IRIS;

only showing top 20 row(s)


0,1,2,3,4
sepal_length,sepal_width,petal_length,petal_width,species
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa


## La claúsula `DISTRIBUTE BY`.

In [8]:
%%sparksql
SELECT 
    species,
    petal_length
FROM IRIS
TABLESAMPLE (BUCKET 3 OUT OF 30)
DISTRIBUTE BY species;

0,1
species,petal_length
Iris-virginica,5.8
Iris-virginica,5.5
Iris-virginica,5.3
Iris-virginica,5.4
Iris-virginica,5.1
Iris-setosa,1.5
Iris-versicolor,4.7
Iris-versicolor,3.3
Iris-versicolor,4.2


## La claúsula `CLUSTER BY`.

In [9]:
%%sparksql
SELECT 
    species, 
    petal_length
FROM IRIS
TABLESAMPLE (BUCKET 2 OUT OF 30)
CLUSTER BY species;

0,1
species,petal_length
Iris-setosa,1.2
Iris-setosa,1.4
Iris-versicolor,4.9
Iris-versicolor,3.3
Iris-versicolor,4.7
Iris-versicolor,4.4
Iris-versicolor,3.9
Iris-versicolor,4.8
Iris-versicolor,4.7


## `PARTITION BY`.

```
<func>(<col>) OVER(PARTITION BY <col>)
```

In [11]:
%%sparksql
SELECT 
    species,
    petal_length,
    avg(petal_length) OVER(PARTITION BY species) as promedio
FROM IRIS
TABLESAMPLE (BUCKET 2 OUT OF 30);

0,1,2
species,petal_length,promedio
Iris-setosa,1.5,1.4
Iris-setosa,1.3,1.4
Iris-versicolor,3.5,3.5
Iris-versicolor,3.5,3.5
Iris-virginica,5.3,5.2
Iris-virginica,4.9,5.2
Iris-virginica,5.4,5.2
Iris-virginica,5.2,5.2


In [12]:
%%sparksql
SELECT 
    species,
    avg(petal_length)
FROM IRIS
TABLESAMPLE (BUCKET 2 OUT OF 30)
GROUP BY species;

0,1
species,avg(petal_length)
Iris-virginica,4.933333333333334
Iris-setosa,1.3999999999999997
Iris-versicolor,4.0


### Funciones de ventana (windows functions).

https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-window.html

In [14]:
%%sparksql
SELECT 
    species,
    petal_length,
    RANK() OVER(PARTITION BY species ORDER BY petal_length DESC) as orden
FROM IRIS
TABLESAMPLE (BUCKET 2 OUT OF 30);

0,1,2
species,petal_length,orden
Iris-setosa,1.6,1
Iris-setosa,1.6,1
Iris-setosa,1.5,3
Iris-setosa,1.4,4
Iris-versicolor,4.7,1
Iris-versicolor,4.5,2
Iris-versicolor,4.2,3
Iris-virginica,5.1,1
Iris-virginica,5.0,2


<p style="text-align: center"><a rel="license" href="http://creativecommons.org/licenses/by/4.0/"><img alt="Licencia Creative Commons" style="border-width:0" src="https://i.creativecommons.org/l/by/4.0/80x15.png" /></a><br />Esta obra está bajo una <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Licencia Creative Commons Atribución 4.0 Internacional</a>.</p>
<p style="text-align: center">&copy; José Luis Chiquete Valdivieso. 2023.</p>