In [0]:
# Global data variables
# Name of the sandbox whose data directory is read. It is a placeholder:
# fill it in before running the notebook. The original line had no value at
# all, which made the whole cell a syntax error.
SANDBOX_NAME = ""  # Sandbox Name
# Base directory of the sandbox data; file names are appended to this prefix.
DATA_PATH = "/data/sandboxes/" + SANDBOX_NAME + "/data/data/"

# Nueva sección

# Nueva sección

# Nueva sección




# Organizar datos

Una de las fases del proceso de data wrangling consiste en dar una estructura a los datos. Normalmente, esta fase conlleva las siguientes operaciones:

- Establecer índices, renombrar columnas.

- Ordenar valores.

- Eliminar duplicados

- Filtrar registros y/o columnas

- Editar información

- Modificar la estructura de los datos



<div class="alert alert-danger" role="alert">
  <strong>NOTA:</strong> Los métodos para organizar los datos de DataFrames de Spark son siempre <b>transformaciones</b>. Es importante recordar que el resultado de una transformación de un DataFrame es siempre otro DataFrame.
</div>

In [0]:
# Both files are read with the same CSV options: comma separator, header row
# used for the column names, and schema inference enabled.
csv_read_options = dict(sep=',', header=True, inferSchema=True)

vancouver_df = spark.read.csv(DATA_PATH + 'crime_in_vancouver.csv', **csv_read_options)
pokemon_df = spark.read.csv(DATA_PATH + 'pokemon.csv', **csv_read_options)

In [0]:
# Preview the first six rows of the Pokémon data.
pokemon_df.show(n=6)

+-------------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|         Name|Type 1|Type 2| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+-------------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|    Bulbasaur| Grass|Poison| 45|    49|     49|     65|     65|   45|         1|    false|
|      Ivysaur| Grass|Poison| 60|    62|     63|     80|     80|   60|         1|    false|
|     Venusaur| Grass|Poison| 80|    82|     83|    100|    100|   80|         1|    false|
|Mega Venusaur| Grass|Poison| 80|   100|    123|    122|    120|   80|         1|    false|
|   Charmander|  Fire|  null| 39|    52|     43|     60|     50|   65|         1|    false|
|   Charmeleon|  Fire|  null| 58|    64|     58|     80|     65|   80|         1|    false|
+-------------+------+------+---+------+-------+-------+-------+-----+----------+---------+
only showing top 6 rows



In [0]:
# Preview the first five rows of the Vancouver crime data.
vancouver_df.show(n=5)

+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|       NEIGHBOURHOOD|        X|         Y|
+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|                null|      0.0|       0.0|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|            West End|490503.48|5459766.67|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST| Renfrew-Collingwood|496145.89|5453740.68|
|Break and Enter R...|2003|    3|  8|   4|    15|     19XX E 12TH AVE|Kensington-Cedar ...|495302.97|5456313.79|
|  Theft from Vehicle|2003|   10|  9|  16|     0|     16XX CHARLES ST|  Grandview-Woodland|494877.89| 5457816.4|
+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
only showing top 5 rows



## Filtrar Columnas

### Select

El método `select` es una **transformación** para seleccionar un subconjunto de columnas. `select` puede recibir una lista de columnas o los nombres de las columnas como parámetros independientes. Funciona como la sentencia SQL _SELECT_.

In [0]:
# Column names are passed to select as separate arguments.
pk_name_type = pokemon_df.select(
    'Name',
    'Type 1',
    'Type 2',
)
pk_name_type.show(n=5)

+-------------+------+------+
|         Name|Type 1|Type 2|
+-------------+------+------+
|    Bulbasaur| Grass|Poison|
|      Ivysaur| Grass|Poison|
|     Venusaur| Grass|Poison|
|Mega Venusaur| Grass|Poison|
|   Charmander|  Fire|  null|
+-------------+------+------+
only showing top 5 rows



In [0]:
# The column names can also be handed to select as a single list.
columns = [
    'NEIGHBOURHOOD',
    'TYPE',
    'YEAR',
]

vancouver_df.select(columns).show(n=5)

+--------------------+--------------------+----+
|       NEIGHBOURHOOD|                TYPE|YEAR|
+--------------------+--------------------+----+
|                null|Offence Against a...|2003|
|            West End|  Theft from Vehicle|2003|
| Renfrew-Collingwood|  Theft from Vehicle|2003|
|Kensington-Cedar ...|Break and Enter R...|2003|
|  Grandview-Woodland|  Theft from Vehicle|2003|
+--------------------+--------------------+----+
only showing top 5 rows





### Drop

El método `drop` tiene la función contraria al `select`: elimina un subconjunto de columnas. En este caso no se puede pasar una lista de columnas; es necesario utilizar el operador `*` para convertirla en parámetros individuales.

**OJO:** Si se intenta eliminar una columna que no existe no devuelve error.

In [0]:
# Drop 'X' and 'Y'. 'Z' is not a column of this DataFrame; it is included to
# show that drop does not raise for names that do not exist.
vancouver_df = vancouver_df.drop(
    'X',
    'Y',
    'Z',
)
vancouver_df.show(n=3)

+--------------------+----+-----+---+----+------+--------------------+-------------------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|      NEIGHBOURHOOD|
+--------------------+----+-----+---+----+------+--------------------+-------------------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|               null|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|
+--------------------+----+-----+---+----+------+--------------------+-------------------+
only showing top 3 rows



In [0]:
# drop takes the names as separate arguments, so the list is unpacked with *.
drop_columns = [
    'Attack',
    'Defense',
    'Sp. Atk',
    'Sp. Def',
    'Speed',
]

pokemon_df.drop(*drop_columns).show(n=3)

+---------+------+------+---+----------+---------+
|     Name|Type 1|Type 2| HP|Generation|Legendary|
+---------+------+------+---+----------+---------+
|Bulbasaur| Grass|Poison| 45|         1|    false|
|  Ivysaur| Grass|Poison| 60|         1|    false|
| Venusaur| Grass|Poison| 80|         1|    false|
+---------+------+------+---+----------+---------+
only showing top 3 rows





## Renombrar columnas


### Una columna

El método para renombrar columnas en _pyspark_ es `withColumnRenamed`. Este método recibe dos parámetros, el nombre de la columna original y el nuevo; por tanto, sirve para renombrar una única columna.

In [0]:
# Rename one column for display only; pokemon_df is not reassigned here.
pokemon_df.withColumnRenamed(
    'Type 1',
    'type_1',
).show(n=3)

+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|     Name|type_1|Type 2| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|Bulbasaur| Grass|Poison| 45|    49|     49|     65|     65|   45|         1|    false|
|  Ivysaur| Grass|Poison| 60|    62|     63|     80|     80|   60|         1|    false|
| Venusaur| Grass|Poison| 80|    82|     83|    100|    100|   80|         1|    false|
+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
only showing top 3 rows



 

Recuerda que se pueden concatenar transformaciones.

In [0]:
# Two renames chained one after the other; the result is only displayed.
(
    vancouver_df
    .withColumnRenamed('YEAR', 'year')
    .withColumnRenamed('DAY', 'day')
    .show(n=3)
)

+--------------------+----+-----+---+----+------+--------------------+-------------------+
|                TYPE|year|MONTH|day|HOUR|MINUTE|       HUNDRED_BLOCK|      NEIGHBOURHOOD|
+--------------------+----+-----+---+----+------+--------------------+-------------------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|               null|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|
+--------------------+----+-----+---+----+------+--------------------+-------------------+
only showing top 3 rows





### Varias columnas

Por ejemplo, imaginemos que queremos convertir a minúsculas y sin espacios todos los nombres de las columnas. En spark hay dos opciones: un bucle con `withColumnRenamed` o utilizar el método `select` con `alias`.

Usando `withColumnRenamed`:

In [0]:
# Normalise every column name: lower case, spaces replaced by underscores and
# dots removed. pokemon_df is rebound to the renamed DataFrame on each pass.
for original_name in pokemon_df.columns:
    normalised_name = original_name.lower().replace(' ', '_').replace('.', '')
    pokemon_df = pokemon_df.withColumnRenamed(original_name, normalised_name)

In [0]:
# Check the renamed columns on the first three rows.
pokemon_df.show(n=3)

+---------+------+------+---+------+-------+------+------+-----+----------+---------+
|     name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+---------+------+------+---+------+-------+------+------+-----+----------+---------+
|Bulbasaur| Grass|Poison| 45|    49|     49|    65|    65|   45|         1|    false|
|  Ivysaur| Grass|Poison| 60|    62|     63|    80|    80|   60|         1|    false|
| Venusaur| Grass|Poison| 80|    82|     83|   100|   100|   80|         1|    false|
+---------+------+------+---+------+-------+------+------+-----+----------+---------+
only showing top 3 rows





El módulo `pyspark.sql.functions` contiene todas las funciones de spark implementadas para tratar con DataFrames. El método `alias` de una columna se puede utilizar junto con `select` para seleccionar una columna de un DataFrame cambiándole el nombre.

Usando `select` + `alias`:

In [0]:
from pyspark.sql import functions as F

In [0]:
# Column expression for 'TYPE' exposed under the name 'type'. As the last
# expression of the cell, its repr is what gets displayed.
F.col('TYPE').alias('type')

Column<b'TYPE AS `type`'>

In [0]:
# Select two columns and rename each one with alias in the same step.
vancouver_df.select(
    F.col('TYPE').alias('type'),
    F.col('NEIGHBOURHOOD').alias('neighbourhood'),
).show(n=3)

+--------------------+-------------------+
|                type|      neighbourhood|
+--------------------+-------------------+
|Offence Against a...|               null|
|  Theft from Vehicle|           West End|
|  Theft from Vehicle|Renfrew-Collingwood|
+--------------------+-------------------+
only showing top 3 rows



In [0]:
# One aliased Column per existing column: lower-cased name, spaces replaced
# by underscores. The list itself is displayed as the cell result.
[
    F.col(column_name).alias(column_name.lower().replace(' ', '_'))
    for column_name in vancouver_df.columns
]

[Column<b'TYPE AS `type`'>,
 Column<b'YEAR AS `year`'>,
 Column<b'MONTH AS `month`'>,
 Column<b'DAY AS `day`'>,
 Column<b'HOUR AS `hour`'>,
 Column<b'MINUTE AS `minute`'>,
 Column<b'HUNDRED_BLOCK AS `hundred_block`'>,
 Column<b'NEIGHBOURHOOD AS `neighbourhood`'>]

In [0]:
# Rename every column in a single select by aliasing each one to its
# lower-cased, underscore-separated name.
lowercase_aliases = [
    F.col(column_name).alias(column_name.lower().replace(' ', '_'))
    for column_name in vancouver_df.columns
]
vancouver_df = vancouver_df.select(lowercase_aliases)

In [0]:
# Check the renamed columns on the first three rows.
vancouver_df.show(n=3)

+--------------------+----+-----+---+----+------+--------------------+-------------------+
|                type|year|month|day|hour|minute|       hundred_block|      neighbourhood|
+--------------------+----+-----+---+----+------+--------------------+-------------------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|               null|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|
+--------------------+----+-----+---+----+------+--------------------+-------------------+
only showing top 3 rows





## Filtrar Registros

Los métodos `filter` y `where` se utilizan para quedarse con registros que cumplan cierta condición. Se pueden utilizar indistintamente. Para poner la condición es necesario usar la función `F.col()` para indicar a _spark_ el nombre de la columna del filtro.



__Valor exacto__

In [0]:
# Keep only the rows whose year is exactly 2008.
vancouver_2008 = vancouver_df.filter(
    F.col('year') == 2008
)
vancouver_2008.show(n=5)

+------------------+----+-----+---+----+------+-------------------+--------------------+
|              type|year|month|day|hour|minute|      hundred_block|       neighbourhood|
+------------------+----+-----+---+----+------+-------------------+--------------------+
|Theft from Vehicle|2008|    2| 29|  16|     0|   3XX W GEORGIA ST|Central Business ...|
|  Theft of Vehicle|2008|    5| 15|  22|    30|31XX WELLINGTON AVE| Renfrew-Collingwood|
|Theft from Vehicle|2008|    7| 11|  14|     0|    21XX W 10TH AVE|           Kitsilano|
|  Theft of Vehicle|2008|    6| 17|  17|     0|  31XX WAVERLEY AVE|           Killarney|
|  Theft of Vehicle|2008|    3| 20|  23|     0|31XX W BROADWAY AVE|           Kitsilano|
+------------------+----+-----+---+----+------+-------------------+--------------------+
only showing top 5 rows



# Nueva sección

In [0]:
# Number of rows in the DataFrame filtered to 2008.
vancouver_2008.count()

35413

In [0]:
# Number of rows in the unfiltered DataFrame, for comparison.
vancouver_df.count()

552055

In [0]:
# where behaves like filter: keep the rows whose name is exactly 'Pikachu'.
is_pikachu = F.col('name') == 'Pikachu'
pokemon_df.where(is_pikachu).show(n=10)

+-------+--------+------+---+------+-------+------+------+-----+----------+---------+
|   name|  type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+-------+--------+------+---+------+-------+------+------+-----+----------+---------+
|Pikachu|Electric|  null| 35|    55|     40|    50|    50|   90|         1|    false|
+-------+--------+------+---+------+-------+------+------+-----+----------+---------+



 

__Mayor/Menor que__

In [0]:
# Keep the rows from 2008 onwards (2008 included).
vancouver_more_2008 = vancouver_df.filter(
    F.col('year') >= 2008
)
vancouver_more_2008.show(n=5)

+------------------+----+-----+---+----+------+-------------------+--------------------+
|              type|year|month|day|hour|minute|      hundred_block|       neighbourhood|
+------------------+----+-----+---+----+------+-------------------+--------------------+
|Theft from Vehicle|2008|    2| 29|  16|     0|   3XX W GEORGIA ST|Central Business ...|
|  Theft of Vehicle|2008|    5| 15|  22|    30|31XX WELLINGTON AVE| Renfrew-Collingwood|
|Theft from Vehicle|2008|    7| 11|  14|     0|    21XX W 10TH AVE|           Kitsilano|
|  Theft of Vehicle|2008|    6| 17|  17|     0|  31XX WAVERLEY AVE|           Killarney|
|  Theft of Vehicle|2008|    3| 20|  23|     0|31XX W BROADWAY AVE|           Kitsilano|
+------------------+----+-----+---+----+------+-------------------+--------------------+
only showing top 5 rows



In [0]:
# Number of rows with year >= 2008.
vancouver_more_2008.count()

328184



__Contiene substring__

In [0]:
# LIKE pattern: each % matches any run of characters, so this keeps the rows
# whose type contains "heft".
type_contains_heft = F.col('type').like("%heft%")
vancouver_df.filter(type_contains_heft).show(n=5)

+------------------+----+-----+---+----+------+--------------------+-------------------+
|              type|year|month|day|hour|minute|       hundred_block|      neighbourhood|
+------------------+----+-----+---+----+------+--------------------+-------------------+
|Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|
|Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|
|Theft from Vehicle|2003|   10|  9|  16|     0|     16XX CHARLES ST| Grandview-Woodland|
|Theft from Vehicle|2003|    6|  3|  21|     0|       53XX CECIL ST|Renfrew-Collingwood|
|Theft from Vehicle|2003|    6|  7|  10|     0|    16XX CHESTNUT ST|          Kitsilano|
+------------------+----+-----+---+----+------+--------------------+-------------------+
only showing top 5 rows





__Valor en/no en lista__

Usa el operador `~` para indicar negación.

In [0]:
# Keep the rows whose neighbourhood is one of the listed values.
neighbourhoods = ['West End', 'Kitsilano', 'Killarney']
in_listed_neighbourhoods = F.col('neighbourhood').isin(neighbourhoods)

vancouver_df.where(in_listed_neighbourhoods).show(n=5)

+------------------+----+-----+---+----+------+--------------------+-------------+
|              type|year|month|day|hour|minute|       hundred_block|neighbourhood|
+------------------+----+-----+---+----+------+--------------------+-------------+
|Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|     West End|
|          Mischief|2003|   11| 18|  12|     0| 34XX W BROADWAY AVE|    Kitsilano|
|Theft from Vehicle|2003|    6|  7|  10|     0|    16XX CHESTNUT ST|    Kitsilano|
|Theft from Vehicle|2003|   12|  2|  15|    35|CARDERO ST / ROBS...|     West End|
|          Mischief|2003|   11|  2|   5|    10| 34XX W BROADWAY AVE|    Kitsilano|
+------------------+----+-----+---+----+------+--------------------+-------------+
only showing top 5 rows



In [0]:
# ~ negates the condition: keep the rows whose year is NOT in the list.
excluded_years = [2005, 2003]
vancouver_df.filter(~F.col('year').isin(excluded_years)).show(n=5)

+--------------------+----+-----+---+----+------+--------------------+--------------------+
|                type|year|month|day|hour|minute|       hundred_block|       neighbourhood|
+--------------------+----+-----+---+----+------+--------------------+--------------------+
|  Theft from Vehicle|2004|   11| 15|   6|    30|      10XX BEACH AVE|Central Business ...|
|Vehicle Collision...|2004|    1| 27|  10|    45|KINGSWAY AVE / WI...|Kensington-Cedar ...|
|Offence Against a...|2004|    8| 10|null|  null|OFFSET TO PROTECT...|                null|
|  Theft from Vehicle|2004|   11|  6|  21|     0|      10XX BEACH AVE|Central Business ...|
|Offence Against a...|2004|    8| 10|null|  null|OFFSET TO PROTECT...|                null|
+--------------------+----+-----+---+----+------+--------------------+--------------------+
only showing top 5 rows





__Combinación de filtros (AND / OR)__

In [0]:
# AND of two conditions: both must hold for a row to be kept.
is_homicide = F.col('type') == 'Homicide'
is_year_2007 = F.col('year') == 2007
vancouver_df.where(is_homicide & is_year_2007).show(n=3)

+--------+----+-----+---+----+------+--------------------+-------------+
|    type|year|month|day|hour|minute|       hundred_block|neighbourhood|
+--------+----+-----+---+----+------+--------------------+-------------+
|Homicide|2007|    8| 15|null|  null|OFFSET TO PROTECT...|         null|
|Homicide|2007|    3| 24|null|  null|OFFSET TO PROTECT...|         null|
|Homicide|2007|    6| 13|null|  null|OFFSET TO PROTECT...|         null|
+--------+----+-----+---+----+------+--------------------+-------------+
only showing top 3 rows



In [0]:
# OR of two conditions: a row is kept when either one holds. Each comparison
# stays in its own parentheses because | binds tighter than ==.
vancouver_df.filter(
    (F.col('year') == 2006)
    | (F.col('year') == 2007)
).show(n=3)

+--------------------+----+-----+---+----+------+--------------------+------------------+
|                type|year|month|day|hour|minute|       hundred_block|     neighbourhood|
+--------------------+----+-----+---+----+------+--------------------+------------------+
|Break and Enter R...|2006|    7| 23|  14|    30|     19XX W 35TH AVE|       Shaughnessy|
|  Theft from Vehicle|2006|    9| 27|   7|    17|51XX PRINCE EDWAR...|        Riley Park|
|  Theft from Vehicle|2006|   10| 22|  20|    30|        20XX WALL ST|Grandview-Woodland|
+--------------------+----+-----+---+----+------+--------------------+------------------+
only showing top 3 rows



 

## Registros Duplicados

Una de las fases del data wrangling es la identificación y eliminación de registros duplicados. 

__dropDuplicates__

`dropDuplicates` toma un subconjunto de columnas para identificar duplicados y devuelve un nuevo DataFrame sin los registros duplicados.

In [0]:
# Row count before removing duplicates.
pokemon_df.count()

800

In [0]:
# With no arguments, dropDuplicates compares whole rows (every column).
pokemon_nodup = pokemon_df.dropDuplicates()
pokemon_nodup.show(n=3)

+--------+------+------+---+------+-------+------+------+-----+----------+---------+
|    name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+--------+------+------+---+------+-------+------+------+-----+----------+---------+
|  Cleffa| Fairy|  null| 50|    25|     28|    45|    55|   15|         2|    false|
| Wailmer| Water|  null|130|    70|     35|    70|    35|   60|         3|    false|
|Froslass|   Ice| Ghost| 70|    80|     70|    80|    70|  110|         4|    false|
+--------+------+------+---+------+-------+------+------+-----+----------+---------+
only showing top 3 rows



In [0]:
# Row count after dropDuplicates, to compare with the count taken before it.
pokemon_nodup.count()

800

 

No hay duplicados teniendo en cuenta todas las columnas. Considerando únicamente las columnas *type_1* y *type_2* hay varios.

In [0]:
# Only the two type columns decide whether two rows count as duplicates.
type_columns = ['type_1', 'type_2']
pokemon_nodup.dropDuplicates(subset=type_columns).count()

154



__distinct__

Una llamada al método `distinct` es lo mismo que al método `dropDuplicates` sin parámetro. Es decir, tiene en cuenta todas las columnas. También se utiliza normalmente para contar los valores únicos de una columna.

In [0]:
# Number of distinct rows, taking every column into account.
pokemon_df.distinct().count()

800

In [0]:
# Number of different values in the type_1 column.
(
    pokemon_df
    .select('type_1')
    .distinct()
    .count()
)

18

In [0]:
# List the different values of the year column.
(
    vancouver_df
    .select('year')
    .distinct()
    .show()
)

+----+
|year|
+----+
|2003|
|2007|
|2018|
|2015|
|2006|
|2013|
|2014|
|2004|
|2012|
|2009|
|2016|
|2005|
|2010|
|2011|
|2008|
|2017|
+----+



In [0]:
# Number of different values in the year column.
(
    vancouver_df
    .select('year')
    .distinct()
    .count()
)

16



## Ordenar DataFrames

Ambos métodos `sort` y `orderBy` pueden ser usados indistintamente para ordenar DataFrames. Se utilizan los métodos `asc` y `desc` sobre las columnas para indicar si el orden es ascendente o descendente. Se puede ordenar por múltiples columnas.

In [0]:
# Highest attack first.
attack_descending = F.col('attack').desc()
pokemon_df.orderBy(attack_descending).show(n=10)

+-------------------+-------+--------+---+------+-------+------+------+-----+----------+---------+
|               name| type_1|  type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+-------------------+-------+--------+---+------+-------+------+------+-----+----------+---------+
|      Mega Mewtwo X|Psychic|Fighting|106|   190|    100|   154|   100|  130|         1|     true|
|     Mega Heracross|    Bug|Fighting| 80|   185|    115|    40|   105|   75|         2|    false|
|     Primal Groudon| Ground|    Fire|100|   180|    160|   150|    90|   90|         3|     true|
|      Mega Rayquaza| Dragon|  Flying|105|   180|    100|   180|   100|  115|         3|     true|
| DeoxysAttack Forme|Psychic|    null| 50|   180|     20|   180|    20|  150|         3|     true|
|Kyurem Black Kyurem| Dragon|     Ice|125|   170|    100|   120|    90|   95|         5|     true|
|      Mega Garchomp| Dragon|  Ground|108|   170|    115|   120|    95|   92|         4|    false|
|       Me

In [0]:
# Lowest attack first; sort is used here in place of orderBy.
pokemon_df.sort(
    F.col('attack').asc()
).show(n=5)

+--------+------+------+---+------+-------+------+------+-----+----------+---------+
|    name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+--------+------+------+---+------+-------+------+------+-----+----------+---------+
| Happiny|Normal|  null|100|     5|      5|    15|    65|   30|         4|    false|
| Chansey|Normal|  null|250|     5|      5|    35|   105|   50|         1|    false|
| Shuckle|   Bug|  Rock| 20|    10|    230|    10|   230|    5|         2|    false|
|Magikarp| Water|  null| 20|    10|     55|    15|    20|   80|         1|    false|
| Blissey|Normal|  null|255|    10|     10|    75|   135|   55|         2|    false|
+--------+------+------+---+------+-------+------+------+-----+----------+---------+
only showing top 5 rows



In [0]:
# Two sort keys: attack ascending first, then hp descending to break ties.
pokemon_df.sort(
    F.col('attack').asc(),
    F.col('hp').desc(),
).show(n=5)

+--------+------+------+---+------+-------+------+------+-----+----------+---------+
|    name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+--------+------+------+---+------+-------+------+------+-----+----------+---------+
| Chansey|Normal|  null|250|     5|      5|    35|   105|   50|         1|    false|
| Happiny|Normal|  null|100|     5|      5|    15|    65|   30|         4|    false|
| Blissey|Normal|  null|255|    10|     10|    75|   135|   55|         2|    false|
| Shuckle|   Bug|  Rock| 20|    10|    230|    10|   230|    5|         2|    false|
|Magikarp| Water|  null| 20|    10|     55|    15|    20|   80|         1|    false|
+--------+------+------+---+------+-------+------+------+-----+----------+---------+
only showing top 5 rows





## Agrupar registros

`groupBy` sirve para agrupar los datos sobre los campos indicados haciendo una operación sobre ellos. Las operaciones a calcular se indican dentro de `agg()` y deben encontrarse dentro de `pyspark.sql.functions`. 

In [0]:
# Average attack for each value of type_1.
mean_attack = F.avg('attack')
pokemon_df.groupBy('type_1').agg(mean_attack).show(n=5)

+------+-----------------+
|type_1|      avg(attack)|
+------+-----------------+
| Water|74.15178571428571|
|Poison|74.67857142857143|
| Steel|92.70370370370371|
|  Rock|92.86363636363636|
|   Ice|            72.75|
+------+-----------------+
only showing top 5 rows



# Nueva sección



Se puede utilizar `alias` para definir el nombre de la columna de salida.

In [0]:
# Average defense per type_1, with the result column named 'avg_def'.
(
    pokemon_df
    .groupBy('type_1')
    .agg(F.avg('defense').alias('avg_def'))
    .show(n=5)
)

+------+------------------+
|type_1|           avg_def|
+------+------------------+
| Water| 72.94642857142857|
|Poison| 68.82142857142857|
| Steel|126.37037037037037|
|  Rock|100.79545454545455|
|   Ice| 71.41666666666667|
+------+------------------+
only showing top 5 rows





Se pueden incluir en el groupBy múltiples columnas y operaciones.

In [0]:
# Group by two columns and take the maximum hp of each group.
(
    pokemon_df
    .groupBy('type_1', 'legendary')
    .agg(F.max('hp'))
    .show(n=5)
)

+-------+---------+-------+
| type_1|legendary|max(hp)|
+-------+---------+-------+
|   Dark|    false|    110|
|Psychic|     true|    106|
|  Steel|    false|     80|
|   Fire|    false|    110|
|  Water|    false|    170|
+-------+---------+-------+
only showing top 5 rows



In [0]:
# Several aggregations in one agg call, one output column per aggregation.
pokemon_df.groupBy('legendary').agg(
    F.avg('attack'),
    F.avg('defense'),
    F.avg('hp'),
).show()

+---------+------------------+-----------------+-----------------+
|legendary|       avg(attack)|     avg(defense)|          avg(hp)|
+---------+------------------+-----------------+-----------------+
|     true|116.67692307692307|99.66153846153846|92.73846153846154|
|    false| 75.66938775510204|71.55918367346939|67.18231292517007|
+---------+------------------+-----------------+-----------------+





El caso excepcional es si la operación es un `count` que no necesita el método `agg`.

In [0]:
# count is called directly on the grouped data, without agg.
rows_per_year = vancouver_df.groupBy('year').count()
rows_per_year.show()

+----+-----+
|year|count|
+----+-----+
|2003|49926|
|2007|37679|
|2018| 2561|
|2015|34342|
|2006|42323|
|2013|29067|
|2014|32644|
|2004|49278|
|2012|29224|
|2009|32164|
|2016|37823|
|2005|44665|
|2010|29705|
|2011|28570|
|2008|35413|
|2017|36671|
+----+-----+





Recuerda que se pueden concatenar transformaciones.

In [0]:
# Rows per year, ordered from the most recent year to the oldest.
(
    vancouver_df
    .groupBy('year')
    .count()
    .orderBy(F.col('year').desc())
    .show()
)

+----+-----+
|year|count|
+----+-----+
|2018| 2561|
|2017|36671|
|2016|37823|
|2015|34342|
|2014|32644|
|2013|29067|
|2012|29224|
|2011|28570|
|2010|29705|
|2009|32164|
|2008|35413|
|2007|37679|
|2006|42323|
|2005|44665|
|2004|49278|
|2003|49926|
+----+-----+





## Limitar el número de registros

El método `limit` devuelve un nuevo DataFrame con únicamente _N_ filas.

# Nueva sección

# Nueva sección

In [0]:
# Row count of the full DataFrame, before applying limit.
vancouver_df.count()

552055

In [0]:
# limit returns a new DataFrame holding at most 1000 rows.
vancouver_1000 = vancouver_df.limit(1000)
vancouver_1000.show(n=5)

+--------------------+----+-----+---+----+------+--------------------+--------------------+
|                type|year|month|day|hour|minute|       hundred_block|       neighbourhood|
+--------------------+----+-----+---+----+------+--------------------+--------------------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|                null|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|            West End|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST| Renfrew-Collingwood|
|Break and Enter R...|2003|    3|  8|   4|    15|     19XX E 12TH AVE|Kensington-Cedar ...|
|  Theft from Vehicle|2003|   10|  9|  16|     0|     16XX CHARLES ST|  Grandview-Woodland|
+--------------------+----+-----+---+----+------+--------------------+--------------------+
only showing top 5 rows



In [0]:
# Row count of the limited DataFrame.
vancouver_1000.count()

1000