# Transformaciones con Pyspark 2

---

## Primero lo primero... importamos los módulos que usaremos

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, lit, concat

from pyspark.sql.types import IntegerType, StringType, FloatType

## Nos conectamos:

In [2]:
# Build the Spark entry point in one fluent call; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark_session = SparkSession.builder.appName('data engineering training').getOrCreate()

# Lower-level handles derived from the session, kept for the cells below.
spark_context = spark_session.sparkContext
sql_context = SQLContext(spark_context)



## Leemos el dataframe

In [3]:
# Load the CSV using its first line as column names. No schema is given or
# inferred, so every column (including 'score' and 'date') arrives as a string.
csv_reader = sql_context.read.format('csv').option('header', True)
scores_df = csv_reader.load('student_scores_2020.csv')

scores_df.show()

+---+----------+----------------+---------+----+----------+-----+
|_c0|student_id|    student_name|  subject|year|      date|score|
+---+----------+----------------+---------+----+----------+-----+
|  0|         1|    Ada Lovelace|Filosofía|2020|2020-03-31|  4.4|
|  1|         2|     Hedy Lamarr|Filosofía|2020|2020-03-31|  5.7|
|  2|         3|     Jude Milhon|Filosofía|2020|2020-03-31|  3.8|
|  3|         4|     Ángela Ruiz|Filosofía|2020|2020-03-31|  6.9|
|  4|         5|      Carol Shaw|Filosofía|2020|2020-03-31|  6.8|
|  5|         6| Marisol Alarcón|Filosofía|2020|2020-03-31|  6.1|
|  6|         7|    Katie Bouman|Filosofía|2020|2020-03-31|  3.4|
|  7|         8|     Alan Turing|Filosofía|2020|2020-03-31|  6.7|
|  8|         9|Jhon von Neumann|Filosofía|2020|2020-03-31|  6.3|
|  9|        10|Richard Stallman|Filosofía|2020|2020-03-31|  3.8|
| 10|         1|    Ada Lovelace|Filosofía|2020|2020-06-31|  5.7|
| 11|         2|     Hedy Lamarr|Filosofía|2020|2020-06-31|  4.4|
| 12|     


---

In [21]:
# A list can mix element types freely: ints, a string and even another list.
my_list = [
    1,
    2,
    3,
    'hola',
    [],
]

my_list

[1, 2, 3, 'hola', []]

In [24]:
# Lists are mutable: an existing position can be overwritten in place.
my_list[2] = 6538432849327

In [25]:
# Display the list again; index 2 now holds the value assigned above.
my_list

[1, 2, 6538432849327, 'hola', []]

In [27]:
# Tuples, like lists, accept mixed element types (here: ints, a string,
# an empty list and an empty tuple).
my_tuple = (
    1, 2, 3, 4, 5,
    'fdashldsa',
    [],
    (),
)

my_tuple

(1, 2, 3, 4, 5, 'fdashldsa', [], ())

In [29]:
# Tuples are immutable, so item assignment raises TypeError. The error is the
# point of this cell: catch it and print the message so the demonstration is
# still visible without aborting a "Restart & Run All".
try:
    my_tuple[2] = 47109437021
except TypeError as error:
    print('TypeError:', error)

TypeError: 'tuple' object does not support item assignment

In [30]:
# Building a set from values with repeats: duplicates collapse, so only the
# distinct elements remain.
my_set = set([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])

my_set

{1, 2, 3, 4, 5}

In [None]:
# A dictionary maps keys to values; this one has a single entry.
my_dict = dict(key1='value')

---

In [20]:
# head(n) brings the first n rows to the driver as a plain Python list of Row
# objects, so they can be iterated like any other list.
row_list = scores_df.head(n=5)

# Print each Row on its own line, prefixed so it stands out in the output.
for row in row_list:
    print('*****', row)



***** Row(_c0='0', student_id='1', student_name='Ada Lovelace', subject='Filosofía', year='2020', date='2020-03-31', score='4.4')
***** Row(_c0='1', student_id='2', student_name='Hedy Lamarr', subject='Filosofía', year='2020', date='2020-03-31', score='5.7')
***** Row(_c0='2', student_id='3', student_name='Jude Milhon', subject='Filosofía', year='2020', date='2020-03-31', score='3.8')
***** Row(_c0='3', student_id='4', student_name='Ángela Ruiz', subject='Filosofía', year='2020', date='2020-03-31', score='6.9')
***** Row(_c0='4', student_id='5', student_name='Carol Shaw', subject='Filosofía', year='2020', date='2020-03-31', score='6.8')


In [33]:
# Render the DataFrame as a text table (20 rows is show()'s default).
scores_df.show(n=20)

+---+----------+----------------+---------+----+----------+-----+
|_c0|student_id|    student_name|  subject|year|      date|score|
+---+----------+----------------+---------+----+----------+-----+
|  0|         1|    Ada Lovelace|Filosofía|2020|2020-03-31|  4.4|
|  1|         2|     Hedy Lamarr|Filosofía|2020|2020-03-31|  5.7|
|  2|         3|     Jude Milhon|Filosofía|2020|2020-03-31|  3.8|
|  3|         4|     Ángela Ruiz|Filosofía|2020|2020-03-31|  6.9|
|  4|         5|      Carol Shaw|Filosofía|2020|2020-03-31|  6.8|
|  5|         6| Marisol Alarcón|Filosofía|2020|2020-03-31|  6.1|
|  6|         7|    Katie Bouman|Filosofía|2020|2020-03-31|  3.4|
|  7|         8|     Alan Turing|Filosofía|2020|2020-03-31|  6.7|
|  8|         9|Jhon von Neumann|Filosofía|2020|2020-03-31|  6.3|
|  9|        10|Richard Stallman|Filosofía|2020|2020-03-31|  3.8|
| 10|         1|    Ada Lovelace|Filosofía|2020|2020-06-31|  5.7|
| 11|         2|     Hedy Lamarr|Filosofía|2020|2020-06-31|  4.4|
| 12|     

NoneType

## 2) Fe de Erratas: Las funciones que se importan nos sirven para usarlas dentro de otras funciones.

In [54]:

# Spark aggregates are imported under `spark_*` aliases so they do not shadow
# Python's builtin min / max / sum / round.
from pyspark.sql.functions import (
    min as spark_min,
    max as spark_max,
    avg as spark_avg,
    avg as spark_svg,  # misspelled alias kept for backward compatibility
    sum as spark_sum,
    round as spark_round,
    row_number
)


In [47]:
# Python's builtin round is still available: the Spark version was imported
# under a different name (spark_round), so nothing was shadowed.
round(5.48379017492074, ndigits=3)

5.484

In [55]:
# Whole-table aggregates over 'score'. The aliased Spark functions must be
# used here: bare sum / min / max are Python builtins that cannot operate on a
# Column, and `avg` is not imported under that name.
# NOTE(review): 'score' is still a string column at this point (see the Row
# output above), so min/max compare text; that matches numeric order only while
# every score has the same "d.d" shape - confirm or cast first.
(
    scores_df
        .agg(
            spark_round(spark_sum(col('score'))),  # rounded to 0 decimals by default
            spark_min(col('score')),
            spark_max(col('score')),
            spark_avg(col('score')),
        )
).show()

+--------------------+----------+----------+-----------------+
|round(sum(score), 0)|min(score)|max(score)|       avg(score)|
+--------------------+----------+----------+-----------------+
|              1210.0|       3.0|       6.9|5.042083333333329|
+--------------------+----------+----------+-----------------+



## group by


In [56]:
 # Average score for every (student, subject) pair. 'score' is cast to float
 # first because the CSV was read without a schema, leaving it as a string.
 (
     scores_df
         .withColumn('score', col('score').cast(FloatType()))
         .groupBy('student_name', 'subject')
         .avg('score')
 ).show()

+----------------+-------------------+-----------------+
|    student_name|            subject|       avg(score)|
+----------------+-------------------+-----------------+
|     Ángela Ruiz|       Neurociencia|5.074999928474426|
|     Hedy Lamarr|         Psicología|5.174999952316284|
| Marisol Alarcón|   Educación Cívica|5.200000047683716|
|     Alan Turing|       Neurociencia|5.075000047683716|
|    Ada Lovelace|Finanzas y Economía|5.025000035762787|
|    Ada Lovelace|       Programación|4.674999952316284|
|     Jude Milhon|       Neurociencia|5.024999976158142|
|Jhon von Neumann|         Psicología|5.274999976158142|
|Richard Stallman|Finanzas y Economía|4.400000035762787|
|    Katie Bouman|Finanzas y Economía|             5.25|
|      Carol Shaw|          Filosofía|5.150000095367432|
|     Jude Milhon|   Educación Cívica|5.199999988079071|
|     Hedy Lamarr|          Filosofía|5.425000071525574|
| Marisol Alarcón|Finanzas y Economía|5.275000035762787|
|     Jude Milhon|       Progra

In [None]:
# Same aggregation as the previous cell, but stored in a variable.
# The result gets its own name instead of overwriting `scores_df`: the
# aggregated frame only has student_name / subject / avg(score), and the window
# function cells further down still need the original columns (e.g. 'date').
avg_scores_df = (
    scores_df
        .withColumn('score', col('score').cast(FloatType()))
        .groupBy('student_name', 'subject')
        .avg('score')
)

avg_scores_df.show()

+----------------+-------------------+-----------------+
|    student_name|            subject|       avg(score)|
+----------------+-------------------+-----------------+
|     Ángela Ruiz|       Neurociencia|5.074999928474426|
|     Hedy Lamarr|         Psicología|5.174999952316284|
| Marisol Alarcón|   Educación Cívica|5.200000047683716|
|     Alan Turing|       Neurociencia|5.075000047683716|
|    Ada Lovelace|Finanzas y Economía|5.025000035762787|
|    Ada Lovelace|       Programación|4.674999952316284|
|     Jude Milhon|       Neurociencia|5.024999976158142|
|Jhon von Neumann|         Psicología|5.274999976158142|
|Richard Stallman|Finanzas y Economía|4.400000035762787|
|    Katie Bouman|Finanzas y Economía|             5.25|
|      Carol Shaw|          Filosofía|5.150000095367432|
|     Jude Milhon|   Educación Cívica|5.199999988079071|
|     Hedy Lamarr|          Filosofía|5.425000071525574|
| Marisol Alarcón|Finanzas y Economía|5.275000035762787|
|     Jude Milhon|       Progra

## Window Functions

In [10]:
from pyspark.sql.window import Window

In [57]:
# One partition per (student, subject), ordered chronologically. 'date' is a
# string column here, but its YYYY-MM-DD layout sorts in date order.
student_window = Window.partitionBy('student_name', 'subject').orderBy('date')


# Because the window is ordered, the average is cumulative: each row's 'prom'
# covers that row and every earlier date in its partition (a running mean).
# `spark_avg` is the aliased Spark aggregate; a bare `avg` is not imported.
(
    scores_df
        .withColumn('prom', spark_avg('score').over(student_window))
).show()

+---+----------+---------------+-------------------+----+----------+-----+-----------------+
|_c0|student_id|   student_name|            subject|year|      date|score|             prom|
+---+----------+---------------+-------------------+----+----------+-----+-----------------+
|203|         4|    Ángela Ruiz|       Neurociencia|2020|2020-03-31|  5.6|              5.6|
|213|         4|    Ángela Ruiz|       Neurociencia|2020|2020-06-31|  4.7|             5.15|
|223|         4|    Ángela Ruiz|       Neurociencia|2020|2020-09-20|  4.3|4.866666666666667|
|233|         4|    Ángela Ruiz|       Neurociencia|2020|2020-12-17|  5.7|            5.075|
|161|         2|    Hedy Lamarr|         Psicología|2020|2020-03-31|  4.5|              4.5|
|171|         2|    Hedy Lamarr|         Psicología|2020|2020-06-31|  6.2|             5.35|
|181|         2|    Hedy Lamarr|         Psicología|2020|2020-09-20|  3.4|              4.7|
|191|         2|    Hedy Lamarr|         Psicología|2020|2020-12-17|  

In [69]:
# Newest-first ordering inside each (student, subject) partition.
student_window = (
    Window
        .partitionBy('student_name', 'subject')
        .orderBy(col('date').desc())
)

# Number the rows of every partition, starting at 1 for the most recent date.
# To keep only the latest score per partition, chain this before .show():
#     .where(col('row_number') == 1)
scores_df.withColumn('row_number', row_number().over(student_window)).show()

+---+----------+---------------+-------------------+----+----------+-----+----------+
|_c0|student_id|   student_name|            subject|year|      date|score|row_number|
+---+----------+---------------+-------------------+----+----------+-----+----------+
|233|         4|    Ángela Ruiz|       Neurociencia|2020|2020-12-17|  5.7|         1|
|223|         4|    Ángela Ruiz|       Neurociencia|2020|2020-09-20|  4.3|         2|
|213|         4|    Ángela Ruiz|       Neurociencia|2020|2020-06-31|  4.7|         3|
|203|         4|    Ángela Ruiz|       Neurociencia|2020|2020-03-31|  5.6|         4|
|191|         2|    Hedy Lamarr|         Psicología|2020|2020-12-17|  6.6|         1|
|181|         2|    Hedy Lamarr|         Psicología|2020|2020-09-20|  3.4|         2|
|171|         2|    Hedy Lamarr|         Psicología|2020|2020-06-31|  6.2|         3|
|161|         2|    Hedy Lamarr|         Psicología|2020|2020-03-31|  4.5|         4|
|115|         6|Marisol Alarcón|   Educación Cívica|20

In [None]:

the zen of python

estructura SQL
crear columna
renombrar columnas

quitar columnas
seleccionar columnas

cambiar tipos de datos (castear)

funciones de pyspark (concat, col, lit, etc.)
group by
join
window function

transformaciones con SQL en el where