# Transformaciones con Pyspark 2

---

## Primero lo primero... importamos los módulos que usaremos

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, lit, concat

from pyspark.sql.types import IntegerType, StringType, FloatType

## Nos conectamos:

In [2]:
# spark_session = SparkSession.builder.appName('data engineering training').getOrCreate()
spark_session = (
    SparkSession
        .builder
        .appName('data engineering training')
        .getOrCreate()
)
spark_context = spark_session.sparkContext
sql_context = SQLContext(spark_context)



## Leemos el dataframe

In [3]:
scores_df = (
    sql_context
        .read
        .option('header', True)
        .csv('student_scores_2020.csv')
)

scores_df.show()

+---+----------+----------------+---------+----+----------+-----+
|_c0|student_id|    student_name|  subject|year|      date|score|
+---+----------+----------------+---------+----+----------+-----+
|  0|         1|    Ada Lovelace|Filosofía|2020|2020-03-31|  4.4|
|  1|         2|     Hedy Lamarr|Filosofía|2020|2020-03-31|  5.7|
|  2|         3|     Jude Milhon|Filosofía|2020|2020-03-31|  3.8|
|  3|         4|     Ángela Ruiz|Filosofía|2020|2020-03-31|  6.9|
|  4|         5|      Carol Shaw|Filosofía|2020|2020-03-31|  6.8|
|  5|         6| Marisol Alarcón|Filosofía|2020|2020-03-31|  6.1|
|  6|         7|    Katie Bouman|Filosofía|2020|2020-03-31|  3.4|
|  7|         8|     Alan Turing|Filosofía|2020|2020-03-31|  6.7|
|  8|         9|Jhon von Neumann|Filosofía|2020|2020-03-31|  6.3|
|  9|        10|Richard Stallman|Filosofía|2020|2020-03-31|  3.8|
| 10|         1|    Ada Lovelace|Filosofía|2020|2020-06-31|  5.7|
| 11|         2|     Hedy Lamarr|Filosofía|2020|2020-06-31|  4.4|
| 12|     


---

In [4]:
my_list = [1, 2, 3, 'hola', []]

my_list

[1, 2, 3, 'hola', []]

In [5]:
my_list[2] = 6538432849327

In [6]:
my_list

[1, 2, 6538432849327, 'hola', []]

In [7]:
my_tuple = (1,2,3,4,5,'fdashldsa', [], ())
my_tuple

(1, 2, 3, 4, 5, 'fdashldsa', [], ())

In [8]:
# my_tuple[2] = 47109437021

In [9]:
my_set = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5}

my_set

{1, 2, 3, 4, 5}

In [10]:
my_dict = {'key1':'value', }

---

In [11]:
row_list = scores_df.head(5)

for row in row_list:
    print('*****', row)




***** Row(_c0='0', student_id='1', student_name='Ada Lovelace', subject='Filosofía', year='2020', date='2020-03-31', score='4.4')
***** Row(_c0='1', student_id='2', student_name='Hedy Lamarr', subject='Filosofía', year='2020', date='2020-03-31', score='5.7')
***** Row(_c0='2', student_id='3', student_name='Jude Milhon', subject='Filosofía', year='2020', date='2020-03-31', score='3.8')
***** Row(_c0='3', student_id='4', student_name='Ángela Ruiz', subject='Filosofía', year='2020', date='2020-03-31', score='6.9')
***** Row(_c0='4', student_id='5', student_name='Carol Shaw', subject='Filosofía', year='2020', date='2020-03-31', score='6.8')


In [12]:
scores_df.show()

+---+----------+----------------+---------+----+----------+-----+
|_c0|student_id|    student_name|  subject|year|      date|score|
+---+----------+----------------+---------+----+----------+-----+
|  0|         1|    Ada Lovelace|Filosofía|2020|2020-03-31|  4.4|
|  1|         2|     Hedy Lamarr|Filosofía|2020|2020-03-31|  5.7|
|  2|         3|     Jude Milhon|Filosofía|2020|2020-03-31|  3.8|
|  3|         4|     Ángela Ruiz|Filosofía|2020|2020-03-31|  6.9|
|  4|         5|      Carol Shaw|Filosofía|2020|2020-03-31|  6.8|
|  5|         6| Marisol Alarcón|Filosofía|2020|2020-03-31|  6.1|
|  6|         7|    Katie Bouman|Filosofía|2020|2020-03-31|  3.4|
|  7|         8|     Alan Turing|Filosofía|2020|2020-03-31|  6.7|
|  8|         9|Jhon von Neumann|Filosofía|2020|2020-03-31|  6.3|
|  9|        10|Richard Stallman|Filosofía|2020|2020-03-31|  3.8|
| 10|         1|    Ada Lovelace|Filosofía|2020|2020-06-31|  5.7|
| 11|         2|     Hedy Lamarr|Filosofía|2020|2020-06-31|  4.4|
| 12|     

## 2) Fe de Erratas: Las funciones que se importan nos sirven para usarlas dentro de otras funciones.

In [13]:

from pyspark.sql.functions import (
    min as spark_min,
    max as spark_max,
    avg as spark_avg,
    sum as spark_sum,
    round as spark_round,
    row_number
)


In [14]:
round(5.48379017492074, 3)

5.484

In [15]:
(
    scores_df
        .agg(spark_round(spark_sum(col('score'))), spark_min(col('score')), spark_max(col('score')), spark_avg(col('score')))
).show()

+--------------------+----------+----------+-----------------+
|round(sum(score), 0)|min(score)|max(score)|       avg(score)|
+--------------------+----------+----------+-----------------+
|              1210.0|       3.0|       6.9|5.042083333333329|
+--------------------+----------+----------+-----------------+



## group by


In [16]:
 (
scores_df
    .withColumn('score', col('score').cast(FloatType()))
    .groupBy('student_name', 'subject').avg('score')
 ).show()

+----------------+-------------------+-----------------+
|    student_name|            subject|       avg(score)|
+----------------+-------------------+-----------------+
|     Ángela Ruiz|       Neurociencia|5.074999928474426|
|     Hedy Lamarr|         Psicología|5.174999952316284|
| Marisol Alarcón|   Educación Cívica|5.200000047683716|
|     Alan Turing|       Neurociencia|5.075000047683716|
|    Ada Lovelace|Finanzas y Economía|5.025000035762787|
|    Ada Lovelace|       Programación|4.674999952316284|
|     Jude Milhon|       Neurociencia|5.024999976158142|
|Jhon von Neumann|         Psicología|5.274999976158142|
|Richard Stallman|Finanzas y Economía|4.400000035762787|
|    Katie Bouman|Finanzas y Economía|             5.25|
|      Carol Shaw|          Filosofía|5.150000095367432|
|     Jude Milhon|   Educación Cívica|5.199999988079071|
|     Hedy Lamarr|          Filosofía|5.425000071525574|
| Marisol Alarcón|Finanzas y Economía|5.275000035762787|
|     Jude Milhon|       Progra

In [17]:
# scores_df = scores_df.withColumn('score', col('score').cast(FloatType()))
# scores_df = scores_df.groupBy('student_name', 'subject').avg('score')

# scores_df.show()

## Windows Functions

In [18]:
from pyspark.sql.window import Window

In [19]:
from pyspark.sql.functions import row_number

In [20]:
student_window = Window.partitionBy('student_name', 'subject').orderBy(col('date').desc())


(
    scores_df
        .withColumn('prom', spark_avg('score').over(student_window))
        .withColumn('sum', spark_sum('score').over(student_window))
        .withColumn('row_number', row_number().over(student_window))
).show()

+---+----------+---------------+-------------------+----+----------+-----+------------------+------------------+----------+
|_c0|student_id|   student_name|            subject|year|      date|score|              prom|               sum|row_number|
+---+----------+---------------+-------------------+----+----------+-----+------------------+------------------+----------+
|233|         4|    Ángela Ruiz|       Neurociencia|2020|2020-12-17|  5.7|               5.7|               5.7|         1|
|223|         4|    Ángela Ruiz|       Neurociencia|2020|2020-09-20|  4.3|               5.0|              10.0|         2|
|213|         4|    Ángela Ruiz|       Neurociencia|2020|2020-06-31|  4.7|4.8999999999999995|              14.7|         3|
|203|         4|    Ángela Ruiz|       Neurociencia|2020|2020-03-31|  5.6| 5.074999999999999|20.299999999999997|         4|
|191|         2|    Hedy Lamarr|         Psicología|2020|2020-12-17|  6.6|               6.6|               6.6|         1|
|181|   

In [21]:
student_window = Window.partitionBy('student_name', 'subject').orderBy(col('date').asc())


(
    scores_df
        .withColumn('row_number', row_number().over(student_window))
        .where(col('row_number') == 1)
).show()

+---+----------+----------------+-------------------+----+----------+-----+----------+
|_c0|student_id|    student_name|            subject|year|      date|score|row_number|
+---+----------+----------------+-------------------+----+----------+-----+----------+
|203|         4|     Ángela Ruiz|       Neurociencia|2020|2020-03-31|  5.6|         1|
|161|         2|     Hedy Lamarr|         Psicología|2020|2020-03-31|  4.5|         1|
| 85|         6| Marisol Alarcón|   Educación Cívica|2020|2020-03-31|  4.8|         1|
|207|         8|     Alan Turing|       Neurociencia|2020|2020-03-31|  4.1|         1|
|120|         1|    Ada Lovelace|Finanzas y Economía|2020|2020-03-31|  4.7|         1|
| 40|         1|    Ada Lovelace|       Programación|2020|2020-03-31|  4.6|         1|
|202|         3|     Jude Milhon|       Neurociencia|2020|2020-03-31|  3.5|         1|
|168|         9|Jhon von Neumann|         Psicología|2020|2020-03-31|  3.4|         1|
|129|        10|Richard Stallman|Finanzas y

## Joins

In [22]:
scores_df.show()

+---+----------+----------------+---------+----+----------+-----+
|_c0|student_id|    student_name|  subject|year|      date|score|
+---+----------+----------------+---------+----+----------+-----+
|  0|         1|    Ada Lovelace|Filosofía|2020|2020-03-31|  4.4|
|  1|         2|     Hedy Lamarr|Filosofía|2020|2020-03-31|  5.7|
|  2|         3|     Jude Milhon|Filosofía|2020|2020-03-31|  3.8|
|  3|         4|     Ángela Ruiz|Filosofía|2020|2020-03-31|  6.9|
|  4|         5|      Carol Shaw|Filosofía|2020|2020-03-31|  6.8|
|  5|         6| Marisol Alarcón|Filosofía|2020|2020-03-31|  6.1|
|  6|         7|    Katie Bouman|Filosofía|2020|2020-03-31|  3.4|
|  7|         8|     Alan Turing|Filosofía|2020|2020-03-31|  6.7|
|  8|         9|Jhon von Neumann|Filosofía|2020|2020-03-31|  6.3|
|  9|        10|Richard Stallman|Filosofía|2020|2020-03-31|  3.8|
| 10|         1|    Ada Lovelace|Filosofía|2020|2020-06-31|  5.7|
| 11|         2|     Hedy Lamarr|Filosofía|2020|2020-06-31|  4.4|
| 12|     

In [23]:
scores_df_2021 = (
    sql_context
        .read
        .option('header', True)
        .csv('student_scores_2021.csv')
)

scores_df_2021.show()

+---+----------+----------------+---------+----+----------+-----+
|_c0|student_id|    student_name|  subject|year|      date|score|
+---+----------+----------------+---------+----+----------+-----+
|240|         1|    Ada Lovelace|Filosofía|2021|2021-03-31|  5.0|
|241|         2|     Hedy Lamarr|Filosofía|2021|2021-03-31|  6.3|
|242|         3|     Jude Milhon|Filosofía|2021|2021-03-31|  5.6|
|243|         4|     Ángela Ruiz|Filosofía|2021|2021-03-31|  5.5|
|244|         5|      Carol Shaw|Filosofía|2021|2021-03-31|  3.3|
|245|         6| Marisol Alarcón|Filosofía|2021|2021-03-31|  5.1|
|246|         7|    Katie Bouman|Filosofía|2021|2021-03-31|  3.6|
|247|         8|     Alan Turing|Filosofía|2021|2021-03-31|  5.9|
|248|         9|Jhon von Neumann|Filosofía|2021|2021-03-31|  4.9|
|249|        10|Richard Stallman|Filosofía|2021|2021-03-31|  6.8|
|250|         1|    Ada Lovelace|Filosofía|2021|2021-06-31|  6.8|
|251|         2|     Hedy Lamarr|Filosofía|2021|2021-06-31|  5.6|
|252|     

In [24]:

name_condition = scores_df.student_name ==  scores_df_2021.student_name
subject_condition = scores_df.subject ==  scores_df_2021.subject


scores_df.join(scores_df_2021, name_condition & subject_condition, "inner").show(truncate=False)

+---+----------+------------+---------+----+----------+-----+---+----------+------------+---------+----+----------+-----+
|_c0|student_id|student_name|subject  |year|date      |score|_c0|student_id|student_name|subject  |year|date      |score|
+---+----------+------------+---------+----+----------+-----+---+----------+------------+---------+----+----------+-----+
|30 |1         |Ada Lovelace|Filosofía|2020|2020-12-17|5.3  |240|1         |Ada Lovelace|Filosofía|2021|2021-03-31|5.0  |
|20 |1         |Ada Lovelace|Filosofía|2020|2020-09-20|3.8  |240|1         |Ada Lovelace|Filosofía|2021|2021-03-31|5.0  |
|10 |1         |Ada Lovelace|Filosofía|2020|2020-06-31|5.7  |240|1         |Ada Lovelace|Filosofía|2021|2021-03-31|5.0  |
|0  |1         |Ada Lovelace|Filosofía|2020|2020-03-31|4.4  |240|1         |Ada Lovelace|Filosofía|2021|2021-03-31|5.0  |
|31 |2         |Hedy Lamarr |Filosofía|2020|2020-12-17|5.3  |241|2         |Hedy Lamarr |Filosofía|2021|2021-03-31|6.3  |
|21 |2         |Hedy Lam

In [25]:
'''
SELECT
    column1
    , column2
    ...
FROM
    table_name
    LEFT JOIN table2
        ON table_name.column1 == table2.column1
WHERE
    column1 > 1
LIMIT 15
'''

'\nSELECT\n    column1\n    , column2\n    ...\nFROM\n    table_name\n    LEFT JOIN table2\n        ON table_name.column1 == table2.column1\nWHERE\n    column1 > 1\nLIMIT 15\n'

In [26]:
(
    scores_df
    .select('student_name', 'subject', 'score')
    .join(
        scores_df_2021
            .select('student_name', 'subject', 'score')
            .withColumnRenamed('score', 'score2021'),
        name_condition & subject_condition,
        'left'
    )
    .where(col('score') > 4)
).show()



+---------------+---------+-----+---------------+---------+---------+
|   student_name|  subject|score|   student_name|  subject|score2021|
+---------------+---------+-----+---------------+---------+---------+
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      6.7|
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      6.8|
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      5.6|
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      6.3|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      6.8|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      3.5|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      6.4|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      5.5|
|     Carol Shaw|Filosofía|  6.8|     Carol Shaw|Filosofía|      5.4|
|     Carol Shaw|Filosofía|  6.8|     Carol Shaw|Filosofía|      4.4|
|     Carol Shaw|Filosofía|  6.8|     Carol Shaw|Filosofía|      6.4|
|     Carol Shaw|Fil

In [27]:
(
    scores_df
    .where('score >= 6 OR score < 4')
).show()

+---+----------+----------------+---------+----+----------+-----+
|_c0|student_id|    student_name|  subject|year|      date|score|
+---+----------+----------------+---------+----+----------+-----+
|  2|         3|     Jude Milhon|Filosofía|2020|2020-03-31|  3.8|
|  3|         4|     Ángela Ruiz|Filosofía|2020|2020-03-31|  6.9|
|  4|         5|      Carol Shaw|Filosofía|2020|2020-03-31|  6.8|
|  5|         6| Marisol Alarcón|Filosofía|2020|2020-03-31|  6.1|
|  6|         7|    Katie Bouman|Filosofía|2020|2020-03-31|  3.4|
|  7|         8|     Alan Turing|Filosofía|2020|2020-03-31|  6.7|
|  8|         9|Jhon von Neumann|Filosofía|2020|2020-03-31|  6.3|
|  9|        10|Richard Stallman|Filosofía|2020|2020-03-31|  3.8|
| 13|         4|     Ángela Ruiz|Filosofía|2020|2020-06-31|  6.3|
| 14|         5|      Carol Shaw|Filosofía|2020|2020-06-31|  6.4|
| 15|         6| Marisol Alarcón|Filosofía|2020|2020-06-31|  6.8|
| 17|         8|     Alan Turing|Filosofía|2020|2020-06-31|  3.4|
| 18|     

In [28]:
scores_df.createOrReplaceTempView('scores')
scores_df_2021.createOrReplaceTempView('scores2021')

In [29]:
(
    scores_df
    .select('student_name', 'subject', 'score')
    .join(
        scores_df_2021
            .select('student_name', 'subject', 'score')
            .withColumnRenamed('score', 'score2021'),
        name_condition & subject_condition,
        'left'
    )
    .where(col('score') > 4)
).show()

+---------------+---------+-----+---------------+---------+---------+
|   student_name|  subject|score|   student_name|  subject|score2021|
+---------------+---------+-----+---------------+---------+---------+
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      6.7|
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      6.8|
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      5.6|
|    Hedy Lamarr|Filosofía|  5.7|    Hedy Lamarr|Filosofía|      6.3|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      6.8|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      3.5|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      6.4|
|    Ángela Ruiz|Filosofía|  6.9|    Ángela Ruiz|Filosofía|      5.5|
|     Carol Shaw|Filosofía|  6.8|     Carol Shaw|Filosofía|      5.4|
|     Carol Shaw|Filosofía|  6.8|     Carol Shaw|Filosofía|      4.4|
|     Carol Shaw|Filosofía|  6.8|     Carol Shaw|Filosofía|      6.4|
|     Carol Shaw|Fil

In [30]:
query = '''
SELECT
    scores.student_name
    , scored.subject
    , score2020
    , score2021
FROM
    scores
    LEFT JOIN (
        SELECT
            student_name
            , subject
            , score AS score2021
        FROM
            scores2021
    )
    ON scores.student_name = scores2021.student_name
        AND scores.subject = scores2021.subject
WHERE
    score2020 > 4
'''

query = '''
SELECT
    scores.student_name
    , scores.subject
    , score
FROM
    scores
WHERE
    score > 4
'''


In [31]:
result_df = sql_context.sql(query)
result_df.show()

+----------------+---------+-----+
|    student_name|  subject|score|
+----------------+---------+-----+
|     Hedy Lamarr|Filosofía|  5.7|
|     Ángela Ruiz|Filosofía|  6.9|
|      Carol Shaw|Filosofía|  6.8|
| Marisol Alarcón|Filosofía|  6.1|
|     Alan Turing|Filosofía|  6.7|
|Jhon von Neumann|Filosofía|  6.3|
|    Ada Lovelace|Filosofía|  5.7|
|     Ángela Ruiz|Filosofía|  6.3|
|      Carol Shaw|Filosofía|  6.4|
| Marisol Alarcón|Filosofía|  6.8|
|Richard Stallman|Filosofía|  5.4|
|     Hedy Lamarr|Filosofía|  6.3|
|     Jude Milhon|Filosofía|  5.5|
|     Ángela Ruiz|Filosofía|  6.8|
| Marisol Alarcón|Filosofía|  6.5|
|    Ada Lovelace|Filosofía|  5.3|
|     Hedy Lamarr|Filosofía|  5.3|
|     Jude Milhon|Filosofía|  6.7|
|     Ángela Ruiz|Filosofía|  6.6|
|    Katie Bouman|Filosofía|  6.1|
+----------------+---------+-----+
only showing top 20 rows



In [32]:
result_df.createOrReplaceTempView('result')

In [33]:

the zen of python

crear columa
renombrar columnas

quitar columnas
seleccionar columnas

cambiar tipos de datos (castear)

funciones de pyapark (concat, col, lit  etc...)
group by
window function

join


estructura SQL
transformaciones con SQL en el where
SQL

SyntaxError: invalid syntax (Temp/ipykernel_8/287766683.py, line 1)