In [1]:
# Respuesta
import os
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python3.6'

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.sql import functions as F



# Creación o modificación de columnas

En Spark hay un único método para la creación o modificación de columnas y es `withColumn`. Este método es de nuevo una transformación y toma dos parámetros: el nombre de la columna a crear (o sobrescribir) y la operación que crea la nueva columna.

Para una ejecución más óptima se recomienda utilizar únicamente las funciones de PySpark cuando se define la operación, pero como se detallará más adelante se pueden utilizar funciones propias. 

In [3]:
movies_df = spark.read.csv('Data/movie-ratings/movies.csv', sep=',', header=True, inferSchema=True)
ratings_df = spark.read.csv('Data/movie-ratings/tags.csv', sep=',', header=True, inferSchema=True)

In [4]:
from pyspark.sql.functions import rand
import numpy as np

# Synthetic rating column with uniform random values in [0, 1).
# A fixed seed keeps the generated column reproducible across kernel restarts.
ratings_df = ratings_df.withColumn("rating", rand(seed=42))

In [5]:
ratings_movies_df = ratings_df.join(movies_df, on='movieId', how='inner')

In [6]:
ratings_movies_df.cache()

DataFrame[movieId: int, userId: int, tag: string, timestamp: string, rating: double, title: string, genres: string]



## Funciones de Spark



__valor fijo__

El ejemplo más sencillo es crear una columna con un valor fijo.

In [7]:
ratings_movies_df = ratings_movies_df.withColumn('now', F.lit('2019/01/21 14:08'))

In [8]:
ratings_movies_df.show(3)

+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+
|movieId|userId|       tag| timestamp|            rating|               title|              genres|             now|
+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+
|    318|     1|  narrated|1425942391|0.8815723431402016|Shawshank Redempt...|         Crime|Drama|2019/01/21 14:08|
|   4306|    20|Dreamworks|1459855607|0.9968457084194876|        Shrek (2001)|Adventure|Animati...|2019/01/21 14:08|
|  89302|    20|   England|1400778834|0.2875929481456937|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|
+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+
only showing top 3 rows



In [9]:
ratings_movies_df = ratings_movies_df.withColumn('rating2', F.lit(0.4))

In [10]:
ratings_movies_df.show(3)

+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+-------+
|movieId|userId|       tag| timestamp|            rating|               title|              genres|             now|rating2|
+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+-------+
|    318|     1|  narrated|1425942391|0.8815723431402016|Shawshank Redempt...|         Crime|Drama|2019/01/21 14:08|    0.4|
|   4306|    20|Dreamworks|1459855607|0.9968457084194876|        Shrek (2001)|Adventure|Animati...|2019/01/21 14:08|    0.4|
|  89302|    20|   England|1400778834|0.2875929481456937|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|
+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+-------+
only showing top 3 rows





__duplicar columna__

In [11]:
ratings_movies_df.withColumn('title2', F.col('title'))\
                 .select('title', 'title2')\
                 .show(10)

+--------------------+--------------------+
|               title|              title2|
+--------------------+--------------------+
|Shawshank Redempt...|Shawshank Redempt...|
|        Shrek (2001)|        Shrek (2001)|
|   Page Eight (2011)|   Page Eight (2011)|
|   Page Eight (2011)|   Page Eight (2011)|
|   Page Eight (2011)|   Page Eight (2011)|
|   Page Eight (2011)|   Page Eight (2011)|
|      Skyfall (2012)|      Skyfall (2012)|
|Zero Motivation (...|Zero Motivation (...|
|Zero Motivation (...|Zero Motivation (...|
|Zero Motivation (...|Zero Motivation (...|
+--------------------+--------------------+
only showing top 10 rows





__operaciones aritméticas__

In [12]:
ratings_movies_df.withColumn('rating_10', F.col('rating') * 2)\
                 .select('rating', 'rating_10')\
                 .show(10)

+-------------------+-------------------+
|             rating|          rating_10|
+-------------------+-------------------+
| 0.8815723431402016| 1.7631446862804032|
| 0.9968457084194876| 1.9936914168389752|
| 0.2875929481456937| 0.5751858962913874|
|0.23552414016865086|0.47104828033730173|
|0.07069456430435894|0.14138912860871788|
| 0.8257551826970266| 1.6515103653940533|
|0.12585434754888725| 0.2517086950977745|
|0.45601967086068707| 0.9120393417213741|
| 0.7190057406174745|  1.438011481234949|
| 0.4578086466008906| 0.9156172932017812|
+-------------------+-------------------+
only showing top 10 rows



In [13]:
ratings_movies_df.withColumn('rating_avg', (F.col('rating') + F.col('rating2')) /  2)\
                 .select('rating', 'rating2', 'rating_avg')\
                 .show(10)

+-------------------+-------+-------------------+
|             rating|rating2|         rating_avg|
+-------------------+-------+-------------------+
| 0.8815723431402016|    0.4| 0.6407861715701009|
| 0.9968457084194876|    0.4| 0.6984228542097438|
| 0.2875929481456937|    0.4|0.34379647407284686|
|0.23552414016865086|    0.4|0.31776207008432544|
|0.07069456430435894|    0.4|0.23534728215217948|
| 0.8257551826970266|    0.4| 0.6128775913485134|
|0.12585434754888725|    0.4|0.26292717377444363|
|0.45601967086068707|    0.4|0.42800983543034354|
| 0.7190057406174745|    0.4| 0.5595028703087372|
| 0.4578086466008906|    0.4| 0.4289043233004453|
+-------------------+-------+-------------------+
only showing top 10 rows



 

__if/else__

In [14]:
ratings_movies_df.withColumn('kind_rating', 
                              F.when(F.col('rating') >= 0.4, 'high').otherwise('low')).show(10)

+-------+------+-------------+----------+-------------------+--------------------+--------------------+----------------+-------+-----------+
|movieId|userId|          tag| timestamp|             rating|               title|              genres|             now|rating2|kind_rating|
+-------+------+-------------+----------+-------------------+--------------------+--------------------+----------------+-------+-----------+
|    318|     1|     narrated|1425942391| 0.8815723431402016|Shawshank Redempt...|         Crime|Drama|2019/01/21 14:08|    0.4|       high|
|   4306|    20|   Dreamworks|1459855607| 0.9968457084194876|        Shrek (2001)|Adventure|Animati...|2019/01/21 14:08|    0.4|       high|
|  89302|    20|      England|1400778834| 0.2875929481456937|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|        low|
|  89302|    20|    espionage|1400778836|0.23552414016865086|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|        low|
|  89302|    



Se pueden concatenar múltiples sentencias _when_.

In [15]:
ratings_movies_df.withColumn('kind_rating', 
                              F.when(F.col('rating') >= 0.4, 'high')\
                               .when(F.col('rating') >= 0.2, 'med')\
                               .otherwise('low')).show(20)

+-------+------+-------------------+----------+-------------------+--------------------+--------------------+----------------+-------+-----------+
|movieId|userId|                tag| timestamp|             rating|               title|              genres|             now|rating2|kind_rating|
+-------+------+-------------------+----------+-------------------+--------------------+--------------------+----------------+-------+-----------+
|    318|     1|           narrated|1425942391| 0.8815723431402016|Shawshank Redempt...|         Crime|Drama|2019/01/21 14:08|    0.4|       high|
|   4306|    20|         Dreamworks|1459855607| 0.9968457084194876|        Shrek (2001)|Adventure|Animati...|2019/01/21 14:08|    0.4|       high|
|  89302|    20|            England|1400778834| 0.2875929481456937|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|        med|
|  89302|    20|          espionage|1400778836|0.23552414016865086|   Page Eight (2011)|      Drama|Thriller|2019/01/2



__operaciones con strings__

In [16]:
ratings_movies_df.withColumn('title', F.upper(F.col('title'))).show(3)

+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+-------+
|movieId|userId|       tag| timestamp|            rating|               title|              genres|             now|rating2|
+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+-------+
|    318|     1|  narrated|1425942391|0.8815723431402016|SHAWSHANK REDEMPT...|         Crime|Drama|2019/01/21 14:08|    0.4|
|   4306|    20|Dreamworks|1459855607|0.9968457084194876|        SHREK (2001)|Adventure|Animati...|2019/01/21 14:08|    0.4|
|  89302|    20|   England|1400778834|0.2875929481456937|   PAGE EIGHT (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|
+-------+------+----------+----------+------------------+--------------------+--------------------+----------------+-------+
only showing top 3 rows



In [17]:
# Spark's substring is 1-based: position 1 is the first character of the string.
ratings_movies_df.withColumn('short_title', F.substring(F.col('title'), 1, 10))\
                 .select('title', 'short_title')\
                 .show(10)

+--------------------+-----------+
|               title|short_title|
+--------------------+-----------+
|Shawshank Redempt...| Shawshank |
|        Shrek (2001)| Shrek (200|
|   Page Eight (2011)| Page Eight|
|   Page Eight (2011)| Page Eight|
|   Page Eight (2011)| Page Eight|
|   Page Eight (2011)| Page Eight|
|      Skyfall (2012)| Skyfall (2|
|Zero Motivation (...| Zero Motiv|
|Zero Motivation (...| Zero Motiv|
|Zero Motivation (...| Zero Motiv|
+--------------------+-----------+
only showing top 10 rows



In [18]:
# Raw string: '\|' is not a valid Python escape sequence; r'\|' passes the regex (a literal pipe) unchanged.
ratings_movies_df.withColumn('genres', F.split(F.col('genres'), r'\|')).show(4)

+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+
|movieId|userId|       tag| timestamp|             rating|               title|              genres|             now|rating2|
+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+
|    318|     1|  narrated|1425942391| 0.8815723431402016|Shawshank Redempt...|      [Crime, Drama]|2019/01/21 14:08|    0.4|
|   4306|    20|Dreamworks|1459855607| 0.9968457084194876|        Shrek (2001)|[Adventure, Anima...|2019/01/21 14:08|    0.4|
|  89302|    20|   England|1400778834| 0.2875929481456937|   Page Eight (2011)|   [Drama, Thriller]|2019/01/21 14:08|    0.4|
|  89302|    20| espionage|1400778836|0.23552414016865086|   Page Eight (2011)|   [Drama, Thriller]|2019/01/21 14:08|    0.4|
+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-

In [19]:
# Split on a literal pipe (raw string avoids the invalid '\|' Python escape) and keep the first element.
ratings_movies_df.withColumn('1st_genre', F.split(F.col('genres'), r'\|')[0])\
                 .select('genres', '1st_genre')\
                 .show(10)

+--------------------+---------+
|              genres|1st_genre|
+--------------------+---------+
|         Crime|Drama|    Crime|
|Adventure|Animati...|Adventure|
|      Drama|Thriller|    Drama|
|      Drama|Thriller|    Drama|
|      Drama|Thriller|    Drama|
|      Drama|Thriller|    Drama|
|Action|Adventure|...|   Action|
|        Comedy|Drama|   Comedy|
|        Comedy|Drama|   Comedy|
|        Comedy|Drama|   Comedy|
+--------------------+---------+
only showing top 10 rows



In [20]:
# Replace every literal pipe with a dash; raw string avoids the invalid '\|' Python escape.
ratings_movies_df.withColumn('genres', F.regexp_replace(F.col('genres'), r'\|', '-'))\
                 .select('title', 'genres')\
                 .show(10, truncate=False)

+---------------------------------------------+---------------------------------------------------+
|title                                        |genres                                             |
+---------------------------------------------+---------------------------------------------------+
|Shawshank Redemption, The (1994)             |Crime-Drama                                        |
|Shrek (2001)                                 |Adventure-Animation-Children-Comedy-Fantasy-Romance|
|Page Eight (2011)                            |Drama-Thriller                                     |
|Page Eight (2011)                            |Drama-Thriller                                     |
|Page Eight (2011)                            |Drama-Thriller                                     |
|Page Eight (2011)                            |Drama-Thriller                                     |
|Skyfall (2012)                               |Action-Adventure-Thriller-IMAX                     |




_Con expresiones regulares_

https://regexr.com/

In [21]:
# Remove the " (YYYY)" part from the title; raw string keeps '\(' and '\d' from being parsed as Python escapes.
ratings_movies_df.withColumn('title', F.regexp_replace(F.col('title'), r' \(\d{4}\)', '')).show(5, truncate=False)

+-------+------+----------+----------+-------------------+-------------------------+---------------------------------------------------+----------------+-------+
|movieId|userId|tag       |timestamp |rating             |title                    |genres                                             |now             |rating2|
+-------+------+----------+----------+-------------------+-------------------------+---------------------------------------------------+----------------+-------+
|318    |1     |narrated  |1425942391|0.8815723431402016 |Shawshank Redemption, The|Crime|Drama                                        |2019/01/21 14:08|0.4    |
|4306   |20    |Dreamworks|1459855607|0.9968457084194876 |Shrek                    |Adventure|Animation|Children|Comedy|Fantasy|Romance|2019/01/21 14:08|0.4    |
|89302  |20    |England   |1400778834|0.2875929481456937 |Page Eight               |Drama|Thriller                                     |2019/01/21 14:08|0.4    |
|89302  |20    |espionage |1

In [22]:
# Capture group 1 holds the four digits between parentheses.
# Raw string keeps '\(' and '\d' from being parsed as (invalid) Python escapes.
ratings_movies_df = ratings_movies_df.withColumn('year', 
                                                 F.regexp_extract(F.col('title'),  r'\((\d{4})\)', 1))

ratings_movies_df.show(5)

+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+----+
|movieId|userId|       tag| timestamp|             rating|               title|              genres|             now|rating2|year|
+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+----+
|    318|     1|  narrated|1425942391| 0.8815723431402016|Shawshank Redempt...|         Crime|Drama|2019/01/21 14:08|    0.4|1994|
|   4306|    20|Dreamworks|1459855607| 0.9968457084194876|        Shrek (2001)|Adventure|Animati...|2019/01/21 14:08|    0.4|2001|
|  89302|    20|   England|1400778834| 0.2875929481456937|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|2011|
|  89302|    20| espionage|1400778836|0.23552414016865086|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|2011|
|  89302|    20|      jazz|1400778841|0.07069456430435894|   Page Eight (2011)|    



## Casting

Con el método `withColumn` también es posible convertir el tipo de una columna con la función `cast`. Es importante saber que en caso de no poder convertirse (por ejemplo una letra a número) no saltará error y el resultado será un valor nulo.

In [23]:
ratings_movies_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- rating: double (nullable = false)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- now: string (nullable = false)
 |-- rating2: double (nullable = false)
 |-- year: string (nullable = true)



In [24]:
ratings_movies_df = ratings_movies_df.withColumn('year', F.col('year').cast('int'))
ratings_movies_df.show(5)

+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+----+
|movieId|userId|       tag| timestamp|             rating|               title|              genres|             now|rating2|year|
+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+----+
|    318|     1|  narrated|1425942391| 0.8815723431402016|Shawshank Redempt...|         Crime|Drama|2019/01/21 14:08|    0.4|1994|
|   4306|    20|Dreamworks|1459855607| 0.9968457084194876|        Shrek (2001)|Adventure|Animati...|2019/01/21 14:08|    0.4|2001|
|  89302|    20|   England|1400778834| 0.2875929481456937|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|2011|
|  89302|    20| espionage|1400778836|0.23552414016865086|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|2011|
|  89302|    20|      jazz|1400778841|0.07069456430435894|   Page Eight (2011)|    

In [25]:
ratings_movies_df = ratings_movies_df.withColumn('movieId', F.col('movieId').cast('string'))

In [26]:
ratings_movies_df.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- rating: double (nullable = false)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- now: string (nullable = false)
 |-- rating2: double (nullable = false)
 |-- year: integer (nullable = true)



In [27]:
ratings_movies_df.withColumn('error', F.col('title').cast('int')).show(5)

+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+----+-----+
|movieId|userId|       tag| timestamp|             rating|               title|              genres|             now|rating2|year|error|
+-------+------+----------+----------+-------------------+--------------------+--------------------+----------------+-------+----+-----+
|    318|     1|  narrated|1425942391| 0.8815723431402016|Shawshank Redempt...|         Crime|Drama|2019/01/21 14:08|    0.4|1994| null|
|   4306|    20|Dreamworks|1459855607| 0.9968457084194876|        Shrek (2001)|Adventure|Animati...|2019/01/21 14:08|    0.4|2001| null|
|  89302|    20|   England|1400778834| 0.2875929481456937|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|2011| null|
|  89302|    20| espionage|1400778836|0.23552414016865086|   Page Eight (2011)|      Drama|Thriller|2019/01/21 14:08|    0.4|2011| null|
|  89302|    20|      jazz|1400778841|0.0



## UDF (User Defined Functions)
Cuando no es posible definir la operación con las funciones de Spark se 
pueden crear funciones propias usando UDFs. 
* Primero se crea una función de Python normal.
* Posteriormente se crea la UDF asignando el tipo de dato de la columna de salida.
* Finalmente se aplica la UDF a las columnas.

In [28]:
from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType

_Aumenta el rating en un 15% para cada película más 
antigua que 2000 (el máximo siempre es 5)._

In [29]:
def increase_rating(year, rating):
    """Raise the rating by 15% for films released before 2000, capped at 5.0.

    Parameters
    ----------
    year : int or None
        Release year of the film. ``None`` leaves the rating untouched.
    rating : float or None
        Original rating. ``None`` is returned unchanged.

    Returns
    -------
    float or None
        The boosted rating (at most 5.0) when ``year < 2000``; otherwise the
        rating that was passed in.
    """
    # When used as a UDF, null column values arrive as None; comparing or
    # multiplying None raises TypeError in Python 3, so bail out early.
    if year is None or rating is None:
        return rating

    if year < 2000:
        rating = min(rating * 1.15, 5.0)

    return rating

In [30]:
increase_rating_udf = F.udf(increase_rating, DoubleType())

In [31]:
ratings_movies_df.withColumn('rating_inc', 
                              increase_rating_udf(F.col('year'), F.col('rating')))\
                 .select('title', 'year', 'rating', 'rating_inc')\
                 .show(20)

+--------------------+----+-------------------+--------------------+
|               title|year|             rating|          rating_inc|
+--------------------+----+-------------------+--------------------+
|Shawshank Redempt...|1994| 0.8815723431402016|  1.0138081946112318|
|        Shrek (2001)|2001| 0.9968457084194876|  0.9968457084194876|
|   Page Eight (2011)|2011| 0.2875929481456937|  0.2875929481456937|
|   Page Eight (2011)|2011|0.23552414016865086| 0.23552414016865086|
|   Page Eight (2011)|2011|0.07069456430435894| 0.07069456430435894|
|   Page Eight (2011)|2011| 0.8257551826970266|  0.8257551826970266|
|      Skyfall (2012)|2012|0.12585434754888725| 0.12585434754888725|
|Zero Motivation (...|2014|0.45601967086068707| 0.45601967086068707|
|Zero Motivation (...|2014| 0.7190057406174745|  0.7190057406174745|
|Zero Motivation (...|2014| 0.4578086466008906|  0.4578086466008906|
|Zero Motivation (...|2014| 0.7786882924600995|  0.7786882924600995|
|Zero Motivation (...|2014| 0.7744



Extrae el año de la película sin usar expresiones regulares.

In [32]:
title = 'Trainspotting (1996)'

In [33]:
title.replace(')', '').replace('(', '')

'Trainspotting 1996'

In [34]:
year = title.replace(')', '').replace('(', '').split(' ')[-1]
year = int(year)
year

1996

In [35]:
def get_year(title):
    """Extract the release year from a title such as ``'Trainspotting (1996)'``.

    Parentheses are removed and the last whitespace-separated token is
    interpreted as the year.

    Parameters
    ----------
    title : str or None
        Film title, expected to end with the year in parentheses.

    Returns
    -------
    int
        The year, or ``-1`` when the title is missing/empty or its last token
        is not made only of decimal digits.
    """
    # A null title reaches the function as None; treat it like "no year found".
    if not title:
        return -1

    # split() with no argument ignores leading/trailing/repeated whitespace,
    # so a title with a trailing space still yields the year as last token.
    tokens = title.replace(')', '').replace('(', '').split()
    if not tokens:
        return -1

    candidate = tokens[-1]
    # isdecimal() (not isnumeric()) guarantees int() can parse the token:
    # characters such as '½' are "numeric" but make int() raise ValueError.
    if candidate.isdecimal():
        return int(candidate)
    return -1

In [36]:
get_year_udf = F.udf(get_year, IntegerType())

In [37]:
ratings_movies_df.withColumn('year2', get_year_udf(F.col('title')))\
                 .select('title', 'year', 'year2').show(10, truncate=False)

+---------------------------------------------+----+-----+
|title                                        |year|year2|
+---------------------------------------------+----+-----+
|Shawshank Redemption, The (1994)             |1994|1994 |
|Shrek (2001)                                 |2001|2001 |
|Page Eight (2011)                            |2011|2011 |
|Page Eight (2011)                            |2011|2011 |
|Page Eight (2011)                            |2011|2011 |
|Page Eight (2011)                            |2011|2011 |
|Skyfall (2012)                               |2012|2012 |
|Zero Motivation (Efes beyahasei enosh) (2014)|2014|2014 |
|Zero Motivation (Efes beyahasei enosh) (2014)|2014|2014 |
|Zero Motivation (Efes beyahasei enosh) (2014)|2014|2014 |
+---------------------------------------------+----+-----+
only showing top 10 rows





# Datetimes

Hay varias funciones de _pyspark_ que permiten trabajar con fechas: diferencia entre fechas, día de la semana, año... Pero para ello primero es necesario transformar las columnas a tipo fecha. Se permite la conversión de dos formatos de fecha:
* timestamp de unix: una columna de tipo entero con los segundos transcurridos desde la medianoche (UTC) del 1 de enero de 1970 hasta la fecha.
* cadena: la fecha representada como una cadena siguiendo un formato específico que puede variar.

In [38]:
ratings_movies_df.select('title', 'timestamp', 'now').show(5)

+--------------------+----------+----------------+
|               title| timestamp|             now|
+--------------------+----------+----------------+
|Shawshank Redempt...|1425942391|2019/01/21 14:08|
|        Shrek (2001)|1459855607|2019/01/21 14:08|
|   Page Eight (2011)|1400778834|2019/01/21 14:08|
|   Page Eight (2011)|1400778836|2019/01/21 14:08|
|   Page Eight (2011)|1400778841|2019/01/21 14:08|
+--------------------+----------+----------------+
only showing top 5 rows



 

## unix timestamp a datetime

In [39]:
ratings_movies_df = ratings_movies_df.withColumn('datetime', F.from_unixtime(F.col('timestamp')))
ratings_movies_df.select('datetime', 'timestamp').show(10)

+-------------------+----------+
|           datetime| timestamp|
+-------------------+----------+
|2015-03-09 18:06:31|1425942391|
|2016-04-05 06:26:47|1459855607|
|2014-05-22 12:13:54|1400778834|
|2014-05-22 12:13:56|1400778836|
|2014-05-22 12:14:01|1400778841|
|2014-05-22 12:14:01|1400778841|
|2014-08-13 06:44:09|1407930249|
|2014-08-12 05:05:17|1407837917|
|2014-08-12 05:06:46|1407838006|
|2014-08-12 05:05:13|1407837913|
+-------------------+----------+
only showing top 10 rows





## string a datetime

In [40]:
ratings_movies_df = ratings_movies_df.withColumn('now_datetime', 
                                                 F.from_unixtime(F.unix_timestamp(F.col('now'), 'yyyy/MM/dd HH:mm')))
ratings_movies_df.select('now', 'now_datetime').show(10)

+----------------+-------------------+
|             now|       now_datetime|
+----------------+-------------------+
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
+----------------+-------------------+
only showing top 10 rows





## funciones con datetimes

In [41]:
ratings_movies_df.select('now_datetime', 'datetime', F.datediff(F.col('now_datetime'), F.col('datetime'))).show(10)

+-------------------+-------------------+--------------------------------+
|       now_datetime|           datetime|datediff(now_datetime, datetime)|
+-------------------+-------------------+--------------------------------+
|2019-01-21 14:08:00|2015-03-09 18:06:31|                            1414|
|2019-01-21 14:08:00|2016-04-05 06:26:47|                            1021|
|2019-01-21 14:08:00|2014-05-22 12:13:54|                            1705|
|2019-01-21 14:08:00|2014-05-22 12:13:56|                            1705|
|2019-01-21 14:08:00|2014-05-22 12:14:01|                            1705|
|2019-01-21 14:08:00|2014-05-22 12:14:01|                            1705|
|2019-01-21 14:08:00|2014-08-13 06:44:09|                            1622|
|2019-01-21 14:08:00|2014-08-12 05:05:17|                            1623|
|2019-01-21 14:08:00|2014-08-12 05:06:46|                            1623|
|2019-01-21 14:08:00|2014-08-12 05:05:13|                            1623|
+-------------------+----

In [42]:
ratings_movies_df.select('datetime', F.date_add(F.col('datetime'), 10)).show(10)

+-------------------+----------------------+
|           datetime|date_add(datetime, 10)|
+-------------------+----------------------+
|2015-03-09 18:06:31|            2015-03-19|
|2016-04-05 06:26:47|            2016-04-15|
|2014-05-22 12:13:54|            2014-06-01|
|2014-05-22 12:13:56|            2014-06-01|
|2014-05-22 12:14:01|            2014-06-01|
|2014-05-22 12:14:01|            2014-06-01|
|2014-08-13 06:44:09|            2014-08-23|
|2014-08-12 05:05:17|            2014-08-22|
|2014-08-12 05:06:46|            2014-08-22|
|2014-08-12 05:05:13|            2014-08-22|
+-------------------+----------------------+
only showing top 10 rows



In [43]:
ratings_movies_df.withColumn('datetime_plus_4_months', F.add_months(F.col('datetime'), 4))\
                  .select('datetime', 'datetime_plus_4_months').show(5)

+-------------------+----------------------+
|           datetime|datetime_plus_4_months|
+-------------------+----------------------+
|2015-03-09 18:06:31|            2015-07-09|
|2016-04-05 06:26:47|            2016-08-05|
|2014-05-22 12:13:54|            2014-09-22|
|2014-05-22 12:13:56|            2014-09-22|
|2014-05-22 12:14:01|            2014-09-22|
+-------------------+----------------------+
only showing top 5 rows



In [44]:
ratings_movies_df.select('datetime', F.month(F.col('datetime')).alias('month')).show(10)

+-------------------+-----+
|           datetime|month|
+-------------------+-----+
|2015-03-09 18:06:31|    3|
|2016-04-05 06:26:47|    4|
|2014-05-22 12:13:54|    5|
|2014-05-22 12:13:56|    5|
|2014-05-22 12:14:01|    5|
|2014-05-22 12:14:01|    5|
|2014-08-13 06:44:09|    8|
|2014-08-12 05:05:17|    8|
|2014-08-12 05:06:46|    8|
|2014-08-12 05:05:13|    8|
+-------------------+-----+
only showing top 10 rows



In [45]:
ratings_movies_df.select('datetime', F.last_day(F.col('datetime')).alias('last_day')).show(10)

+-------------------+----------+
|           datetime|  last_day|
+-------------------+----------+
|2015-03-09 18:06:31|2015-03-31|
|2016-04-05 06:26:47|2016-04-30|
|2014-05-22 12:13:54|2014-05-31|
|2014-05-22 12:13:56|2014-05-31|
|2014-05-22 12:14:01|2014-05-31|
|2014-05-22 12:14:01|2014-05-31|
|2014-08-13 06:44:09|2014-08-31|
|2014-08-12 05:05:17|2014-08-31|
|2014-08-12 05:06:46|2014-08-31|
|2014-08-12 05:05:13|2014-08-31|
+-------------------+----------+
only showing top 10 rows



In [46]:
ratings_movies_df.select('datetime', F.dayofmonth(F.col('datetime')).alias('day'),
                                     F.dayofyear(F.col('datetime')).alias('year_day'),
                                     F.date_format(F.col('datetime'), 'E').alias('weekday')).show(10)

+-------------------+---+--------+-------+
|           datetime|day|year_day|weekday|
+-------------------+---+--------+-------+
|2015-03-09 18:06:31|  9|      68|    Mon|
|2016-04-05 06:26:47|  5|      96|    Tue|
|2014-05-22 12:13:54| 22|     142|    Thu|
|2014-05-22 12:13:56| 22|     142|    Thu|
|2014-05-22 12:14:01| 22|     142|    Thu|
|2014-05-22 12:14:01| 22|     142|    Thu|
|2014-08-13 06:44:09| 13|     225|    Wed|
|2014-08-12 05:05:17| 12|     224|    Tue|
|2014-08-12 05:06:46| 12|     224|    Tue|
|2014-08-12 05:05:13| 12|     224|    Tue|
+-------------------+---+--------+-------+
only showing top 10 rows





Para filtrar por fechas se pueden comparar directamente con una cadena en el formato YYYY-MM-DD hh:mm:ss ya que será interpretada como una fecha.

In [47]:
ratings_movies_df.filter(F.col('datetime') >= "2015-09-30 20:00:00").select('datetime', 'title', 'rating').show(10)

+-------------------+--------------------+-------------------+
|           datetime|               title|             rating|
+-------------------+--------------------+-------------------+
|2016-04-05 06:26:47|        Shrek (2001)| 0.9968457084194876|
|2015-12-25 10:59:28|Star Wars: Episod...|0.13558505903248463|
|2016-04-30 23:53:29|Dr. Strangelove o...| 0.5096446566732716|
|2016-04-30 23:51:01|  The Martian (2015)|  0.428529255326898|
|2017-02-09 15:47:57|Apartment, The (1...|0.13659992245850905|
|2017-01-04 19:46:16|Everyone Says I L...|0.17733014917469325|
|2017-01-04 19:46:18|Everyone Says I L...| 0.5234339752799414|
|2017-01-04 19:46:14|Everyone Says I L...|   0.78018784851751|
|2017-01-05 18:31:18|     Swingers (1996)| 0.4818668636340552|
|2017-01-05 18:31:46|     Swingers (1996)| 0.9008459886834707|
+-------------------+--------------------+-------------------+
only showing top 10 rows



In [48]:
ratings_movies_df.filter(F.col('datetime').between("2003-01-31", "2003-02-10"))\
                  .select('datetime', 'title', 'rating').show(5)

+--------+-----+------+
|datetime|title|rating|
+--------+-----+------+
+--------+-----+------+



In [49]:
ratings_movies_df.filter(F.year(F.col('datetime')) >= 2012)\
                 .select('datetime', 'title', 'rating').show(5)

+-------------------+--------------------+-------------------+
|           datetime|               title|             rating|
+-------------------+--------------------+-------------------+
|2015-03-09 18:06:31|Shawshank Redempt...| 0.8815723431402016|
|2016-04-05 06:26:47|        Shrek (2001)| 0.9968457084194876|
|2014-05-22 12:13:54|   Page Eight (2011)| 0.2875929481456937|
|2014-05-22 12:13:56|   Page Eight (2011)|0.23552414016865086|
|2014-05-22 12:14:01|   Page Eight (2011)|0.07069456430435894|
+-------------------+--------------------+-------------------+
only showing top 5 rows

