In [0]:
# Global data variables
# Placeholder: replace with the name of your sandbox before running the notebook.
SANDBOX_NAME = "YOUR_SANDBOX_NAME"
# Base folder of the datasets; dataset-relative paths are appended to it below.
DATA_PATH = "/data/sandboxes/" + SANDBOX_NAME + "/data/data/"

# Nueva sección

# Nueva sección

In [0]:
from pyspark.sql import functions as F



# Creación o modificación de columnas

En Spark el método principal para la creación o modificación de columnas es `withColumn`. Este método es de nuevo una transformación y toma dos parámetros: el nombre de la columna a crear (o sobrescribir) y la operación que crea la nueva columna.

Para una ejecución más óptima se recomienda utilizar únicamente las funciones de PySpark cuando se define la operación, pero, como se detallará más adelante, también se pueden utilizar funciones propias.

In [0]:
# Both files are comma-separated with a header row; column types are inferred.
csv_options = dict(sep=',', header=True, inferSchema=True)
movies_df = spark.read.csv(DATA_PATH + 'movie-ratings/movies.csv', **csv_options)
ratings_df = spark.read.csv(DATA_PATH + 'movie-ratings/ratings.csv', **csv_options)

In [0]:
# Inner join on movieId: each rating row gains the columns of its movie.
ratings_movies_df = ratings_df.join(movies_df, on='movieId', how='inner')

In [0]:
# Mark the joined DataFrame to be cached; it is reused by the cells that follow.
ratings_movies_df.cache()

DataFrame[movieId: int, userId: int, rating: double, timestamp: int, title: string, genres: string]



## Funciones de Spark



__valor fijo__

El ejemplo más sencillo es crear una columna con un valor fijo.

In [0]:
# F.lit wraps a Python constant as a column, so every row gets the same value.
ratings_movies_df = ratings_movies_df.withColumn('now', F.lit('2019/01/21 14:08'))

In [0]:
# Preview the first 3 rows to check the new `now` column.
ratings_movies_df.show(3)

+-------+------+------+----------+--------------------+----------------+----------------+
|movieId|userId|rating| timestamp|               title|          genres|             now|
+-------+------+------+----------+--------------------+----------------+----------------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|
+-------+------+------+----------+--------------------+----------------+----------------+
only showing top 3 rows



In [0]:
# Same idea with a numeric constant: rating2 is 4.0 on every row.
ratings_movies_df = ratings_movies_df.withColumn('rating2', F.lit(4.0))

In [0]:
# Preview the first 3 rows to check the new `rating2` column.
ratings_movies_df.show(3)

+-------+------+------+----------+--------------------+----------------+----------------+-------+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|
+-------+------+------+----------+--------------------+----------------+----------------+-------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|
+-------+------+------+----------+--------------------+----------------+----------------+-------+
only showing top 3 rows





__duplicar columna__

In [0]:
# Duplicate a column: `title2` is created as a copy of `title`.
(
    ratings_movies_df
    .withColumn('title2', F.col('title'))
    .select('title', 'title2')
    .show(10)
)

+--------------------+--------------------+
|               title|              title2|
+--------------------+--------------------+
|   Braveheart (1995)|   Braveheart (1995)|
|Basketball Diarie...|Basketball Diarie...|
|Godfather, The (1...|Godfather, The (1...|
|Godfather: Part I...|Godfather: Part I...|
|Dead Poets Societ...|Dead Poets Societ...|
|Breakfast Club, T...|Breakfast Club, T...|
|Sixth Sense, The ...|Sixth Sense, The ...|
|Ferris Bueller's ...|Ferris Bueller's ...|
|   Fight Club (1999)|   Fight Club (1999)|
|      Memento (2000)|      Memento (2000)|
+--------------------+--------------------+
only showing top 10 rows





__operaciones aritméticas__

In [0]:
# Arithmetic between a column and a constant: rescale the rating by a factor of 2.
(
    ratings_movies_df
    .withColumn('rating_10', F.col('rating') * 2)
    .select('rating', 'rating_10')
    .show(10)
)

+------+---------+
|rating|rating_10|
+------+---------+
|   1.0|      2.0|
|   4.5|      9.0|
|   5.0|     10.0|
|   5.0|     10.0|
|   5.0|     10.0|
|   4.0|      8.0|
|   4.5|      9.0|
|   5.0|     10.0|
|   4.0|      8.0|
|   4.0|      8.0|
+------+---------+
only showing top 10 rows



In [0]:
# Arithmetic between two columns: row-wise mean of `rating` and `rating2`.
rating_mean = (F.col('rating') + F.col('rating2')) / 2
(
    ratings_movies_df
    .withColumn('rating_avg', rating_mean)
    .select('rating', 'rating2', 'rating_avg')
    .show(10)
)

+------+-------+----------+
|rating|rating2|rating_avg|
+------+-------+----------+
|   1.0|    4.0|       2.5|
|   4.5|    4.0|      4.25|
|   5.0|    4.0|       4.5|
|   5.0|    4.0|       4.5|
|   5.0|    4.0|       4.5|
|   4.0|    4.0|       4.0|
|   4.5|    4.0|      4.25|
|   5.0|    4.0|       4.5|
|   4.0|    4.0|       4.0|
|   4.0|    4.0|       4.0|
+------+-------+----------+
only showing top 10 rows



 

__if/else__

In [0]:
# Conditional column: 'high' for ratings of 4 or more, 'low' for everything else.
kind_rating = F.when(F.col('rating') >= 4, 'high').otherwise('low')
ratings_movies_df.withColumn('kind_rating', kind_rating).show(10)

+-------+------+------+----------+--------------------+--------------------+----------------+-------+-----------+
|movieId|userId|rating| timestamp|               title|              genres|             now|rating2|kind_rating|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+-----------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|    Action|Drama|War|2019/01/21 14:08|    4.0|        low|
|    147|     1|   4.5|1425942435|Basketball Diarie...|               Drama|2019/01/21 14:08|    4.0|       high|
|    858|     1|   5.0|1425941523|Godfather, The (1...|         Crime|Drama|2019/01/21 14:08|    4.0|       high|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|         Crime|Drama|2019/01/21 14:08|    4.0|       high|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|               Drama|2019/01/21 14:08|    4.0|       high|
|   1968|     1|   4.0|1425942148|Breakfast Club, T...|        Comedy|Drama|2019/01/21 1



Se pueden concatenar múltiples sentencias _when_; las condiciones se evalúan en orden y se aplica la primera que se cumple.

In [0]:
# Chained conditions: the >= 4 test comes first, so 'med' only applies to
# ratings from 2 up to (but not including) 4.
kind_rating = (
    F.when(F.col('rating') >= 4, 'high')
    .when(F.col('rating') >= 2, 'med')
    .otherwise('low')
)
ratings_movies_df.withColumn('kind_rating', kind_rating).show(20)

+-------+------+------+----------+--------------------+--------------------+----------------+-------+-----------+
|movieId|userId|rating| timestamp|               title|              genres|             now|rating2|kind_rating|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+-----------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|    Action|Drama|War|2019/01/21 14:08|    4.0|        low|
|    147|     1|   4.5|1425942435|Basketball Diarie...|               Drama|2019/01/21 14:08|    4.0|       high|
|    858|     1|   5.0|1425941523|Godfather, The (1...|         Crime|Drama|2019/01/21 14:08|    4.0|       high|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|         Crime|Drama|2019/01/21 14:08|    4.0|       high|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|               Drama|2019/01/21 14:08|    4.0|       high|
|   1968|     1|   4.0|1425942148|Breakfast Club, T...|        Comedy|Drama|2019/01/21 1



__operaciones con strings__

In [0]:
# Reusing an existing column name overwrites it in the result; the DataFrame
# variable is not reassigned here, so the change is only visible in this output.
ratings_movies_df.withColumn('title', F.upper(F.col('title'))).show(3)

+-------+------+------+----------+--------------------+----------------+----------------+-------+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|
+-------+------+------+----------+--------------------+----------------+----------------+-------+
|    110|     1|   1.0|1425941529|   BRAVEHEART (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|
|    147|     1|   4.5|1425942435|BASKETBALL DIARIE...|           Drama|2019/01/21 14:08|    4.0|
|    858|     1|   5.0|1425941523|GODFATHER, THE (1...|     Crime|Drama|2019/01/21 14:08|    4.0|
+-------+------+------+----------+--------------------+----------------+----------------+-------+
only showing top 3 rows



In [0]:
# First 10 characters of the title. Spark's substring positions are 1-based,
# so the first character is position 1 (the previous 0 only worked because
# Spark happens to treat 0 like 1).
(
    ratings_movies_df
    .withColumn('short_title', F.substring(F.col('title'), 1, 10))
    .select('title', 'short_title')
    .show(10)
)

+--------------------+-----------+
|               title|short_title|
+--------------------+-----------+
|   Braveheart (1995)| Braveheart|
|Basketball Diarie...| Basketball|
|Godfather, The (1...| Godfather,|
|Godfather: Part I...| Godfather:|
|Dead Poets Societ...| Dead Poets|
|Breakfast Club, T...| Breakfast |
|Sixth Sense, The ...| Sixth Sens|
|Ferris Bueller's ...| Ferris Bue|
|   Fight Club (1999)| Fight Club|
|      Memento (2000)| Memento (2|
+--------------------+-----------+
only showing top 10 rows



In [0]:
# F.split takes a regular expression, so the pipe has to be escaped. The raw
# string keeps the backslash without relying on an invalid Python escape
# sequence ('\|' triggers a warning on recent Python versions).
ratings_movies_df.withColumn('genres', F.split(F.col('genres'), r'\|')).show(4)

+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|movieId|userId|rating| timestamp|               title|              genres|             now|rating2|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|[Action, Drama, War]|2019/01/21 14:08|    4.0|
|    147|     1|   4.5|1425942435|Basketball Diarie...|             [Drama]|2019/01/21 14:08|    4.0|
|    858|     1|   5.0|1425941523|Godfather, The (1...|      [Crime, Drama]|2019/01/21 14:08|    4.0|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|      [Crime, Drama]|2019/01/21 14:08|    4.0|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+
only showing top 4 rows



In [0]:
# Split the genres on the (escaped) pipe and keep the first element.
# Raw string: avoids the invalid Python escape sequence '\|'.
(
    ratings_movies_df
    .withColumn('1st_genre', F.split(F.col('genres'), r'\|')[0])
    .select('genres', '1st_genre')
    .show(10)
)

+--------------------+---------+
|              genres|1st_genre|
+--------------------+---------+
|    Action|Drama|War|   Action|
|               Drama|    Drama|
|         Crime|Drama|    Crime|
|         Crime|Drama|    Crime|
|               Drama|    Drama|
|        Comedy|Drama|   Comedy|
|Drama|Horror|Mystery|    Drama|
|              Comedy|   Comedy|
|Action|Crime|Dram...|   Action|
|    Mystery|Thriller|  Mystery|
+--------------------+---------+
only showing top 10 rows



In [0]:
# Replace every pipe separator with a dash. The pattern is a regular
# expression, hence the escaped pipe; the raw string avoids the invalid
# Python escape sequence '\|'.
(
    ratings_movies_df
    .withColumn('genres', F.regexp_replace(F.col('genres'), r'\|', '-'))
    .select('title', 'genres')
    .show(10, truncate=False)
)

+-------------------------------+---------------------------+
|title                          |genres                     |
+-------------------------------+---------------------------+
|Braveheart (1995)              |Action-Drama-War           |
|Basketball Diaries, The (1995) |Drama                      |
|Godfather, The (1972)          |Crime-Drama                |
|Godfather: Part II, The (1974) |Crime-Drama                |
|Dead Poets Society (1989)      |Drama                      |
|Breakfast Club, The (1985)     |Comedy-Drama               |
|Sixth Sense, The (1999)        |Drama-Horror-Mystery       |
|Ferris Bueller's Day Off (1986)|Comedy                     |
|Fight Club (1999)              |Action-Crime-Drama-Thriller|
|Memento (2000)                 |Mystery-Thriller           |
+-------------------------------+---------------------------+
only showing top 10 rows





_Con expresiones regulares_

https://regexr.com/

In [0]:
# Remove the " (YYYY)" suffix: a space followed by four digits in parentheses.
# Raw string: '\(' and '\d' are regex escapes, not valid Python escapes.
year_suffix_pattern = r' \(\d{4}\)'
ratings_movies_df.withColumn('title', F.regexp_replace(F.col('title'), year_suffix_pattern, '')).show(5, truncate=False)

+-------+------+------+----------+-----------------------+----------------+----------------+-------+
|movieId|userId|rating|timestamp |title                  |genres          |now             |rating2|
+-------+------+------+----------+-----------------------+----------------+----------------+-------+
|110    |1     |1.0   |1425941529|Braveheart             |Action|Drama|War|2019/01/21 14:08|4.0    |
|147    |1     |4.5   |1425942435|Basketball Diaries, The|Drama           |2019/01/21 14:08|4.0    |
|858    |1     |5.0   |1425941523|Godfather, The         |Crime|Drama     |2019/01/21 14:08|4.0    |
|1221   |1     |5.0   |1425941546|Godfather: Part II, The|Crime|Drama     |2019/01/21 14:08|4.0    |
|1246   |1     |5.0   |1425941556|Dead Poets Society     |Drama           |2019/01/21 14:08|4.0    |
+-------+------+------+----------+-----------------------+----------------+----------------+-------+
only showing top 5 rows



In [0]:
# Capture group 1 holds the four digits found between parentheses in the title.
# Raw string: '\(' and '\d' are regex escapes, not valid Python escapes.
year_pattern = r'\((\d{4})\)'
ratings_movies_df = ratings_movies_df.withColumn('year', F.regexp_extract(F.col('title'), year_pattern, 1))

ratings_movies_df.show(5)

+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|year|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|1995|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|1995|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|1972|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|     Crime|Drama|2019/01/21 14:08|    4.0|1974|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|           Drama|2019/01/21 14:08|    4.0|1989|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
only showing top 5 rows





## Casting

Con el método `withColumn` también es posible convertir el tipo de una columna con la función `cast`. Es importante saber que en caso de no poder convertirse (por ejemplo una letra a número) no saltará error y el resultado será un valor nulo.

In [0]:
# Inspect the column types before casting (year is still a string here).
ratings_movies_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- now: string (nullable = false)
 |-- rating2: double (nullable = false)
 |-- year: string (nullable = true)



In [0]:
# Convert the extracted year from string to integer, overwriting the column.
ratings_movies_df = ratings_movies_df.withColumn('year', F.col('year').cast('int'))
ratings_movies_df.show(5)

+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|year|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|1995|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|1995|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|1972|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|     Crime|Drama|2019/01/21 14:08|    4.0|1974|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|           Drama|2019/01/21 14:08|    4.0|1989|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
only showing top 5 rows



In [0]:
# Casting also works towards strings: movieId becomes a string column.
ratings_movies_df = ratings_movies_df.withColumn('movieId', F.col('movieId').cast('string'))

In [0]:
# Check the schema again to confirm the new types of movieId and year.
ratings_movies_df.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- now: string (nullable = false)
 |-- rating2: double (nullable = false)
 |-- year: integer (nullable = true)



In [0]:
# Casting a non-numeric string to int does not fail here: the result is null.
ratings_movies_df.withColumn('error', F.col('title').cast('int')).show(5)

+-------+------+------+----------+--------------------+----------------+----------------+-------+----+-----+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|year|error|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+-----+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|1995| null|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|1995| null|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|1972| null|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|     Crime|Drama|2019/01/21 14:08|    4.0|1974| null|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|           Drama|2019/01/21 14:08|    4.0|1989| null|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+-----+
only showing top 5 



## UDF (User Defined Functions)

Cuando no es posible definir la operación con las funciones de Spark se pueden crear funciones propias usando UDFs. Primero se crea una función de Python normal y posteriormente se crea la UDF a partir de ella. Es necesario indicar el tipo de la columna de salida en la UDF.

In [0]:
from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType



_Aumenta el rating en un 15% para cada película anterior al año 2000 (el máximo siempre es 5)._

In [0]:
def increase_rating(year, rating):
    """Boost the rating of movies released before 2000 by 15%, capped at 5.0.

    Args:
        year: Release year, or None when it is unknown.
        rating: Original rating, or None when it is missing.

    Returns:
        The boosted rating (never above 5.0) for years before 2000; otherwise
        the rating unchanged. A missing year or rating leaves the rating as is.
    """
    # Spark hands SQL NULL to a Python UDF as None; comparing None with an int
    # raises TypeError, which would abort the whole job.
    if year is None or rating is None:
        return rating

    if year < 2000:
        return min(rating * 1.15, 5.0)

    return rating

In [0]:
# Wrap the Python function as a Spark UDF, declaring DoubleType as its output type.
increase_rating_udf = F.udf(increase_rating, DoubleType())

In [0]:
# Call the UDF like any built-in function, passing the columns it needs.
rating_inc = increase_rating_udf(F.col('year'), F.col('rating'))
(
    ratings_movies_df
    .withColumn('rating_inc', rating_inc)
    .select('title', 'year', 'rating', 'rating_inc')
    .show(20)
)

+--------------------+----+------+----------+
|               title|year|rating|rating_inc|
+--------------------+----+------+----------+
|   Braveheart (1995)|1995|   1.0|      1.15|
|Basketball Diarie...|1995|   4.5|       5.0|
|Godfather, The (1...|1972|   5.0|       5.0|
|Godfather: Part I...|1974|   5.0|       5.0|
|Dead Poets Societ...|1989|   5.0|       5.0|
|Breakfast Club, T...|1985|   4.0|       4.6|
|Sixth Sense, The ...|1999|   4.5|       5.0|
|Ferris Bueller's ...|1986|   5.0|       5.0|
|   Fight Club (1999)|1999|   4.0|       4.6|
|      Memento (2000)|2000|   4.0|       4.0|
| Donnie Darko (2001)|2001|   5.0|       5.0|
|Igby Goes Down (2...|2002|   5.0|       5.0|
|Batman Begins (2005)|2005|   4.0|       4.0|
|     Superbad (2007)|2007|   3.5|       3.5|
|Dark Knight, The ...|2008|   4.0|       4.0|
|     Iron Man (2008)|2008|   5.0|       5.0|
|    Star Trek (2009)|2009|   5.0|       5.0|
|Harry Potter and ...|2009|   5.0|       5.0|
|Sherlock Holmes (...|2009|   5.0|



Extrae el año de la película sin usar expresiones regulares.

In [0]:
# Sample title used to prototype the year extraction in plain Python.
title = 'Trainspotting (1996)'

In [0]:
# Step 1: drop both parentheses from the title.
title.replace(')', '').replace('(', '')

'Trainspotting 1996'

In [0]:
# Step 2: once the parentheses are gone, the last space-separated token is the year.
last_token = title.replace(')', '').replace('(', '').split(' ')[-1]
year = int(last_token)
year

1996

In [0]:
def get_year(title):
    """Extract the release year from a title such as 'Trainspotting (1996)'.

    The parentheses are removed and the last space-separated token is taken
    as the year.

    Args:
        title: Movie title, or None when it is missing.

    Returns:
        The year as an int, or -1 when the title is missing or its last token
        is not made only of decimal digits.
    """
    # Spark hands SQL NULL to a Python UDF as None; treat it as "no year".
    if title is None:
        return -1

    # strip() keeps trailing whitespace from hiding the year behind an empty
    # last token.
    last_token = title.replace(')', '').replace('(', '').strip().split(' ')[-1]

    # isdecimal() only accepts characters int() can parse; isnumeric() also
    # accepts ones such as superscripts or fractions, which make int() raise.
    if last_token.isdecimal():
        return int(last_token)

    return -1

In [0]:
# Wrap the Python function as a Spark UDF, declaring IntegerType as its output type.
get_year_udf = F.udf(get_year, IntegerType())

In [0]:
# Compare the UDF result (`year2`) with the regex-based `year` column side by side.
(
    ratings_movies_df
    .withColumn('year2', get_year_udf(F.col('title')))
    .select('title', 'year', 'year2')
    .show(10, truncate=False)
)

+-------------------------------+----+-----+
|title                          |year|year2|
+-------------------------------+----+-----+
|Braveheart (1995)              |1995|1995 |
|Basketball Diaries, The (1995) |1995|1995 |
|Godfather, The (1972)          |1972|1972 |
|Godfather: Part II, The (1974) |1974|1974 |
|Dead Poets Society (1989)      |1989|1989 |
|Breakfast Club, The (1985)     |1985|1985 |
|Sixth Sense, The (1999)        |1999|1999 |
|Ferris Bueller's Day Off (1986)|1986|1986 |
|Fight Club (1999)              |1999|1999 |
|Memento (2000)                 |2000|2000 |
+-------------------------------+----+-----+
only showing top 10 rows





# Datetimes

Hay varias funciones de _pyspark_ que permiten trabajar con fechas: diferencia entre fechas, día de la semana, año... Pero para ello primero es necesario transformar las columnas a tipo fecha. Se permite la conversión desde dos formatos de fecha:
* timestamp de unix: una columna de tipo entero con los segundos transcurridos desde la medianoche (UTC) del 1 de enero de 1970 hasta la fecha.
* cadena: la fecha representada como una cadena siguiendo un formato específico que puede variar.

In [0]:
# The two date sources used below: `timestamp` (epoch seconds) and `now` (a string).
ratings_movies_df.select('title', 'timestamp', 'now').show(5)

+--------------------+----------+----------------+
|               title| timestamp|             now|
+--------------------+----------+----------------+
|   Braveheart (1995)|1425941529|2019/01/21 14:08|
|Basketball Diarie...|1425942435|2019/01/21 14:08|
|Godfather, The (1...|1425941523|2019/01/21 14:08|
|Godfather: Part I...|1425941546|2019/01/21 14:08|
|Dead Poets Societ...|1425941556|2019/01/21 14:08|
+--------------------+----------+----------------+
only showing top 5 rows



 

## unix timestamp a datetime

In [0]:
# Convert epoch seconds into a 'yyyy-MM-dd HH:mm:ss' representation.
# NOTE(review): per the PySpark docs from_unixtime returns a string column
# rendered in the session time zone; cast to 'timestamp' if a real timestamp
# type is needed - confirm.
ratings_movies_df = ratings_movies_df.withColumn('datetime', F.from_unixtime(F.col('timestamp')))
ratings_movies_df.select('datetime', 'timestamp').show(10)

+-------------------+----------+
|           datetime| timestamp|
+-------------------+----------+
|2015-03-09 22:52:09|1425941529|
|2015-03-09 23:07:15|1425942435|
|2015-03-09 22:52:03|1425941523|
|2015-03-09 22:52:26|1425941546|
|2015-03-09 22:52:36|1425941556|
|2015-03-09 23:02:28|1425942148|
|2015-03-09 22:48:20|1425941300|
|2015-03-09 22:53:13|1425941593|
|2015-03-09 22:53:21|1425941601|
|2015-03-09 23:03:48|1425942228|
+-------------------+----------+
only showing top 10 rows





## string a datetime

In [0]:
# Two steps: parse the 'yyyy/MM/dd HH:mm' strings in `now` into epoch seconds,
# then render them with from_unixtime exactly like the `datetime` column.
now_epoch_seconds = F.unix_timestamp(F.col('now'), 'yyyy/MM/dd HH:mm')
ratings_movies_df = ratings_movies_df.withColumn('now_datetime', F.from_unixtime(now_epoch_seconds))

ratings_movies_df.select('now', 'now_datetime').show(10)

+----------------+-------------------+
|             now|       now_datetime|
+----------------+-------------------+
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
+----------------+-------------------+
only showing top 10 rows





## funciones con datetimes

# Nueva sección

In [0]:
# Number of days from `datetime` to `now_datetime`.
days_between = F.datediff(F.col('now_datetime'), F.col('datetime'))
ratings_movies_df.select('now_datetime', 'datetime', days_between).show(10)

+-------------------+-------------------+--------------------------------+
|       now_datetime|           datetime|datediff(now_datetime, datetime)|
+-------------------+-------------------+--------------------------------+
|2019-01-21 14:08:00|2015-03-09 22:52:09|                            1414|
|2019-01-21 14:08:00|2015-03-09 23:07:15|                            1414|
|2019-01-21 14:08:00|2015-03-09 22:52:03|                            1414|
|2019-01-21 14:08:00|2015-03-09 22:52:26|                            1414|
|2019-01-21 14:08:00|2015-03-09 22:52:36|                            1414|
|2019-01-21 14:08:00|2015-03-09 23:02:28|                            1414|
|2019-01-21 14:08:00|2015-03-09 22:48:20|                            1414|
|2019-01-21 14:08:00|2015-03-09 22:53:13|                            1414|
|2019-01-21 14:08:00|2015-03-09 22:53:21|                            1414|
|2019-01-21 14:08:00|2015-03-09 23:03:48|                            1414|
+-------------------+----

In [0]:
# Add 10 days; the result is a date, so the time of day is dropped.
ratings_movies_df.select('datetime', F.date_add(F.col('datetime'), 10)).show(10)

+-------------------+----------------------+
|           datetime|date_add(datetime, 10)|
+-------------------+----------------------+
|2015-03-09 22:52:09|            2015-03-19|
|2015-03-09 23:07:15|            2015-03-19|
|2015-03-09 22:52:03|            2015-03-19|
|2015-03-09 22:52:26|            2015-03-19|
|2015-03-09 22:52:36|            2015-03-19|
|2015-03-09 23:02:28|            2015-03-19|
|2015-03-09 22:48:20|            2015-03-19|
|2015-03-09 22:53:13|            2015-03-19|
|2015-03-09 22:53:21|            2015-03-19|
|2015-03-09 23:03:48|            2015-03-19|
+-------------------+----------------------+
only showing top 10 rows



In [0]:
# Shift the date forward by 4 calendar months.
(
    ratings_movies_df
    .withColumn('datetime_plus_4_months', F.add_months(F.col('datetime'), 4))
    .select('datetime', 'datetime_plus_4_months')
    .show(5)
)

+-------------------+----------------------+
|           datetime|datetime_plus_4_months|
+-------------------+----------------------+
|2015-03-09 22:52:09|            2015-07-09|
|2015-03-09 23:07:15|            2015-07-09|
|2015-03-09 22:52:03|            2015-07-09|
|2015-03-09 22:52:26|            2015-07-09|
|2015-03-09 22:52:36|            2015-07-09|
+-------------------+----------------------+
only showing top 5 rows



In [0]:
# Month number (1-12) of each date; alias gives the column a readable name.
ratings_movies_df.select('datetime', F.month(F.col('datetime')).alias('month')).show(10)

+-------------------+-----+
|           datetime|month|
+-------------------+-----+
|2015-03-09 22:52:09|    3|
|2015-03-09 23:07:15|    3|
|2015-03-09 22:52:03|    3|
|2015-03-09 22:52:26|    3|
|2015-03-09 22:52:36|    3|
|2015-03-09 23:02:28|    3|
|2015-03-09 22:48:20|    3|
|2015-03-09 22:53:13|    3|
|2015-03-09 22:53:21|    3|
|2015-03-09 23:03:48|    3|
+-------------------+-----+
only showing top 10 rows



In [0]:
# Last day of the month each date belongs to.
ratings_movies_df.select('datetime', F.last_day(F.col('datetime')).alias('last_day')).show(10)

+-------------------+----------+
|           datetime|  last_day|
+-------------------+----------+
|2015-03-09 22:52:09|2015-03-31|
|2015-03-09 23:07:15|2015-03-31|
|2015-03-09 22:52:03|2015-03-31|
|2015-03-09 22:52:26|2015-03-31|
|2015-03-09 22:52:36|2015-03-31|
|2015-03-09 23:02:28|2015-03-31|
|2015-03-09 22:48:20|2015-03-31|
|2015-03-09 22:53:13|2015-03-31|
|2015-03-09 22:53:21|2015-03-31|
|2015-03-09 23:03:48|2015-03-31|
+-------------------+----------+
only showing top 10 rows



In [0]:
# Several calendar fields from the same column in one select:
# day of month, day of year and abbreviated weekday name.
datetime_col = F.col('datetime')
ratings_movies_df.select(
    'datetime',
    F.dayofmonth(datetime_col).alias('day'),
    F.dayofyear(datetime_col).alias('year_day'),
    F.date_format(datetime_col, 'E').alias('weekday'),
).show(10)

+-------------------+---+--------+-------+
|           datetime|day|year_day|weekday|
+-------------------+---+--------+-------+
|2015-03-09 22:52:09|  9|      68|    Mon|
|2015-03-09 23:07:15|  9|      68|    Mon|
|2015-03-09 22:52:03|  9|      68|    Mon|
|2015-03-09 22:52:26|  9|      68|    Mon|
|2015-03-09 22:52:36|  9|      68|    Mon|
|2015-03-09 23:02:28|  9|      68|    Mon|
|2015-03-09 22:48:20|  9|      68|    Mon|
|2015-03-09 22:53:13|  9|      68|    Mon|
|2015-03-09 22:53:21|  9|      68|    Mon|
|2015-03-09 23:03:48|  9|      68|    Mon|
+-------------------+---+--------+-------+
only showing top 10 rows





Para filtrar por fechas se pueden comparar directamente con una cadena en el formato `yyyy-MM-dd HH:mm:ss`, ya que será interpretada como una fecha.

In [0]:
# Keep ratings made on or after 2015-09-30 20:00:00.
# NOTE(review): `datetime` comes from from_unixtime, which per the PySpark docs
# yields a string, so this is presumably a lexicographic comparison that agrees
# with chronological order for this zero-padded format - confirm.
ratings_movies_df.filter(F.col('datetime') >= "2015-09-30 20:00:00").select('datetime', 'title', 'rating').show(10)

+-------------------+--------------------+------+
|           datetime|               title|rating|
+-------------------+--------------------+------+
|2017-02-05 00:14:07|Léon: The Profess...|   5.0|
|2017-02-05 00:13:15|Shawshank Redempt...|   4.0|
|2017-02-05 00:13:19|  Matrix, The (1999)|   4.0|
|2017-02-05 00:23:06|American Beauty (...|   4.5|
|2017-02-05 00:13:22|   Fight Club (1999)|   4.5|
|2017-02-05 00:16:21|American Psycho (...|   5.0|
|2017-02-05 00:14:59|Meet the Parents ...|   1.5|
|2017-02-05 00:14:43|    Cast Away (2000)|   4.0|
|2017-02-05 00:14:32|        Shrek (2001)|   2.5|
|2017-02-05 00:14:47|Harry Potter and ...|   4.5|
+-------------------+--------------------+------+
only showing top 10 rows



In [0]:
# Keep ratings whose `datetime` falls between the two bounds (both inclusive).
in_range = F.col('datetime').between("2003-01-31", "2003-02-10")
(
    ratings_movies_df
    .filter(in_range)
    .select('datetime', 'title', 'rating')
    .show(5)
)

+-------------------+--------------------+------+
|           datetime|               title|rating|
+-------------------+--------------------+------+
|2003-01-31 13:46:21|Sense and Sensibi...|   4.0|
|2003-01-31 13:33:25|   Braveheart (1995)|   3.0|
|2003-01-31 13:35:28|  French Kiss (1995)|   1.0|
|2003-01-31 13:41:21| Pulp Fiction (1994)|   3.0|
|2003-01-31 13:41:31|Muriel's Wedding ...|   2.0|
+-------------------+--------------------+------+
only showing top 5 rows



In [0]:
# Date functions can be used inside filters too: keep ratings made in 2012 or later.
rated_since_2012 = F.year(F.col('datetime')) >= 2012
(
    ratings_movies_df
    .filter(rated_since_2012)
    .select('datetime', 'title', 'rating')
    .show(5)
)

+-------------------+--------------------+------+
|           datetime|               title|rating|
+-------------------+--------------------+------+
|2015-03-09 22:52:09|   Braveheart (1995)|   1.0|
|2015-03-09 23:07:15|Basketball Diarie...|   4.5|
|2015-03-09 22:52:03|Godfather, The (1...|   5.0|
|2015-03-09 22:52:26|Godfather: Part I...|   5.0|
|2015-03-09 22:52:36|Dead Poets Societ...|   5.0|
+-------------------+--------------------+------+
only showing top 5 rows

