In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime, to_timestamp, collect_set
from pyspark.sql import functions as f

# Date/time pattern string (ISO 8601 style) intended as the notebook-wide default.
DATEFORMAT = "yyyy-MM-dd'T'HH:mm:ssZ"

# Obtain the Spark session for this notebook; getOrCreate() returns an
# existing session when one is already active instead of building a new one.
spark = (
    SparkSession.builder
    .appName("lineage_graph_test")
    .getOrCreate()
)


In [13]:
# Load links.csv with an explicit schema. The external ids are read as strings,
# which keeps values such as imdbId's leading zeros intact.
links = (
    spark.read
    .option("header", True)
    .schema("movieId INT, imdbId STRING, tmdbId STRING")
    .csv("/home/jovyan/data-sets/ml-latest-small/links.csv")
)

# Preview the first rows and confirm the column types.
links.show()
links.printSchema()

+-------+-------+------+
|movieId| imdbId|tmdbId|
+-------+-------+------+
|      1|0114709|   862|
|      2|0113497|  8844|
|      3|0113228| 15602|
|      4|0114885| 31357|
|      5|0113041| 11862|
|      6|0113277|   949|
|      7|0114319| 11860|
|      8|0112302| 45325|
|      9|0114576|  9091|
|     10|0113189|   710|
|     11|0112346|  9087|
|     12|0112896| 12110|
|     13|0112453| 21032|
|     14|0113987| 10858|
|     15|0112760|  1408|
|     16|0112641|   524|
|     17|0114388|  4584|
|     18|0113101|     5|
|     19|0112281|  9273|
|     20|0113845| 11517|
+-------+-------+------+
only showing top 20 rows

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: string (nullable = true)
 |-- tmdbId: string (nullable = true)



In [261]:
# Raw movies table: `genres` arrives as one pipe-delimited string per movie.
movies_raw = spark.read.csv(
    "/home/jovyan/data-sets/ml-latest-small/movies.csv",
    header=True,
    schema="movieId INT, title STRING, genres STRING",
)

# Turn the "(no genres listed)" placeholder into a null, then split the
# delimited string into an array column. The pattern is a regex, so the pipe
# must be escaped; a raw string avoids Python's invalid-escape warning for "\|".
movies = (
    movies_raw
    .replace("(no genres listed)", None, "genres")
    .withColumn("genres", f.split("genres", r"\|"))
)
movies.show()

# One row per (movieId, genre). explode() emits no rows for a null array, so
# movies without any listed genre do not appear here.
movie_genre = movies.select("movieId", f.explode("genres").alias("genre"))

# Distinct set of genre labels present in the data.
available_genres = movie_genre.select("genre").distinct()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|[Adventure, Anima...|
|      2|      Jumanji (1995)|[Adventure, Child...|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|
|      4|Waiting to Exhale...|[Comedy, Drama, R...|
|      5|Father of the Bri...|            [Comedy]|
|      6|         Heat (1995)|[Action, Crime, T...|
|      7|      Sabrina (1995)|   [Comedy, Romance]|
|      8| Tom and Huck (1995)|[Adventure, Child...|
|      9| Sudden Death (1995)|            [Action]|
|     10|    GoldenEye (1995)|[Action, Adventur...|
|     11|American Presiden...|[Comedy, Drama, R...|
|     12|Dracula: Dead and...|    [Comedy, Horror]|
|     13|        Balto (1995)|[Adventure, Anima...|
|     14|        Nixon (1995)|             [Drama]|
|     15|Cutthroat Island ...|[Action, Adventur...|
|     16|       Casino (1995)|      [Crime, Drama]|
|     17|Sen

In [262]:
# Load tags.csv. `timestamp` is read as a string, passed through
# from_unixtime() (presumably epoch seconds -- confirm against the data set
# docs) and then parsed into a proper timestamp column.
tags = spark.read.csv(
    "/home/jovyan/data-sets/ml-latest-small/tags.csv",
    header=True,
    schema="userId INT, movieId INT, tag STRING, timestamp STRING",
).withColumn("timestamp", to_timestamp(from_unixtime(col("timestamp"))))
tags.show()

# Unique (movieId, tag) pairs, ordered by movie.
# NOTE(review): the name is misspelled ("distint"); it is kept unchanged here
# because other cells may reference it -- rename everywhere in one pass.
distint_tags_per_movie = tags.select("movieId", "tag").distinct().sort("movieId")

# Collect each movie's unique tags into a single array column named `tags`.
# The aggregate is aliased directly rather than renamed afterwards via Spark's
# auto-generated "collect_set(tag)" column name, which is easy to get wrong.
tags_per_movie = (
    tags
    .groupBy("movieId")
    .agg(collect_set("tag").alias("tags"))
    .sort("movieId")
)

tags_per_movie.show()

+------+-------+-----------------+-------------------+
|userId|movieId|              tag|          timestamp|
+------+-------+-----------------+-------------------+
|     2|  60756|            funny|2015-10-24 19:29:54|
|     2|  60756|  Highly quotable|2015-10-24 19:29:56|
|     2|  60756|     will ferrell|2015-10-24 19:29:52|
|     2|  89774|     Boxing story|2015-10-24 19:33:27|
|     2|  89774|              MMA|2015-10-24 19:33:20|
|     2|  89774|        Tom Hardy|2015-10-24 19:33:25|
|     2| 106782|            drugs|2015-10-24 19:30:54|
|     2| 106782|Leonardo DiCaprio|2015-10-24 19:30:51|
|     2| 106782|  Martin Scorsese|2015-10-24 19:30:56|
|     7|  48516|     way too long|2007-01-25 01:08:45|
|    18|    431|        Al Pacino|2016-05-01 21:39:25|
|    18|    431|         gangster|2016-05-01 21:39:09|
|    18|    431|            mafia|2016-05-01 21:39:15|
|    18|   1221|        Al Pacino|2016-04-26 19:35:06|
|    18|   1221|            Mafia|2016-04-26 19:35:03|
|    18|  

In [64]:
# Attach the per-movie tag arrays to the movie rows. This is an inner join on
# movieId, so movies that have no tags are absent from the result.
movies_with_tags = movies.join(tags_per_movie, on="movieId", how="inner")

# Preview the joined rows and verify the resulting column types.
movies_with_tags.show()
movies_with_tags.printSchema()

+-------+--------------------+--------------------+--------------------+
|movieId|               title|              genres|                tags|
+-------+--------------------+--------------------+--------------------+
|      1|    Toy Story (1995)|[Adventure, Anima...|        [pixar, fun]|
|      2|      Jumanji (1995)|[Adventure, Child...|[fantasy, game, m...|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|        [old, moldy]|
|      5|Father of the Bri...|            [Comedy]| [pregnancy, remake]|
|      7|      Sabrina (1995)|   [Comedy, Romance]|            [remake]|
|     11|American Presiden...|[Comedy, Drama, R...|[politics, presid...|
|     14|        Nixon (1995)|             [Drama]|[politics, presid...|
|     16|       Casino (1995)|      [Crime, Drama]|             [Mafia]|
|     17|Sense and Sensibi...|    [Drama, Romance]|       [Jane Austen]|
|     21|   Get Shorty (1995)|[Comedy, Crime, T...|         [Hollywood]|
|     22|      Copycat (1995)|[Crime, Drama, Ho...|