In [4]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.utils import AnalysisException

In [5]:
# Reuse the active SparkSession if one exists, otherwise start one in local
# mode (`local[*]`) under the application name "Hands-on-4".
spark = SparkSession.builder.master("local[*]").appName("Hands-on-4").getOrCreate()

### Setting up where we left off

In [6]:
# Load the ratings CSV with an explicit schema, then turn the epoch-seconds
# `timestamp` column into a real timestamp.
#
# Casting the integer column directly interprets it as seconds since the Unix
# epoch. A from_unixtime -> to_timestamp round trip would instead format every
# instant as a local-time string and parse it back, which is slower and cannot
# tell apart the repeated hour at a DST fall-back in the session time zone.
df_ratings = (
    spark
    .read
    .csv(
        path="../../data-sets/ml-latest/ratings.csv",
        header=True,
        encoding="UTF-8",
        sep=",",
        quote='"',
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
    )
    .withColumn("timestamp", f.col("timestamp").cast("timestamp"))
)

In [7]:
# Preview five ratings and confirm the column types after the timestamp conversion.
df_ratings.show(n=5)
df_ratings.printSchema()

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    307|   3.5|2009-10-28 02:30:21|
|     1|    481|   3.5|2009-10-28 02:34:16|
|     1|   1091|   1.5|2009-10-28 02:34:31|
|     1|   1257|   4.5|2009-10-28 02:34:20|
|     1|   1449|   4.5|2009-10-28 02:31:04|
+------+-------+------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [8]:
# Movie catalogue, read with an explicit schema. `genres` is kept as the raw
# string from the CSV at this stage.
df_movies = (
    spark.read
    .schema("movieId INT, title STRING, genres STRING")
    .options(header=True, encoding="UTF-8", sep=",", quote='"')
    .csv("../../data-sets/ml-latest/movies.csv")
)

In [9]:
# Preview 15 movies without truncating long titles/genre strings, then print the schema.
df_movies.show(n=15, truncate=False)
df_movies.printSchema()

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
|6      |Heat (1995)                       |Action|Crime|Thriller                      |
|7      |Sabrina (1995)                    |Comedy|Romance                             |
|8      |Tom and Huck (1995)               |Adventure|Children                         |
|9      |Sudden Death

In [10]:
# One row per (movie, genre): split the pipe-separated `genres` string into an
# array, explode that array into a `genre` column, then drop the array.
#
# f.split takes a regex pattern, so the pipe has to be escaped. The raw string
# keeps the backslash literal and avoids Python's invalid-escape-sequence
# warning that a plain "\|" literal triggers.
df_movie_genre = (
    df_movies
    .withColumn("genres", f.split("genres", r"\|"))
    .withColumn("genre", f.explode("genres"))
    .drop("genres")
)

In [11]:
# Check the exploded result: each movie should now appear once per genre.
df_movie_genre.show(n=15, truncate=False)
df_movie_genre.printSchema()

+-------+----------------------------------+---------+
|movieId|title                             |genre    |
+-------+----------------------------------+---------+
|1      |Toy Story (1995)                  |Adventure|
|1      |Toy Story (1995)                  |Animation|
|1      |Toy Story (1995)                  |Children |
|1      |Toy Story (1995)                  |Comedy   |
|1      |Toy Story (1995)                  |Fantasy  |
|2      |Jumanji (1995)                    |Adventure|
|2      |Jumanji (1995)                    |Children |
|2      |Jumanji (1995)                    |Fantasy  |
|3      |Grumpier Old Men (1995)           |Comedy   |
|3      |Grumpier Old Men (1995)           |Romance  |
|4      |Waiting to Exhale (1995)          |Comedy   |
|4      |Waiting to Exhale (1995)          |Drama    |
|4      |Waiting to Exhale (1995)          |Romance  |
|5      |Father of the Bride Part II (1995)|Comedy   |
|6      |Heat (1995)                       |Action   |
+-------+-

In [12]:
# Every distinct value that occurs in the exploded `genre` column.
available_genres = df_movie_genre.select("genre").distinct()

In [13]:
# Bring the distinct genres back to the driver as a list of Row objects.
available_genres.collect()

[Row(genre='Crime'),
 Row(genre='Romance'),
 Row(genre='Thriller'),
 Row(genre='Adventure'),
 Row(genre='Drama'),
 Row(genre='War'),
 Row(genre='Documentary'),
 Row(genre='Fantasy'),
 Row(genre='Mystery'),
 Row(genre='Musical'),
 Row(genre='Animation'),
 Row(genre='Film-Noir'),
 Row(genre='(no genres listed)'),
 Row(genre='IMAX'),
 Row(genre='Horror'),
 Row(genre='Western'),
 Row(genre='Comedy'),
 Row(genre='Children'),
 Row(genre='Action'),
 Row(genre='Sci-Fi')]

In [14]:
# Movies whose raw `genres` value is exactly the "(no genres listed)" placeholder.
movies_without_genres = df_movies.filter(
    df_movies["genres"] == "(no genres listed)"
)

In [15]:
# Report how many movies carry the placeholder genre, then display some of them
# with full (untruncated) titles.
print(f"Total {movies_without_genres.count()} movie(s) does not have genres")
movies_without_genres.show(truncate=False)

Total 4266 movie(s) does not have genres
+-------+-----------------------------------------------+------------------+
|movieId|title                                          |genres            |
+-------+-----------------------------------------------+------------------+
|83773  |Away with Words (San tiao ren) (1999)          |(no genres listed)|
|83829  |Scorpio Rising (1964)                          |(no genres listed)|
|84768  |Glitterbug (1994)                              |(no genres listed)|
|86493  |Age of the Earth, The (A Idade da Terra) (1980)|(no genres listed)|
|87061  |Trails (Veredas) (1978)                        |(no genres listed)|
|91246  |Milky Way (Tejút) (2007)                       |(no genres listed)|
|92435  |Dancing Hawk, The (Tanczacy jastrzab) (1978)   |(no genres listed)|
|92641  |Warsaw Bridge (Pont de Varsòvia) (1990)        |(no genres listed)|
|94431  |Ella Lola, a la Trilby (1898)                  |(no genres listed)|
|94657  |Turkish Dance, Ella Lola (

### Let's read links.csv and tags.csv for further analysis

In [16]:
# Mapping from movieId to the external IMDb / TMDb identifiers.
# NOTE(review): both external ids are read as INT. If the CSV stores imdbId
# with leading zeros, they are lost here -- confirm whether STRING is needed.
df_links = (
    spark.read
    .schema("movieId INT, imdbId INT, tmdbId INT")
    .options(encoding="UTF-8", header=True, sep=",", quote='"')
    .csv("../../data-sets/ml-latest/links.csv")
)

In [17]:
# Preview five link rows and confirm all three id columns were read as integers.
df_links.show(n=5)
df_links.printSchema()

+-------+------+------+
|movieId|imdbId|tmdbId|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
+-------+------+------+
only showing top 5 rows

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)



In [18]:
# Load the user-supplied tags with an explicit schema, then turn the
# epoch-seconds `timestamp` column into a real timestamp.
#
# Casting the integer column directly interprets it as seconds since the Unix
# epoch. A from_unixtime -> to_timestamp round trip would instead format every
# instant as a local-time string and parse it back, which is slower and cannot
# tell apart the repeated hour at a DST fall-back in the session time zone.
df_tags = (
    spark
    .read
    .csv(
        path="../../data-sets/ml-latest/tags.csv",
        encoding="UTF-8",
        header=True,
        sep=",",
        quote='"',
        schema="userId INT, movieId INT, tag STRING, timestamp INT",
    )
    .withColumn("timestamp", f.col("timestamp").cast("timestamp"))
)

In [19]:
# Preview five tags and confirm the column types after the timestamp conversion.
df_tags.show(n=5)
df_tags.printSchema()

+------+-------+------------+-------------------+
|userId|movieId|         tag|          timestamp|
+------+-------+------------+-------------------+
|    14|    110|        epic|2015-09-25 08:05:38|
|    14|    110|    Medieval|2015-09-25 08:05:32|
|    14|    260|      sci-fi|2015-09-14 00:06:50|
|    14|    260|space action|2015-09-14 00:07:01|
|    14|    318|imdb top 250|2015-09-19 03:56:35|
+------+-------+------------+-------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



### Let's compute the number of movies per genre

In [20]:
# Number of movies in each genre. The aggregate is written out explicitly and
# named `count`, the same column name the `.count()` shorthand produces.
df_movies_per_genre = df_movie_genre.groupBy("genre").agg(
    f.count(f.lit(1)).alias("count")
)

In [26]:
# Show the per-genre movie counts and the schema of the aggregated frame.
df_movies_per_genre.show()
df_movies_per_genre.printSchema()

+------------------+-----+
|             genre|count|
+------------------+-----+
|             Crime| 5105|
|           Romance| 7412|
|          Thriller| 8216|
|         Adventure| 4067|
|             Drama|24144|
|               War| 1820|
|       Documentary| 5118|
|           Fantasy| 2637|
|           Mystery| 2773|
|           Musical| 1113|
|         Animation| 2663|
|         Film-Noir|  364|
|(no genres listed)| 4266|
|              IMAX|  197|
|            Horror| 5555|
|           Western| 1378|
|            Comedy|15956|
|          Children| 2749|
|            Action| 7130|
|            Sci-Fi| 3444|
+------------------+-----+

root
 |-- genre: string (nullable = true)
 |-- count: long (nullable = false)



### Let's join df_movies and df_tags to get movie title with tags

In [44]:
# Inner join of movies and tags on an explicit equality condition. With a
# Column expression as the condition, each side keeps its own `movieId`, so the
# result has two such columns. "inner" is already the default join type; it is
# spelled out so the intent is unmistakable.
df_opinion = df_movies.join(
    df_tags,
    df_movies.movieId == df_tags.movieId,
    "inner",
)

In [45]:
# Inspect the joined frame; note that `movieId` appears twice in the header and schema.
df_opinion.show(n=15, truncate=False)
df_opinion.printSchema()

+-------+-----------------------------------------+--------------------------------------+------+-------+--------------+-------------------+
|movieId|title                                    |genres                                |userId|movieId|tag           |timestamp          |
+-------+-----------------------------------------+--------------------------------------+------+-------+--------------+-------------------+
|110    |Braveheart (1995)                        |Action|Drama|War                      |14    |110    |epic          |2015-09-25 08:05:38|
|110    |Braveheart (1995)                        |Action|Drama|War                      |14    |110    |Medieval      |2015-09-25 08:05:32|
|260    |Star Wars: Episode IV - A New Hope (1977)|Action|Adventure|Sci-Fi               |14    |260    |sci-fi        |2015-09-14 00:06:50|
|260    |Star Wars: Episode IV - A New Hope (1977)|Action|Adventure|Sci-Fi               |14    |260    |space action  |2015-09-14 00:07:01|
|318    |Shaw

### That's not easy to read, so let's select only the useful columns

In [46]:
# Intentional demonstration: the condition-based join leaves two `movieId`
# columns in the result, so selecting "movieId" by name is ambiguous and Spark
# raises an AnalysisException. The error is caught and printed so that the
# notebook still runs top to bottom; `df_opinion` keeps its previous value.
try:
    df_opinion = (
        df_movies
        .join(
            df_tags,
            on=df_movies["movieId"] == df_tags["movieId"],
            how="inner",
        )
        .select("userId", "movieId", "title", "tag", "timestamp")
    )
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")

AnalysisException: Reference 'movieId' is ambiguous, could be: movieId, movieId.;

### Notice that we have two movieId columns, so if we want to select only one of them we can't do it the usual way. A workaround is to use a different syntax for the `on` argument. Since we know that movieId is present in both dataframes, we can pass the join columns as a list of column names (the list notation for the join clause)

In [47]:
# Same inner join, but keyed by a list of column names instead of a Column
# expression, so the result carries a single `movieId` column.
df_opinion = df_movies.join(df_tags, on=["movieId"], how="inner")

In [48]:
# Inspect the name-based join result; `movieId` should now appear only once.
df_opinion.show(n=15, truncate=False)
df_opinion.printSchema()

+-------+-----------------------------------------+--------------------------------------+------+--------------+-------------------+
|movieId|title                                    |genres                                |userId|tag           |timestamp          |
+-------+-----------------------------------------+--------------------------------------+------+--------------+-------------------+
|110    |Braveheart (1995)                        |Action|Drama|War                      |14    |epic          |2015-09-25 08:05:38|
|110    |Braveheart (1995)                        |Action|Drama|War                      |14    |Medieval      |2015-09-25 08:05:32|
|260    |Star Wars: Episode IV - A New Hope (1977)|Action|Adventure|Sci-Fi               |14    |sci-fi        |2015-09-14 00:06:50|
|260    |Star Wars: Episode IV - A New Hope (1977)|Action|Adventure|Sci-Fi               |14    |space action  |2015-09-14 00:07:01|
|318    |Shawshank Redemption, The (1994)         |Crime|Drama       

### Now we can see that we only have one column for movieId. So we can use .select without any ambiguity

In [49]:
# Name-based inner join of movies and tags, trimmed to the columns needed for
# the analysis (this drops `genres`).
df_opinion = (
    df_movies
    .join(df_tags, on=["movieId"], how="inner")
    .select("userId", "movieId", "title", "tag", "timestamp")
)

In [50]:
# Preview the trimmed movie/tag frame and its schema.
df_opinion.show(n=15, truncate=False)
df_opinion.printSchema()

+------+-------+-----------------------------------------+--------------+-------------------+
|userId|movieId|title                                    |tag           |timestamp          |
+------+-------+-----------------------------------------+--------------+-------------------+
|14    |110    |Braveheart (1995)                        |epic          |2015-09-25 08:05:38|
|14    |110    |Braveheart (1995)                        |Medieval      |2015-09-25 08:05:32|
|14    |260    |Star Wars: Episode IV - A New Hope (1977)|sci-fi        |2015-09-14 00:06:50|
|14    |260    |Star Wars: Episode IV - A New Hope (1977)|space action  |2015-09-14 00:07:01|
|14    |318    |Shawshank Redemption, The (1994)         |imdb top 250  |2015-09-19 03:56:35|
|14    |318    |Shawshank Redemption, The (1994)         |justice       |2015-09-19 03:56:32|
|14    |480    |Jurassic Park (1993)                     |Dinosaurs     |2015-09-25 08:06:03|
|14    |593    |Silence of the Lambs, The (1991)         |ps

### Much better!

In [52]:
# Attach each user's rating of a movie to the tags that same user gave it.
# The tag timestamp is renamed to `tag_time` first so it does not clash with
# the `timestamp` column coming from df_ratings. The join type is stated
# explicitly, matching the other joins in this notebook ("inner" is the default).
df_opinion_ext = (
    df_opinion
    .withColumnRenamed("timestamp", "tag_time")
    .join(
        df_ratings,
        on=["movieId", "userId"],
        how="inner",
    )
)

In [53]:
# Preview the tag + rating frame; `tag_time` is the tag's timestamp and
# `timestamp` is the one that came from df_ratings.
df_opinion_ext.show(n=15, truncate=False)
df_opinion_ext.printSchema()

+-------+------+----------------------------+----------------+-------------------+------+-------------------+
|movieId|userId|title                       |tag             |tag_time           |rating|timestamp          |
+-------+------+----------------------------+----------------+-------------------+------+-------------------+
|1      |277453|Toy Story (1995)            |animation       |2006-05-12 10:43:44|4.0   |2006-05-12 10:43:47|
|1      |277453|Toy Story (1995)            |Disney          |2006-05-12 21:35:42|4.0   |2006-05-12 10:43:47|
|1      |277453|Toy Story (1995)            |Pixar           |2006-05-12 10:43:42|4.0   |2006-05-12 10:43:47|
|1      |277453|Toy Story (1995)            |toys            |2006-05-12 21:35:40|4.0   |2006-05-12 10:43:47|
|6      |11476 |Heat (1995)                 |Al Pacino       |2013-08-15 20:10:02|3.5   |2013-08-15 20:09:35|
|6      |11476 |Heat (1995)                 |crime           |2013-08-15 20:10:11|3.5   |2013-08-15 20:09:35|
|6      |1

In [54]:
# Number of rows in the movie catalogue.
df_movies.count()

58098

In [55]:
# Number of rows in the ratings data.
df_ratings.count()

27753444

In [56]:
# Number of rows left after joining tags to ratings on (movieId, userId).
df_opinion_ext.count()

830492

In [None]:
# Stop the SparkSession now that the notebook is finished.
spark.stop()