# Load Data

In [1]:
# Environment setup: every step in this cell is a shell command (IPython `!` escape).
# Install the headless OpenJDK 8 package quietly (-qq), discarding stdout.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 2.4.5 (Hadoop 2.7 build) binary tarball from the Apache archive, quietly.
!wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# Extract the tarball into the current working directory.
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
# Install findspark quietly; a later cell calls findspark.init() before importing pyspark.
!pip install -q findspark
# List the installed JVMs to confirm Java 8 is present
# (Google Colab also has Java 11 available).
!ls /usr/lib/jvm
# Install pyarrow, upgrading (-U) to the newest release if it is already installed.
# NOTE(review): the version is not pinned, so reruns may pick up a different pyarrow.
!pip install -U pyarrow

default-java		   java-11-openjdk-amd64     java-8-openjdk-amd64
java-1.11.0-openjdk-amd64  java-1.8.0-openjdk-amd64
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
spark_params = {
"spark.executor.memory" : "4g",
"spark.driver.memory": "4g",
"spark.memory.fraction": "0.9"}
for param, value in spark_params.items():
  spark.conf.set(param, value)

!wget "https://datasets.imdbws.com/name.basics.tsv.gz"
!wget "https://datasets.imdbws.com/title.akas.tsv.gz"
!wget "https://datasets.imdbws.com/title.basics.tsv.gz"
!wget "https://datasets.imdbws.com/title.crew.tsv.gz"
!wget "https://datasets.imdbws.com/title.episode.tsv.gz"
!wget "https://datasets.imdbws.com/title.principals.tsv.gz"
!wget "https://datasets.imdbws.com/title.ratings.tsv.gz"

title_ratings = spark.read.csv("title.ratings.tsv.gz", sep='\t', header=True)
title_principals = spark.read.csv("title.principals.tsv.gz", sep='\t',header=True)
title_episode = spark.read.csv("title.episode.tsv.gz", sep='\t', header=True)
title_crew = spark.read.csv("title.crew.tsv.gz", sep='\t', header=True)
title_basics = spark.read.csv("title.basics.tsv.gz", sep='\t', header=True)
title_akas = spark.read.csv("title.akas.tsv.gz", sep='\t', header=True)
name_basics = spark.read.csv("name.basics.tsv.gz", sep='\t', header=True)

--2022-11-04 16:48:17--  https://datasets.imdbws.com/name.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 65.9.86.67, 65.9.86.88, 65.9.86.82, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|65.9.86.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 235890561 (225M) [binary/octet-stream]
Saving to: ‘name.basics.tsv.gz.1’


2022-11-04 16:48:20 (77.4 MB/s) - ‘name.basics.tsv.gz.1’ saved [235890561/235890561]

--2022-11-04 16:48:20--  https://datasets.imdbws.com/title.akas.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 65.9.86.67, 65.9.86.88, 65.9.86.82, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|65.9.86.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 286499313 (273M) [binary/octet-stream]
Saving to: ‘title.akas.tsv.gz.1’


2022-11-04 16:48:24 (71.2 MB/s) - ‘title.akas.tsv.gz.1’ saved [286499313/286499313]

--2022-11-04 16:48:24--  https://datasets.imdbws.com/title.basics

# Prepare Data

Data overview

In [3]:
# Preview the first two rows of each loaded table, printing the table's name
# above its rows. The order matches the order the tables are listed in.
table_previews = [
    ("title_ratings", title_ratings),
    ("title_principals", title_principals),
    ("title_episode", title_episode),
    ("title_crew", title_crew),
    ("title_basics", title_basics),
    ("title_akas", title_akas),
    ("name_basics", name_basics),
]
for label, frame in table_previews:
    print(label)
    frame.show(2)

title_ratings
+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1923|
|tt0000002|          5.8|     260|
+---------+-------------+--------+
only showing top 2 rows

title_principals
+---------+--------+---------+--------+---+----------+
|   tconst|ordering|   nconst|category|job|characters|
+---------+--------+---------+--------+---+----------+
|tt0000001|       1|nm1588970|    self| \N|  ["Self"]|
|tt0000001|       2|nm0005690|director| \N|        \N|
+---------+--------+---------+--------+---+----------+
only showing top 2 rows

title_episode
+---------+------------+------------+-------------+
|   tconst|parentTconst|seasonNumber|episodeNumber|
+---------+------------+------------+-------------+
|tt0041951|   tt0041038|           1|            9|
|tt0042816|   tt0989125|           1|           17|
+---------+------------+------------+-------------+
only showing top 2 rows

title_crew
+---------+------

Join data

In [4]:
# Tables that take part in the join, keyed by the name used in the printout.
# An explicit mapping replaces eval() on variable names: same output, but no
# string evaluation and a typo fails right here instead of inside eval().
tables_to_join = {
    "title_basics": title_basics,
    "title_ratings": title_ratings,
    "title_principals": title_principals,
    "title_episode": title_episode,
    "name_basics": name_basics,
}
to_print = list(tables_to_join)

# Report (row count, column count) per input table; each count() runs a Spark job.
for p, frame in tables_to_join.items():
    print(f"Dimension {p}: ({frame.count()}, {len(frame.columns)})")

# Left-join everything onto title_basics so every title is kept. The first
# three joins are keyed on the title id (tconst); name_basics is joined last on
# the person id (nconst), a column that title_principals brings in.
data = (
    title_basics
    .join(title_ratings, how="left", on="tconst")
    .join(title_principals, how="left", on="tconst")
    .join(title_episode, how="left", on="tconst")
    .join(name_basics, how="left", on="nconst")
)

print(f"Dimension data = ({data.count()}, {len(data.columns)})")

Dimension title_basics: (9346838, 9)
Dimension title_ratings: (1243998, 3)
Dimension title_principals: (52877285, 6)
Dimension title_episode: (7055394, 4)
Dimension name_basics: (12052333, 6)
Dimension data = (53798200, 24)


In [5]:
# Show up to 10 rows from a ~0.01% random sample of the joined data.
# show() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line after the table).
data.sample(0.0001).show(10)

+---------+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-------------+--------+--------+--------+---+-------------------+------------+------------+-------------+-----------------+---------+---------+--------------------+--------------------+
|   nconst|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|averageRating|numVotes|ordering|category|job|         characters|parentTconst|seasonNumber|episodeNumber|      primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-------------+--------+--------+--------+---+-------------------+------------+------------+-------------+-----------------+---------+---------+--------------------+--------------------+
|nm0131254|tt22026256|tvEpisode|  