# Spark Setup and Data Load

## Installation of Spark

In [None]:
# Install a headless Java 8 JDK (apt output is discarded)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 2.4.5 build for Hadoop 2.7 from the Apache archive
!wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# Unpack the archive into the current working directory
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
# Install findspark, which is used below to locate this Spark installation
!pip install -q findspark
# List the installed JVMs: the runtime may ship another Java version (e.g. Java 11 on Colab),
# so check that the Java 8 directory referenced by JAVA_HOME below is present
!ls /usr/lib/jvm
# Install (or upgrade to) the latest pyarrow
!pip install -U pyarrow

In [None]:
import os
import findspark

Now we need to set up the environment variables that the Spark session relies on.

In [None]:
# Point the environment at the Java 8 JDK and at the unpacked Spark 2.4.5
# distribution so that the Spark session created later can find both.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
    }
)

Next we call `findspark.init()`, which locates the Spark installation (via `SPARK_HOME`) and makes `pyspark` importable.

In [None]:
# Order matters: findspark.init() runs first so that the pyspark import below
# can be resolved (presumably from the distribution at SPARK_HOME — see the
# findspark documentation).
findspark.init()
from pyspark.sql import SparkSession

Time to initiate a Spark Session

In [None]:
# Spark properties for this session.
spark_params = {
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.memory.fraction": "0.9",
}

# These memory properties are read when the driver JVM / SparkContext starts.
# Setting them with spark.conf.set() after getOrCreate() is too late to have
# any effect, so they are passed to the builder *before* the session is created.
# NOTE: if a session already exists in this kernel, getOrCreate() returns it and
# the memory settings still will not change; restart the runtime to apply them.
builder = SparkSession.builder.master("local[*]")
for param, value in spark_params.items():
    builder = builder.config(param, value)

# Local session that uses all available cores.
spark = builder.getOrCreate()

In [None]:
# Display the session object as the cell output to confirm it was created
spark

## Load dataset

In [None]:
!wget "https://datasets.imdbws.com/name.basics.tsv.gz"
!wget "https://datasets.imdbws.com/title.akas.tsv.gz"
!wget "https://datasets.imdbws.com/title.basics.tsv.gz"
!wget "https://datasets.imdbws.com/title.crew.tsv.gz"
!wget "https://datasets.imdbws.com/title.episode.tsv.gz"
!wget "https://datasets.imdbws.com/title.principals.tsv.gz"
!wget "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [None]:
def _load_tsv(file_name):
    """Read one gzipped, tab-separated file with a header row into a Spark DataFrame.

    No schema is supplied or inferred, so every column is loaded as a string.
    """
    # NOTE(review): IMDb appears to mark missing values with the literal "\N";
    # they are kept as plain strings here — confirm how later cells handle them.
    return spark.read.csv(file_name, sep='\t', header=True)


# One DataFrame per downloaded dataset file.
title_ratings = _load_tsv("title.ratings.tsv.gz")
title_principals = _load_tsv("title.principals.tsv.gz")
title_episode = _load_tsv("title.episode.tsv.gz")
title_crew = _load_tsv("title.crew.tsv.gz")
title_basics = _load_tsv("title.basics.tsv.gz")
title_akas = _load_tsv("title.akas.tsv.gz")
name_basics = _load_tsv("name.basics.tsv.gz")

In [None]:
# Quick sanity check that loading worked: print the first 5 rows of the ratings table
title_ratings.show(5)

# Prepare Data

Data overview

In [None]:
# (label, DataFrame) pairs in the order they should be previewed.
overview_tables = [
    ("title_ratings", title_ratings),
    ("title_principals", title_principals),
    ("title_episode", title_episode),
    ("title_crew", title_crew),
    ("title_basics", title_basics),
    ("title_akas", title_akas),
    ("name_basics", name_basics),
]

# Print each table's name followed by its first two rows.
for table_name, table_df in overview_tables:
    print(table_name)
    table_df.show(2)

Join tables

In [None]:
def _report_join(label, df):
    """Print `label` followed by the (row count, column count) of `df`.

    Note that df.count() triggers a full Spark job each time it is called.
    """
    print(f"{label}\n" +
          f"Dimension: ({df.count()}, {len(df.columns)})")


# Keep only the rows of title_akas flagged as the original title, reduce them
# to the distinct (titleId, region, language) combinations, and rename the key
# column so it matches the "tconst" key used by the other tables.
temp_akas = (
    title_akas
    .filter(title_akas.isOriginalTitle == 1)
    .select(["titleId", "region", "language"])
    .distinct()
    .withColumnRenamed("titleId", "tconst")
)

# Explicit name -> DataFrame mapping; avoids calling eval() on the table names.
tables_by_name = {
    "title_basics": title_basics,
    "title_ratings": title_ratings,
    "title_principals": title_principals,
    "title_episode": title_episode,
    "name_basics": name_basics,
    "temp_akas": temp_akas,
}
to_print = list(tables_by_name)

# Dimensions of every input table before joining.
for p in to_print:
    table = tables_by_name[p]
    print(f"Dimension {p}: ({table.count()}, {len(table.columns)})")

# Left joins throughout, starting from title_basics, so no title_basics row is
# dropped. The dimensions are reported after every step.
data = title_basics.join(title_ratings, how="left", on="tconst")
# Fixed message: this step joins title_ratings (it previously said title_principals).
_report_join("\nJoined title_ratings to title_basics", data)

data = data.join(title_principals, how="left", on="tconst")
_report_join("Joined title_principals", data)

data = data.join(title_episode, how="left", on="tconst")
_report_join("Joined title_episode", data)

# name_basics is keyed by nconst, which comes from title_principals.
data = data.join(name_basics, how="left", on="nconst")
_report_join("Joined name_basics", data)

data = data.join(temp_akas, how="left", on="tconst")
_report_join("Joined temp_akas", data)


Show sample rows

In [None]:
# Draw 10 random rows (without replacement) from the joined data.
# No seed is given, so the selection differs from run to run.
sample = data.rdd.takeSample(withReplacement=False, num=10)

In [None]:
import pandas as pd

# Remove the limit on displayed columns so the wide joined rows are shown in full.
pd.options.display.max_columns = None

# Put the sampled Spark rows into a pandas DataFrame, reusing the Spark column
# names, and show it as the cell output.
sample_pdf = pd.DataFrame(data=sample, columns=data.columns)
sample_pdf