# Spark Setup and Data Load

## Installation of Spark

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark2.4.5
!wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# unzip it
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
# install findspark
!pip install -q findspark
# Google Colab has Java 11 available, test it using below command -
!ls /usr/lib/jvm
#install pyarrow
!pip install -U pyarrow

default-java		   java-11-openjdk-amd64     java-8-openjdk-amd64
java-1.11.0-openjdk-amd64  java-1.8.0-openjdk-amd64
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyarrow
  Downloading pyarrow-10.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.4 MB)
[K     |████████████████████████████████| 35.4 MB 400 kB/s 
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 6.0.1
    Uninstalling pyarrow-6.0.1:
      Successfully uninstalled pyarrow-6.0.1
Successfully installed pyarrow-10.0.0


In [2]:
import os
import findspark

Now we need to set up the environment variables that our Spark session needs in order to work.

In [3]:
# Point Spark at the Java 8 JVM and at the unpacked Spark 2.4.5 distribution.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.4.5-bin-hadoop2.7",
)

`findspark.init()` locates the Spark installation (using `SPARK_HOME`) and makes `pyspark` importable, so we call it before importing anything from `pyspark`.

In [4]:
# Initialise findspark so that pyspark can be imported in this kernel.
# NOTE(review): presumably this resolves the installation from the SPARK_HOME
# variable set earlier in the notebook — confirm.
findspark.init()
# Kept below findspark.init() on purpose: only findspark and pyarrow are
# pip-installed in this notebook, so pyspark presumably comes from the
# downloaded Spark distribution and is not importable before init() runs.
from pyspark.sql import SparkSession

Time to initiate a Spark Session

In [5]:
# Spark settings for this notebook.
# They are handed to the builder *before* getOrCreate(): memory sizes are read
# when the session (and its JVM) is created, so assigning them afterwards with
# spark.conf.set() on a live session does not resize anything.
spark_params = {
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.memory.fraction": "0.9",
}

# Local mode, using every available core.
builder = SparkSession.builder.master("local[*]")
for param, value in spark_params.items():
    builder = builder.config(param, value)

# NOTE: getOrCreate() returns an already-running session if there is one; the
# memory settings only take effect when this cell creates the session, i.e. on
# a fresh kernel (Restart -> Run All).
spark = builder.getOrCreate()

In [6]:
# Echo the session object so the notebook renders it as this cell's output.
spark

## Load dataset

In [7]:
!wget "https://datasets.imdbws.com/name.basics.tsv.gz"
!wget "https://datasets.imdbws.com/title.akas.tsv.gz"
!wget "https://datasets.imdbws.com/title.basics.tsv.gz"
!wget "https://datasets.imdbws.com/title.crew.tsv.gz"
!wget "https://datasets.imdbws.com/title.episode.tsv.gz"
!wget "https://datasets.imdbws.com/title.principals.tsv.gz"
!wget "https://datasets.imdbws.com/title.ratings.tsv.gz"

--2022-10-31 19:02:29--  https://datasets.imdbws.com/name.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.226.210.25, 13.226.210.101, 13.226.210.114, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.226.210.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 235710020 (225M) [binary/octet-stream]
Saving to: ‘name.basics.tsv.gz’


2022-10-31 19:02:45 (14.1 MB/s) - ‘name.basics.tsv.gz’ saved [235710020/235710020]

--2022-10-31 19:02:45--  https://datasets.imdbws.com/title.akas.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.226.210.25, 13.226.210.101, 13.226.210.114, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.226.210.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 286163343 (273M) [binary/octet-stream]
Saving to: ‘title.akas.tsv.gz’


2022-10-31 19:03:03 (15.5 MB/s) - ‘title.akas.tsv.gz’ saved [286163343/286163343]

--2022-10-31 19:03:03--  https://datasets.imd

In [8]:
def _read_imdb_tsv(filename):
    """Load one gzipped, tab-separated IMDb dump into a Spark DataFrame.

    The first row of the file is used as the header; no schema is supplied
    and no other reader options are set.
    """
    return spark.read.csv(filename, sep='\t', header=True)


# NOTE(review): no nullValue option is passed, so IMDb's "\N" missing-value
# marker presumably arrives as a literal string — confirm how later cells
# handle it.
title_ratings = _read_imdb_tsv("title.ratings.tsv.gz")
title_principals = _read_imdb_tsv("title.principals.tsv.gz")
title_episode = _read_imdb_tsv("title.episode.tsv.gz")
title_crew = _read_imdb_tsv("title.crew.tsv.gz")
title_basics = _read_imdb_tsv("title.basics.tsv.gz")
title_akas = _read_imdb_tsv("title.akas.tsv.gz")
name_basics = _read_imdb_tsv("name.basics.tsv.gz")

In [9]:
# Preview the first five rows of the ratings table as a sanity check on the load.
title_ratings.show(n=5)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1921|
|tt0000002|          5.8|     260|
|tt0000003|          6.5|    1729|
|tt0000004|          5.6|     174|
|tt0000005|          6.2|    2543|
+---------+-------------+--------+
only showing top 5 rows

