# Wczytanie danych

In [1]:
# install the headless Java 8 JDK (apt output is discarded)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download Spark 3.3.1 (the "bin-hadoop2" build) from the Apache archive
!wget -q https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.tgz
# unpack the archive into the current working directory
!tar xf spark-3.3.1-bin-hadoop2.tgz
# install findspark
!pip install -q findspark
# Google Colab also ships Java 11; list the installed JVMs to confirm Java 8 is there
!ls /usr/lib/jvm
# upgrade pyarrow to the newest release
# NOTE(review): this unpinned upgrade installed pyarrow 10.0.1, which pip reported as
# incompatible with pandas-gbq and db-dtypes (both require pyarrow<10.0dev) - consider pinning
!pip install -U pyarrow


default-java		   java-11-openjdk-amd64     java-8-openjdk-amd64
java-1.11.0-openjdk-amd64  java-1.8.0-openjdk-amd64
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyarrow
  Downloading pyarrow-10.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.0 MB)
[K     |████████████████████████████████| 36.0 MB 292 kB/s 
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 9.0.0
    Uninstalling pyarrow-9.0.0:
      Successfully uninstalled pyarrow-9.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.
db-dtypes 1.0.4 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.[0m
Successfully installed py

In [3]:
import os
import findspark


# Point PySpark at the JDK and the Spark distribution installed by the setup cell.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop2"

# findspark.init() puts the pyspark shipped under SPARK_HOME on sys.path,
# so the pyspark import has to come after it.
findspark.init()
from pyspark.sql import SparkSession

spark_params = {
"spark.executor.memory" : "4g",
"spark.driver.memory": "4g",
"spark.memory.fraction": "0.9"}

# Memory settings are fixed once the session exists, so they are handed to the
# builder instead of being set through spark.conf.set() afterwards.
# NOTE: getOrCreate() returns an already running session unchanged; restart the
# runtime after editing spark_params for the new values to take effect.
builder = SparkSession.builder.master("local[*]")
for param, value in spark_params.items():
  builder = builder.config(param, value)
spark = builder.getOrCreate()
spark

In [5]:
!wget "https://datasets.imdbws.com/title.basics.tsv.gz"
title_basics = spark.read.csv("title.basics.tsv.gz", sep='\t', header=True)

--2022-12-05 20:55:26--  https://datasets.imdbws.com/title.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 18.160.46.72, 18.160.46.120, 18.160.46.19, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|18.160.46.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 164632109 (157M) [binary/octet-stream]
Saving to: ‘title.basics.tsv.gz’


2022-12-05 20:55:27 (313 MB/s) - ‘title.basics.tsv.gz’ saved [164632109/164632109]



# Funkcja dodająca kolumnę z epokami

In [14]:
from pyspark.sql.functions import col, when
import pandas as pd

In [27]:
def add_epoch_column(df, periods=(1901, 1918, 1926, 1939, 1954, 1970, 1985, 1994, 2009)):
  r"""Return ``df`` with a ``period`` column derived from ``startYear``.

  Rows whose ``startYear`` is the placeholder ``\N`` are dropped first. Each
  remaining row gets the 1-based index (as a string) of the first boundary in
  ``periods`` that its start year does not exceed; years greater than every
  boundary get ``str(len(periods) + 1)``.

  Args:
    df: Spark DataFrame with a ``startYear`` column.
    periods: Inclusive upper bounds of the periods, expected in ascending
      order (the first matching boundary wins). Defaults to the boundaries
      originally hard-coded in this function.

  Returns:
    A new DataFrame with the filtered rows and an extra string column
    ``period``.

  Raises:
    ValueError: If ``periods`` is empty.
  """
  periods = list(periods)
  if not periods:
    raise ValueError("periods must contain at least one boundary year")

  df_no_N = df.filter(df.startYear != "\\N")

  # The filter above treats startYear as text; cast it explicitly so the
  # boundary comparisons are numeric by construction, not by implicit coercion.
  start_year = col('startYear').cast('int')

  # Build when(...).when(...)... from the boundaries instead of spelling out
  # one line per period.
  labelled_bounds = enumerate(periods, start=1)
  first_label, first_bound = next(labelled_bounds)
  period = when(start_year <= first_bound, str(first_label))
  for label, bound in labelled_bounds:
    period = period.when(start_year <= bound, str(label))
  period = period.otherwise(str(len(periods) + 1))

  df_periods = df_no_N.withColumn('period', period)
  return df_periods

In [105]:
# Build the labelled frame once and reuse it for both the sample and the column names.
title_basics_periods = add_epoch_column(title_basics)
# Fixed seed so the preview does not change between runs.
sample = title_basics_periods.rdd.takeSample(False, 20, seed=42)
pd.DataFrame(sample, columns=title_basics_periods.columns)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,period
0,tt9915964,tvEpisode,The Four Skulls of Jonathan Drake,The Four Skulls of Jonathan Drake,0,1968,\N,\N,"Comedy,Drama,Fantasy",6
1,tt11402320,short,Don't Look,Don't Look,0,2019,\N,6,"Horror,Short,Thriller",10
2,tt8135520,tvEpisode,7on1 DAP Gangbang with Charlotte Sartre Balls ...,7on1 DAP Gangbang with Charlotte Sartre Balls ...,1,2018,\N,\N,Adult,10
3,tt0900429,tvEpisode,Postcards from Emmaville: Part 2,Postcards from Emmaville: Part 2,0,2003,\N,29,"Biography,Documentary",9
4,tt0130833,movie,Luxury,Luxury,0,1921,\N,\N,Drama,3
5,tt0864170,tvEpisode,"Robert Young, Martha Raye, Marion Lorne, Milto...","Robert Young, Martha Raye, Marion Lorne, Milto...",0,1957,\N,\N,Comedy,6
6,tt2417412,tvEpisode,Episode dated 26 September 2012,Episode dated 26 September 2012,0,2012,\N,\N,News,10
7,tt3624518,short,This Is an Ending,This Is an Ending,0,2015,\N,\N,"Drama,Short",10
8,tt1596925,tvEpisode,Episode #1.15,Episode #1.15,0,2001,\N,\N,Comedy,9
9,tt19767774,tvEpisode,Hvem ved hvad?,Hvem ved hvad?,0,2021,\N,\N,"Drama,Family,Romance",10


## Test

In [66]:
import pytest

In [104]:
def _expected_period(start_year, boundaries):
  """Return the 1-based period number for ``start_year``.

  ``boundaries`` are inclusive upper bounds in ascending order, so the period
  is one more than the number of boundaries lying strictly below the year.
  """
  return sum(start_year > bound for bound in boundaries) + 1


def test_periods(sample_size, seed=None):
  """Check ``add_epoch_column`` against an independent computation.

  Draws up to ``sample_size`` rows (without replacement) from the labelled
  ``title_basics`` frame and compares each row's ``period`` with the value
  derived directly from its ``startYear``.

  Args:
    sample_size: Number of rows to sample.
    seed: Optional seed for the sample, so a failing run can be repeated.

  Raises:
    AssertionError: If any sampled row carries an unexpected period.
  """
  # Kept separate from add_epoch_column on purpose: the test is an independent oracle.
  boundaries = [1901, 1918, 1926, 1939, 1954, 1970, 1985, 1994, 2009]

  title_basics_epoch = add_epoch_column(title_basics)
  sample = title_basics_epoch.rdd.takeSample(False, sample_size, seed)

  # Iterate over the rows actually returned: takeSample may yield fewer than
  # sample_size rows, and every row must contribute exactly one expected value
  # so the two lists stay aligned.
  result = [int(row["period"]) for row in sample]
  exp_result = [_expected_period(int(row["startYear"]), boundaries) for row in sample]

  assert result == exp_result, 'function add_epoch_column returns wrong output'

test_periods(1000)