# Spark Setup and Data Load

In [8]:
# Mount Google Drive into the Colab filesystem so later cells can read and
# write files under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark2.4.5
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# install findspark
!pip install -q findspark
# clone github repo
!git clone https://github.com/PiotrMaciejKowalski/BigData2022-films
# Przeniesienie plików z BigData2022-films do katalogu nadrzędnego
!mv BigData2022-films/* .
!mv BigData2022-films/.* .
!rmdir BigData2022-films

Cloning into 'BigData2022-films'...
remote: Enumerating objects: 800, done.[K
remote: Counting objects: 100% (415/415), done.[K
remote: Compressing objects: 100% (235/235), done.[K
remote: Total 800 (delta 270), reused 244 (delta 179), pack-reused 385[K
Receiving objects: 100% (800/800), 2.91 MiB | 13.65 MiB/s, done.
Resolving deltas: 100% (446/446), done.
mv: cannot move 'BigData2022-films/colabs' to './colabs': Directory not empty
mv: cannot move 'BigData2022-films/docs' to './docs': Directory not empty
mv: cannot move 'BigData2022-films/lib' to './lib': Directory not empty
mv: cannot move 'BigData2022-films/notebooks' to './notebooks': Directory not empty
mv: cannot move 'BigData2022-films/reports' to './reports': Directory not empty
mv: cannot move 'BigData2022-films/stripped' to './stripped': Directory not empty
mv: cannot move 'BigData2022-films/tests' to './tests': Directory not empty
mv: cannot move 'BigData2022-films/tutorials' to './tutorials': Directory not empty
mv: can

In [10]:
import os

# Environment variables our Spark session needs: the Java 8 JDK location and
# the directory of the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
    }
)

from lib.pyspark_startup import init, load

In [11]:
# Create the `spark` handle with the project's init() helper (imported from
# lib.pyspark_startup); later cells use it as the entry point for reading data.
# NOTE(review): presumably init() relies on JAVA_HOME / SPARK_HOME set above — confirm.
spark = init()

In [12]:
# Load the data from Google Drive. `path` is the shared project folder and is
# reused further down when the split datasets are written back.
path = "/content/drive/.shortcut-targets-by-id/1VcOir9FMG8LzEsUE-Q8YA79c_sV0tJwp/bigdata2022/"

clean_df_location = path + "clean_df.parquet"
df = spark.read.parquet(clean_df_location)

# Preprocessing - zastosowanie stworzonych funkcji

In [14]:
from lib.pyspark_preprocesing import one_hot_encoding

# Columns handed to the project's one-hot encoding helper.
categorical_columns = ["rodzaj_produkcji", "gatunek"]
df = one_hot_encoding(df, categorical_columns)

In [16]:
from lib.feature_creators import add_epoch_column

# Run the DataFrame through the project's add_epoch_column helper and rebind `df`.
# NOTE(review): presumably this adds the `epoka` column visible in the preview below — verify in lib.feature_creators.
df = add_epoch_column(df)

In [17]:
# Print a text preview of the transformed DataFrame to eyeball the columns
# produced by the preprocessing steps above.
df.show()

+---------+--------------------+-----------------+---------------------+-------------------------+-----------------------+--------------+--------------------------+---------+----+---------+---------+---------+---------+----+----+----+----+--------------------+------------------+-----+
|       id|               tytul|czy_dla_doroslych|rok_wydania_produkcji|rok_zakonczenia_produkcji|dlugosc_produkcji_w_min|liczba_sezonow|liczba_wszystkich_odcinkow|        1|  10|        2|        3|        4|        5|   6|   7|   8|   9|rodzaj_produkcji_ohe|       gatunek_ohe|epoka|
+---------+--------------------+-----------------+---------------------+-------------------------+-----------------------+--------------+--------------------------+---------+----+---------+---------+---------+---------+----+----+----+----+--------------------+------------------+-----+
|tt0000001|          Carmencita|                0|                 1894|                     1894|                      1|           1.0|     

# Podział na zbiór uczący, walidacyjny i testowy

In [None]:
# Random split into training, validation and test sets with weights
# 0.70 / 0.2 / 0.1; the seed is fixed so the split can be reproduced.
SPLIT_WEIGHTS = [0.70, 0.2, 0.1]
SPLIT_SEED = 123
train, valid, test = df.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

# Zapis na dysku

In [None]:
# Write each split as a parquet dataset in the shared Drive folder, replacing
# any previous output ("overwrite" mode). Order: train, valid, test.
for split_df, file_name in (
    (train, "train_df.parquet"),
    (valid, "valid_df.parquet"),
    (test, "test_df.parquet"),
):
    split_df.write.mode("overwrite").parquet(path + file_name)