# Spark Setup and Data Load

In [1]:
# Mount Google Drive at /content/drive; later cells read the input parquet
# from, and write the train/valid/test splits to, a path under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark2.4.5
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# install findspark
!pip install -q findspark
# clone github repo
!git clone https://github.com/PiotrMaciejKowalski/BigData2022-films
# Przeniesienie plików z BigData2022-films do katalogu nadrzędnego
!mv BigData2022-films/* .
!mv BigData2022-films/.* .
!rmdir BigData2022-films

Cloning into 'BigData2022-films'...
remote: Enumerating objects: 812, done.[K
remote: Counting objects: 100% (424/424), done.[K
remote: Compressing objects: 100% (244/244), done.[K
remote: Total 812 (delta 274), reused 246 (delta 179), pack-reused 388[K
Receiving objects: 100% (812/812), 2.91 MiB | 7.57 MiB/s, done.
Resolving deltas: 100% (453/453), done.
mv: cannot move 'BigData2022-films/.' to './.': Device or resource busy
mv: cannot move 'BigData2022-films/..' to './..': Device or resource busy


In [3]:
import os

# Environment variables needed for the Spark session to work.
# JAVA_HOME is set to a Java 8 OpenJDK location (presumably where the apt
# package from the setup cell installs — confirm); SPARK_HOME is set to the
# directory name produced by unpacking spark-3.2.1-bin-hadoop3.2.tgz, under
# /content (assumes the setup cell ran with /content as working directory).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = '/content/spark-3.2.1-bin-hadoop3.2'

# Project helpers; `lib` is presumably the package moved out of the cloned
# repository by the setup cell. Imported after the variables above are set.
from lib.pyspark_startup import init, load

In [4]:
# Start Spark through the project's init() helper; the returned object is used
# as the session by the following cells (spark.read.parquet).
spark = init()

In [31]:
# Load the data from Google Drive.
# `path` is the shared project folder; it is reused when the splits are saved.
path = "/content/drive/.shortcut-targets-by-id/1VcOir9FMG8LzEsUE-Q8YA79c_sV0tJwp/bigdata2022/"

clean_df_location = f"{path}clean_df.parquet"
df = spark.read.parquet(clean_df_location)

# Preprocessing - przyłożenie stworzonych funkcji

In [32]:
from lib.pyspark_preprocesing import one_hot_encoding

# Apply the project's one-hot encoding helper to the `rodzaj_produkcji` and
# `gatunek` columns. The df.show() output later in the notebook lists
# `rodzaj_produkcji_ohe` and `gatunek_ohe` columns — presumably produced by
# this call (confirm in lib.pyspark_preprocesing).
df = one_hot_encoding(df, ["rodzaj_produkcji","gatunek"])

In [33]:
from lib.feature_creators import add_epoch_column

# Apply the project's epoch feature helper. The df.show() output later in the
# notebook lists an `epoka` column — presumably the one added here (confirm in
# lib.feature_creators).
df = add_epoch_column(df)

In [34]:
from lib.film_people_list import people_film_merge_columns

# Merge film-people information into the DataFrame, passing the `id` column
# and add_column=True. The df.show() output that follows lists a
# `ludzie_filmu` column holding lists of identifiers — presumably the column
# added by this call (confirm in lib.film_people_list).
# The DataFrame is deliberately not collected to the driver first: nothing in
# this notebook uses the collected rows, and collect() loads every row into
# driver memory.
df = people_film_merge_columns(df, df['id'], add_column=True)

In [35]:
# Print a preview of the preprocessed DataFrame to check the resulting columns.
df.show()

+---------+--------------------+-----------------+---------------------+-------------------------+-----------------------+--------------+--------------------------+--------------------+------------------+-----+--------------------+
|       id|               tytul|czy_dla_doroslych|rok_wydania_produkcji|rok_zakonczenia_produkcji|dlugosc_produkcji_w_min|liczba_sezonow|liczba_wszystkich_odcinkow|rodzaj_produkcji_ohe|       gatunek_ohe|epoka|        ludzie_filmu|
+---------+--------------------+-----------------+---------------------+-------------------------+-----------------------+--------------+--------------------------+--------------------+------------------+-----+--------------------+
|tt0000001|          Carmencita|                0|                 1894|                     1894|                      1|           1.0|                       1.0|       (7,[0],[1.0])|  (1886,[4],[1.0])|    1|[nm1588970, nm000...|
|tt0000003|      Pauvre Pierrot|                0|                 1892|

# Podział na zbiór uczący, walidacyjny i testowy

In [None]:
# Randomly split the rows into training, validation and test sets using the
# weights 0.70 / 0.2 / 0.1, with a fixed seed (123) for the random sampling.
train, valid, test = df.randomSplit([0.70, 0.2, 0.1], seed=123)

# Zapis na dysku

In [None]:
# Save each split as a parquet dataset under `path`, overwriting any output
# that already exists at that location.
split_outputs = (
    (train, "train_df.parquet"),
    (valid, "valid_df.parquet"),
    (test, "test_df.parquet"),
)
for split_df, file_name in split_outputs:
    split_df.write.mode("overwrite").parquet(path + file_name)