# Spark Setup and Data Load

In [6]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset path used by the
# data-loading cell lives underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark2.4.5
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# install findspark
!pip install -q findspark
# clone github repo
!git clone https://github.com/PiotrMaciejKowalski/BigData2022-films
# Przeniesienie plików z BigData2022-films do katalogu nadrzędnego
!mv BigData2022-films/* .
!mv BigData2022-films/.* .
!rmdir BigData2022-films

Cloning into 'BigData2022-films'...
remote: Enumerating objects: 1189, done.[K
remote: Counting objects: 100% (509/509), done.[K
remote: Compressing objects: 100% (264/264), done.[K
remote: Total 1189 (delta 353), reused 287 (delta 245), pack-reused 680[K
Receiving objects: 100% (1189/1189), 2.72 MiB | 4.95 MiB/s, done.
Resolving deltas: 100% (709/709), done.
mv: cannot move 'BigData2022-films/.' to './.': Device or resource busy
mv: cannot move 'BigData2022-films/..' to './..': Device or resource busy


In [3]:
import os

# Export the Java and Spark install locations in one step so the Spark session
# created by the project helper can find them.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
    }
)

# Project helpers; imported after the environment is prepared, as in the
# original cell order.
from lib.pyspark_startup import init, load

In [4]:
# Start Spark through the project's startup helper. The returned object is used
# by the following cells as the session handle (e.g. `spark.read.parquet`).
spark = init()

In [40]:
# Load the data from Google Drive.
# `path` keeps its trailing slash: later cells build file locations by plain
# string concatenation onto it.
path = "/content/drive/.shortcut-targets-by-id/1VcOir9FMG8LzEsUE-Q8YA79c_sV0tJwp/bigdata2022/"

clean_df_location = f"{path}clean_df.parquet"
df = spark.read.parquet(clean_df_location)

# Preprocessing - przyłożenie stworzonych funkcji

In [41]:
from pyspark.sql.types import IntegerType

# Replace `czy_dla_doroslych` with an integer-typed version of itself; the
# column is later handed to merge_dense_vectors together with the vector columns.
adult_flag_as_int = df["czy_dla_doroslych"].cast(IntegerType())
df = df.withColumn("czy_dla_doroslych", adult_flag_as_int)

In [42]:
from lib.feature_creators import add_epoch_column

# Run the project's add_epoch_column helper over the frame and keep its result.
# NOTE(review): presumably this adds the `epoka_rok_wydania_produkcji` column
# that the one-hot encoding step consumes — confirm in lib.feature_creators.
df = add_epoch_column(df)

In [43]:
from lib.film_people_list import people_film_merge_columns

# Hand the frame and its `id` column to the project's people/film merge helper,
# asking it to add its result as a column (add_column=True).
id_column = df["id"]
df = people_film_merge_columns(df, id_column, add_column=True)

In [44]:
from lib.pyspark_preprocesing import one_hot_encoding, count_vectorizer

# Columns passed to the project's one-hot encoding helper.
one_hot_columns = ["rodzaj_produkcji", "epoka_rok_wydania_produkcji"]

df = one_hot_encoding(df, one_hot_columns)
# The `gatunek` column goes through the project's count-vectorizer helper instead.
df = count_vectorizer(df, "gatunek")

In [45]:
from lib.scaler_columns_function import scaler_columns

# Numeric columns passed to the project's scaling helper.
columns_to_scale = [
    "dlugosc_produkcji_w_min",
    "liczba_sezonow",
    "liczba_wszystkich_odcinkow",
]
df = scaler_columns(df, columns_to_scale)

In [47]:
from lib.preprocessing_function_similarity import merge_dense_vectors

# Feature columns handed to merge_dense_vectors, in the original order.
feature_columns = [
    "czy_dla_doroslych",
    "rodzaj_produkcji_ohe",
    "epoka_rok_wydania_produkcji_ohe",
    "gatunek_vec",
    "dlugosc_produkcji_w_min_scaled",
    "liczba_sezonow_scaled",
    "liczba_wszystkich_odcinkow_scaled",
]

df = merge_dense_vectors(df, feature_columns)

# Podział na zbiór uczący, walidacyjny i testowy

In [None]:
# Persist the fully preprocessed frame before splitting. Each split is later
# written by its own Spark action, so without persistence the whole upstream
# lineage is recomputed once per split; if any upstream step yields rows in a
# non-deterministic order, the supposedly disjoint splits can overlap or lose
# rows. Caching makes all three splits sample from the same materialised data.
df = df.cache()

# 70% / 20% / 10% train / validation / test, with a fixed seed for reproducibility.
train, valid, test = df.randomSplit([0.70, 0.2, 0.1], seed=123)

# Zapis na dysku

In [None]:
# Write each split as Parquet next to the source data, replacing any previous
# output under the same name.
split_outputs = (
    ("train_df.parquet", train),
    ("valid_df.parquet", valid),
    ("test_df.parquet", test),
)
for output_name, split_df in split_outputs:
    split_df.write.mode("overwrite").parquet(path + output_name)