In [1]:
# Mount Google Drive at /content/drive (Colab-only helper); later cells read
# their input data from a path below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark2.4.5
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# install findspark
!pip install -q findspark
# clone github repo
!git clone https://github.com/PiotrMaciejKowalski/BigData2022-films
# Przeniesienie plików z BigData2022-films do katalogu nadrzędnego
!mv BigData2022-films/* .
!mv BigData2022-films/.* .
!rmdir BigData2022-films

Cloning into 'BigData2022-films'...
remote: Enumerating objects: 1202, done.[K
remote: Counting objects: 100% (516/516), done.[K
remote: Compressing objects: 100% (276/276), done.[K
remote: Total 1202 (delta 352), reused 283 (delta 240), pack-reused 686[K
Receiving objects: 100% (1202/1202), 2.72 MiB | 8.25 MiB/s, done.
Resolving deltas: 100% (714/714), done.
mv: cannot move 'BigData2022-films/colabs' to './colabs': Directory not empty
mv: cannot move 'BigData2022-films/docs' to './docs': Directory not empty
mv: cannot move 'BigData2022-films/lib' to './lib': Directory not empty
mv: cannot move 'BigData2022-films/notebooks' to './notebooks': Directory not empty
mv: cannot move 'BigData2022-films/reports' to './reports': Directory not empty
mv: cannot move 'BigData2022-films/stripped' to './stripped': Directory not empty
mv: cannot move 'BigData2022-films/tests' to './tests': Directory not empty
mv: cannot move 'BigData2022-films/tutorials' to './tutorials': Directory not empty
mv: 

In [3]:
import os

# Environment variables for the Spark session: JAVA_HOME is set to the
# OpenJDK 8 location and SPARK_HOME to the Spark 3.2.1 directory unpacked by
# the setup cell (assumes the notebook's working directory is /content).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = '/content/spark-3.2.1-bin-hadoop3.2'

# Project helpers from the cloned repository's `lib` package. This import is
# placed after the environment variables are set; `load` is not used in the
# cells visible here.
from lib.pyspark_startup import init, load

In [4]:
spark = init()

In [5]:
# Base directory on the mounted Drive (reached through a shortcut target id)
# from which the parquet input is read; must end with "/" because file names
# are appended by plain string concatenation.
path = "/content/drive/.shortcut-targets-by-id/1VcOir9FMG8LzEsUE-Q8YA79c_sV0tJwp/bigdata2022/"

In [6]:
# Read train_df.parquet from the shared folder into a Spark DataFrame.
train = spark.read.parquet(path + "train_df.parquet")

In [7]:
import pyspark.sql.functions as f
from pyspark.sql.types import FloatType 
import random
from pyspark.sql import DataFrame

In [8]:
train.show(20)

+---------+--------------------+---------------------+-------------------------+-----------------------+--------------+--------------------------+--------------------+--------------------+
|       id|               tytul|rok_wydania_produkcji|rok_zakonczenia_produkcji|dlugosc_produkcji_w_min|liczba_sezonow|liczba_wszystkich_odcinkow|        ludzie_filmu|            features|
+---------+--------------------+---------------------+-------------------------+-----------------------+--------------+--------------------------+--------------------+--------------------+
|tt0000001|          Carmencita|                 1894|                     1894|                      1|             1|                         1|[nm1588970, nm000...|(45,[1,17,19],[1....|
|tt0000003|      Pauvre Pierrot|                 1892|                     1892|                      4|             1|                         1|[nm0721526, nm177...|(45,[1,20,22,23],...|
|tt0000006|   Chinese Opium Den|                 1894| 

Cosine similarity (COS_SIM)

In [9]:
# Define a function that takes two vectors as input and returns their cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must provide ``dot(other)`` and ``norm(p)`` methods (for
    example the Spark ML vectors stored in the ``features`` column).

    The similarity is undefined when either vector has zero L2 norm; in that
    case 0.0 is returned instead of propagating NaN or raising
    ZeroDivisionError, so all-zero feature vectors simply score as
    "not similar".

    :param vec1:    first vector
    :param vec2:    second vector
    :return:        float, dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0
                    when that denominator is zero
    """
    denominator = vec1.norm(2) * vec2.norm(2)
    if denominator == 0:
        return 0.0
    return float(vec1.dot(vec2) / denominator)

Intersection over Union (IoU)


In [10]:
from typing import List, Any


def intersection_over_union(list_1: List[Any], list_2: List[Any]) -> float:
    """Return the Jaccard index (intersection over union) of two lists.

    Duplicates are ignored (the lists are treated as sets) and ``None``
    elements are discarded before comparing. Only ``None`` is discarded:
    other falsy values such as ``0`` or ``""`` are legitimate members.

    A list that is itself ``None`` (e.g. a null array column handed to a
    Spark UDF) is treated as empty.

    :param list_1:  first collection of hashable items (or None)
    :param list_2:  second collection of hashable items (or None)
    :return:        float in [0, 1]; 0.0 when either side has no usable items
    """
    items_1 = {item for item in (list_1 or ()) if item is not None}
    items_2 = {item for item in (list_2 or ()) if item is not None}

    # With an empty side the intersection is empty, so the score is 0 by
    # definition; returning early also avoids 0/0 when both are empty.
    if not items_1 or not items_2:
        return 0.0

    return len(items_1 & items_2) / len(items_1 | items_2)

Funkcja łącząca IOU i cos_sim

In [11]:
def cos_sim_and_iou_for_row(
    df: DataFrame,
    movie_id: str,
    cos_sim_col_name: str = "features",
    iou_col_name: str = "ludzie_filmu",
) -> DataFrame:
    """Score every row of ``df`` against the film identified by ``movie_id``.

    The reference film's values in ``cos_sim_col_name`` and ``iou_col_name``
    are fetched to the driver, then each row is compared with them using
    ``cosine_similarity`` and ``intersection_over_union`` respectively.

    :param df:                pyspark.sql.DataFrame with columns ``id``,
                              ``tytul``, ``cos_sim_col_name`` and
                              ``iou_col_name``
    :param movie_id:          String, value of ``id`` of the reference film
    :param cos_sim_col_name:  String, column holding the feature vectors
    :param iou_col_name:      String, column holding the lists to compare
    :return:                  pyspark.sql.DataFrame with columns ``id``,
                              ``tytul``, ``cos_similarity`` and ``IOU``
    :raises IndexError:       if no row of ``df`` has ``id == movie_id``
    """
    # Fetch both reference values in a single Spark job. `first()` returns
    # None when the filter matches nothing.
    reference_row = (
        df.filter(df.id == movie_id)
        .select(cos_sim_col_name, iou_col_name)
        .first()
    )
    if reference_row is None:
        # IndexError is kept as the failure type for an unknown id, but with
        # a message that names the actual problem.
        raise IndexError(f"movie_id {movie_id!r} not found in the input DataFrame")

    # Positional access stays correct even if both column names are equal.
    vec_cos_sim = reference_row[0]
    vec_iou = reference_row[1]

    # The closures capture the reference values, which Spark ships to the
    # executors together with the UDFs.
    def cos(x):
        return cosine_similarity(vec_cos_sim, x)

    def iou(x):
        return intersection_over_union(vec_iou, x)

    cos_udf = f.udf(cos, FloatType())
    iou_udf = f.udf(iou, FloatType())

    scored = df.withColumn("cos_similarity", cos_udf(f.col(cos_sim_col_name)))
    scored = scored.withColumn('IOU', iou_udf(f.col(iou_col_name)))

    return scored.select(["id", "tytul", "cos_similarity", "IOU"])

Film do testowania: tutaj "Toy Story". 

In [12]:
# Look up the id of the film titled "Toy Story". The first matching row is
# used; collect()[0] raises IndexError when no row has that title.
movie_id = train.filter(train.tytul == "Toy Story").select("id").collect()[0][0]

In [13]:
# Compute cosine similarity and IoU of every row of `train` against the
# reference film selected above.
df = cos_sim_and_iou_for_row(train,movie_id)

In [14]:
df.show()

+---------+--------------------+--------------+---+
|       id|               tytul|cos_similarity|IOU|
+---------+--------------------+--------------+---+
|tt0000001|          Carmencita|           0.0|0.0|
|tt0000003|      Pauvre Pierrot|     0.4472136|0.0|
|tt0000006|   Chinese Opium Den|           0.0|0.0|
|tt0000007|Corbett and Court...|           0.0|0.0|
|tt0000009|          Miss Jerry|    0.31622776|0.0|
|tt0000016|Boat Leaving the ...|           0.0|0.0|
|tt0000018|Das boxende Känguruh|           0.0|0.0|
|tt0000027|Cordeliers' Squar...|           0.0|0.0|
|tt0000029|         Baby's Meal|           0.0|0.0|
|tt0000033|  Horse Trick Riders|     0.2236068|0.0|
|tt0000034|Arrivée d'un trai...|           0.0|0.0|
|tt0000035|Watering the Flowers|     0.2581989|0.0|
|tt0000036|    Awakening of Rip|           0.0|0.0|
|tt0000041|   Bataille de neige|     0.2236068|0.0|
|tt0000062|    Danse serpentine|           0.0|0.0|
|tt0000070|Demolition of a Wall|           0.0|0.0|
|tt0000075|T

Funkcja przyjmująca df z policzonymi IOU oraz cos_sim. Parametr `a_param` to waga cos_sim; IOU otrzymuje wagę `1 - a_param`.

In [15]:
def testujemy(df, a_param: float):
    """Add a ``prediction`` column blending cosine similarity and IoU.

    prediction = a_param * cos_similarity + (1 - a_param) * IOU

    The blend is a native Spark column expression rather than a Python UDF,
    so rows are not serialized to a Python worker and a null in either input
    yields a null prediction instead of failing the job.

    :param df:       pyspark.sql.DataFrame with numeric columns
                     ``cos_similarity`` and ``IOU``
    :param a_param:  float, weight of ``cos_similarity``; ``IOU`` gets
                     ``1 - a_param``
    :return:         pyspark.sql.DataFrame, ``df`` plus a FloatType
                     ``prediction`` column
    """
    weighted_sum = df["cos_similarity"] * a_param + df["IOU"] * (1 - a_param)

    # Multiplying by a Python float promotes the expression to double; cast
    # back so the column keeps the FloatType it had with the UDF.
    return df.withColumn("prediction", weighted_sum.cast(FloatType()))

In [23]:
# Blend the two scores with equal weights (a_param = 0.5).
df_prediction= testujemy(df, 0.5)

In [24]:
df_prediction.show()

+---------+--------------------+--------------+---+----------+
|       id|               tytul|cos_similarity|IOU|prediction|
+---------+--------------------+--------------+---+----------+
|tt0000001|          Carmencita|           0.0|0.0|       0.0|
|tt0000003|      Pauvre Pierrot|     0.4472136|0.0| 0.2236068|
|tt0000006|   Chinese Opium Den|           0.0|0.0|       0.0|
|tt0000007|Corbett and Court...|           0.0|0.0|       0.0|
|tt0000009|          Miss Jerry|    0.31622776|0.0|0.15811388|
|tt0000016|Boat Leaving the ...|           0.0|0.0|       0.0|
|tt0000018|Das boxende Känguruh|           0.0|0.0|       0.0|
|tt0000027|Cordeliers' Squar...|           0.0|0.0|       0.0|
|tt0000029|         Baby's Meal|           0.0|0.0|       0.0|
|tt0000033|  Horse Trick Riders|     0.2236068|0.0| 0.1118034|
|tt0000034|Arrivée d'un trai...|           0.0|0.0|       0.0|
|tt0000035|Watering the Flowers|     0.2581989|0.0|0.12909944|
|tt0000036|    Awakening of Rip|           0.0|0.0|    

In [25]:
from pyspark.sql.functions import desc

In [26]:
# Order rows by the blended score, highest first, so the films closest to the
# reference film come out on top.
df_prediction = df_prediction.sort(desc("prediction"))

In [27]:
df_prediction.show(10)

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt0114709|           Toy Story|           1.0|       1.0|       1.0|
|tt0120363|         Toy Story 2|           1.0|0.33333334| 0.6666667|
|tt0120623|        A Bug's Life|           1.0| 0.1764706| 0.5882353|
|tt0317219|                Cars|           1.0|0.11111111| 0.5555556|
|tt0338348|   The Polar Express|           1.0|0.05263158| 0.5263158|
|tt0356634|            Garfield|           1.0|0.05263158| 0.5263158|
|tt0455499|Garfield: A Tail ...|           1.0|0.05263158| 0.5263158|
|tt1049413|                  Up|           1.0|0.05263158| 0.5263158|
|tt0429589|       The Ant Bully|           1.0|0.05263158| 0.5263158|
|tt1979376|         Toy Story 4|           0.8|      0.25|     0.525|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [None]:
from lib.model import predict

In [None]:
df_pred = predict(df,"Toy Story")

In [None]:
df_pred.show(20)

In [None]:
# import time

# # get the start time
# st = time.time()

# # main program
# # time a single IOU call
# kam = IOU("tt0000006", df_dom)

# # get the end time
# et = time.time()

# # get the execution time
# elapsed_time = et - st
# print('Execution time:', elapsed_time, 'seconds')