<a href="https://colab.research.google.com/github/PiotrMaciejKowalski/BigData2022-films/blob/analiza_b%C5%82%C4%99d%C3%B3w/colabs/analiza_b%C5%82%C4%99d%C3%B3w.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Wczytywanie danych, ustawienia początkowe

In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive; the data path used later in this
# notebook points inside that mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark2.4.5
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# install findspark
!pip install -q findspark
# clone github repo
!git clone https://github.com/PiotrMaciejKowalski/BigData2022-films
# Przeniesienie plików z BigData2022-films do katalogu nadrzędnego
!mv BigData2022-films/* .
!mv BigData2022-films/.* .
!rmdir BigData2022-films

Cloning into 'BigData2022-films'...
remote: Enumerating objects: 1258, done.[K
remote: Counting objects: 100% (572/572), done.[K
remote: Compressing objects: 100% (304/304), done.[K
remote: Total 1258 (delta 392), reused 328 (delta 267), pack-reused 686[K
Receiving objects: 100% (1258/1258), 2.74 MiB | 12.64 MiB/s, done.
Resolving deltas: 100% (754/754), done.
mv: cannot move 'BigData2022-films/.' to './.': Device or resource busy
mv: cannot move 'BigData2022-films/..' to './..': Device or resource busy


In [3]:
import os

# Set the environment variables the Spark session needs: JAVA_HOME points at
# the Java 8 JDK and SPARK_HOME at the unpacked Spark 3.2.1 distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = '/content/spark-3.2.1-bin-hadoop3.2'

from lib.pyspark_startup import init, load

In [4]:
# Create the session object through the project helper lib.pyspark_startup.init
# (presumably a SparkSession -- it is used below as `spark.read.parquet`).
spark = init()

In [5]:
# Folder on the mounted Drive that holds the parquet files. The trailing slash
# matters: file names are appended to this string directly.
path = "/content/drive/.shortcut-targets-by-id/1VcOir9FMG8LzEsUE-Q8YA79c_sV0tJwp/bigdata2022/"

In [26]:
# Read the three dataset splits (test, train, valid) from the Drive folder.
test, train, valid = (
    spark.read.parquet(path + file_name)
    for file_name in ("test_df.parquet", "train_df.parquet", "valid_df.parquet")
)

In [38]:
# Stack the three splits back into one DataFrame so a movie can be looked up
# regardless of the split it landed in.
# NOTE(review): DataFrame.union pairs columns by position, not by name --
# confirm the three parquet files share the same column order.
df_temp = test.union(train)
df_clean = df_temp.union(valid)

In [8]:
import pyspark.sql.functions as f
from pyspark.sql.types import FloatType 
import random
from pyspark.sql import DataFrame

## Funkcje

COS_SIM

In [10]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must provide ``dot(other)`` and ``norm(p)``, the only
    methods called here (presumably pyspark.ml.linalg vectors -- confirm
    against the column this is applied to).

    A zero vector has no direction, so when either norm is zero the
    similarity is reported as 0.0 instead of dividing by zero.

    :param vec1:    first vector
    :param vec2:    second vector
    :return:        float
    """
    denominator = vec1.norm(2) * vec2.norm(2)
    if denominator == 0:
        return 0.0
    return float(vec1.dot(vec2) / denominator)

IoU


In [11]:
from typing import List, Any


def intersection_over_union(list_1: List[Any], list_2: List[Any]) -> float:
    """Return the intersection-over-union (Jaccard index) of two lists.

    Duplicates are ignored and ``None`` entries are discarded before the
    comparison. Only ``None`` is discarded: other falsy values such as ``0``
    or ``""`` are legitimate elements. A ``None`` argument is treated as an
    empty list.

    :param list_1:  first collection of hashable items (or None)
    :param list_2:  second collection of hashable items (or None)
    :return:        float in [0, 1]; 0.0 when either side has no elements
    """
    items_1 = {item for item in (() if list_1 is None else list_1) if item is not None}
    items_2 = {item for item in (() if list_2 is None else list_2) if item is not None}

    if not items_1 or not items_2:
        return 0.0

    return len(items_1 & items_2) / len(items_1 | items_2)

Funkcja łącząca IOU i cos_sim

In [12]:
def cos_sim_and_iou_for_row(
    df: DataFrame,
    movie_id: str,
    cos_sim_col_name: str = "features",
    iou_col_name: str = "ludzie_filmu",
) -> DataFrame:
    """Score every row of ``df`` against the row whose ``id`` equals ``movie_id``.

    Two columns are added: ``cos_similarity`` (cosine similarity between the
    reference row's ``cos_sim_col_name`` value and each row's value) and
    ``IOU`` (intersection over union between the reference row's
    ``iou_col_name`` value and each row's value).

    :param df:                  pyspark.sql.DataFrame with at least the columns
                                ``id``, ``tytul``, ``cos_sim_col_name`` and
                                ``iou_col_name``
    :param movie_id:            String, id of the reference movie
    :param cos_sim_col_name:    String, column fed to ``cosine_similarity``
    :param iou_col_name:        String, column fed to ``intersection_over_union``
    :return:                    pyspark.sql.DataFrame with the columns
                                ``id``, ``tytul``, ``cos_similarity``, ``IOU``
    :raises IndexError:         if no row of ``df`` has ``id == movie_id``
    """
    # Fetch both reference values with a single Spark job instead of running
    # the same filter twice.
    reference_row = (
        df.filter(df["id"] == movie_id)
        .select(cos_sim_col_name, iou_col_name)
        .first()
    )
    if reference_row is None:
        # IndexError keeps the exception type of the former `collect()[0]`
        # lookup, but now says which id was missing.
        raise IndexError(f"movie_id {movie_id!r} was not found in column 'id'")

    vec_cos_sim = reference_row[0]
    vec_iou = reference_row[1]

    # The reference values are captured in closures and shipped to the
    # executors together with the UDFs.
    def cos(x):
        return cosine_similarity(vec_cos_sim, x)

    def iou(x):
        return intersection_over_union(vec_iou, x)

    cos_udf = f.udf(cos, FloatType())
    iou_udf = f.udf(iou, FloatType())

    df = df.withColumn("cos_similarity", cos_udf(f.col(cos_sim_col_name)))
    df = df.withColumn('IOU', iou_udf(f.col(iou_col_name)))

    return df.select(["id","tytul", "cos_similarity","IOU"])

Rank_function

In [13]:
from lib.ranking_function import ranking_list

Połączone cos_sim i IoU

In [14]:
def testujemy(df, a_param: float):
    """Add a ``prediction`` column blending ``cos_similarity`` and ``IOU``.

    prediction = a_param * cos_similarity + (1 - a_param) * IOU

    The blend uses native Spark column arithmetic rather than a Python UDF,
    so rows are not serialised to a Python worker and a null input yields a
    null prediction instead of a failing task.

    :param df:          pyspark.sql.DataFrame with numeric columns
                        ``cos_similarity`` and ``IOU``
    :param a_param:     float, weight of ``cos_similarity``; ``IOU`` gets
                        ``1 - a_param``
    :return:            pyspark.sql.DataFrame with the extra FloatType column
                        ``prediction``
    """
    cos_weight = f.lit(float(a_param))
    iou_weight = f.lit(1.0 - float(a_param))

    blended = cos_weight * df["cos_similarity"] + iou_weight * df["IOU"]

    # Cast keeps the column type the former FloatType UDF produced.
    return df.withColumn("prediction", blended.cast(FloatType()))

In [66]:
from pyspark.sql.functions import desc

def najbardziej_podobne (id, ile, df1, a_param: float = 0.5):
    """Show the ``ile`` rows of ``df1`` with the highest ``prediction`` for a movie.

    Each row is scored against the movie ``id`` with
    ``cos_sim_and_iou_for_row``, the two scores are blended by ``testujemy``
    and the result is displayed in descending order of ``prediction``.
    Nothing is returned; the table is printed with ``DataFrame.show``.

    :param id:          String, id of the reference movie
    :param ile:         int, number of rows to display
    :param df1:         pyspark.sql.DataFrame to search in
    :param a_param:     float, weight of the cosine similarity in the blend;
                        the default 0.5 weighs both scores equally
    """
    scored = cos_sim_and_iou_for_row(df1, id)
    ranked = testujemy(scored, a_param).sort(desc("prediction"))
    ranked.show(ile)


## Testowanie

In [67]:
def porownanie(id,liczba,df1):
  """Print two top-``liczba`` lists for the movie ``id``, one after the other.

  First the cosine-similarity / IoU table from ``najbardziej_podobne``, then
  the highest-scoring rows returned by ``ranking_list`` (presumably a pandas
  DataFrame with a ``score`` column -- it is sorted with ``sort_values``).

  :param id:        String, id of the reference movie
  :param liczba:    int, number of rows shown in each list
  :param df1:       pyspark.sql.DataFrame to search in
  """
  najbardziej_podobne(id, liczba, df1)
  ranking = ranking_list(df1, id)
  best_ranked = ranking.sort_values(by='score', ascending=False).head(liczba)
  print(best_ranked)

In [42]:
porownanie("tt0068646",10,df_clean)  # The Godfather

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt0068646|       The Godfather|           1.0|       1.0|       1.0|
|tt0071562|The Godfather: Pa...|           1.0|0.42857143|0.71428573|
|tt0099674|The Godfather: Pa...|          0.75|0.33333334| 0.5416667|
|tt0086442|       A Time to Die|           1.0|0.05263158| 0.5263158|
|tt0086250|            Scarface|           1.0|0.05263158| 0.5263158|
|tt0086066|       The Outsiders|           1.0|0.05263158| 0.5263158|
|tt0071532|         The Gambler|           1.0|0.05263158| 0.5263158|
|tt0087089|     The Cotton Club|     0.8944272|0.11111111| 0.5027692|
|tt0073845|L'uomo che sfidò ...|           1.0|       0.0|       0.5|
|tt0068526|       Les ebranlées|           1.0|       0.0|       0.5|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [52]:
porownanie("tt0111161",10,df_clean)  # The Shawshank Redemption

+---------+--------------------+--------------+-----------+----------+
|       id|               tytul|cos_similarity|        IOU|prediction|
+---------+--------------------+--------------+-----------+----------+
|tt0111161|The Shawshank Red...|           1.0|        1.0|       1.0|
|tt0093354|    The Kitchen Toto|           1.0|0.055555556| 0.5277778|
|tt0095827|    Pascali's Island|           1.0| 0.05263158| 0.5263158|
|tt0105323|    Scent of a Woman|           1.0| 0.05263158| 0.5263158|
|tt0101921|Fried Green Tomatoes|           1.0| 0.05263158| 0.5263158|
|tt0111667|             The War|           1.0| 0.05263158| 0.5263158|
|tt0105107|        Passion Fish|           1.0| 0.05263158| 0.5263158|
|tt0106464|              Bopha!|           1.0| 0.05263158| 0.5263158|
|tt0094884|     Clean and Sober|           1.0| 0.05263158| 0.5263158|
|tt0097722|          Lean on Me|           1.0| 0.05263158| 0.5263158|
+---------+--------------------+--------------+-----------+----------+
only s

In [53]:
porownanie("tt1645170",10,df_clean)  # The Dictator

+----------+--------------------+--------------+----------+----------+
|        id|               tytul|cos_similarity|       IOU|prediction|
+----------+--------------------+--------------+----------+----------+
| tt1645170|        The Dictator|           1.0|       1.0|       1.0|
|tt13143964|Borat Subsequent ...|           1.0|0.11111111| 0.5555556|
| tt1477837|        Cedar Rapids|           1.0|0.05263158| 0.5263158|
|tt16302202|          The Estate|           1.0|0.05263158| 0.5263158|
|tt10519662|       Futro z misia|           1.0|       0.0|       0.5|
|tt10143114|Premier de la classe|           1.0|       0.0|       0.5|
|tt10940736|15 Ways to Kill Y...|           1.0|       0.0|       0.5|
| tt0475360|   Persona Au Gratin|           1.0|       0.0|       0.5|
|tt10472102|Karakomik Filmler...|           1.0|       0.0|       0.5|
|tt10081172|  15 Lakh Kado Aauga|           1.0|       0.0|       0.5|
+----------+--------------------+--------------+----------+----------+
only s

In [44]:
porownanie("tt4154756",10,df_clean)  # Avengers: Infinity War

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt4154756|Avengers: Infinit...|           1.0|       1.0|       1.0|
|tt4154796|   Avengers: Endgame|           0.8|       1.0|       0.9|
|tt3498820|Captain America: ...|           1.0|0.53846157| 0.7692308|
|tt2395427|Avengers: Age of ...|           1.0|0.42857143|0.71428573|
|tt1843866|Captain America: ...|           1.0|0.42857143|0.71428573|
|tt0458339|Captain America: ...|           1.0|      0.25|     0.625|
|tt1300854|          Iron Man 3|           1.0| 0.1764706| 0.5882353|
|tt1228705|          Iron Man 2|           1.0| 0.1764706| 0.5882353|
|tt1502712|      Fantastic Four|           1.0|0.11111111| 0.5555556|
|tt0848228|        The Avengers|           1.0|0.11111111| 0.5555556|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [45]:
porownanie("tt0137523",10,df_clean)  # Fight Club

+---------+------------------+--------------+-----------+----------+
|       id|             tytul|cos_similarity|        IOU|prediction|
+---------+------------------+--------------+-----------+----------+
|tt0137523|        Fight Club|           1.0|        1.0|       1.0|
|tt0114782|  Under the Bridge|           1.0|0.055555556| 0.5277778|
|tt0449467|             Babel|           1.0| 0.05263158| 0.5263158|
|tt0307901|         25th Hour|           1.0| 0.05263158| 0.5263158|
|tt0332452|              Troy|           1.0| 0.05263158| 0.5263158|
|tt0115468|          Adosados|           1.0|        0.0|       0.5|
|tt0114642|El techo del mundo|           1.0|        0.0|       0.5|
|tt0114612| The Night Is Dark|           1.0|        0.0|       0.5|
|tt0116720|   Joyeux Calvaire|           1.0|        0.0|       0.5|
|tt0112941|          Eldorado|           1.0|        0.0|       0.5|
+---------+------------------+--------------+-----------+----------+
only showing top 10 rows

        

In [55]:
porownanie("tt0088247",30,df_clean)  # The Terminator (top 30 instead of 10)

+----------+--------------------+--------------+----------+----------+
|        id|               tytul|cos_similarity|       IOU|prediction|
+----------+--------------------+--------------+----------+----------+
| tt0088247|      The Terminator|           1.0|       1.0|       1.0|
| tt0103064|Terminator 2: Jud...|          0.75|0.42857143|0.58928573|
|tt15739442|The Terminator: '...|     0.4472136| 0.6666667|0.55694014|
| tt0077479|    Dynamite Johnson|           1.0|       0.0|       0.5|
| tt0310293|Çilgin kiz ve üç ...|           1.0|       0.0|       0.5|
| tt0237923|Wild Sex of the C...|           1.0|       0.0|       0.5|
| tt0185027| The Deathless Devil|           1.0|       0.0|       0.5|
| tt0184983|            Uçan Kiz|           1.0|       0.0|       0.5|
| tt0091110|       Ghost Warrior|           1.0|       0.0|       0.5|
| tt0085125|2019: After the F...|           1.0|       0.0|       0.5|
| tt0087130|            DEFCON-4|           1.0|       0.0|       0.5|
| tt00

In [46]:
porownanie("tt0031381",10,df_clean)  # Gone with the Wind

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt0031381|  Gone with the Wind|           1.0|       1.0|       1.0|
|tt0023185|The Man from Yest...|           1.0|0.05263158| 0.5263158|
|tt0018666|       The Awakening|           1.0|0.05263158| 0.5263158|
|tt0022879|  A Farewell to Arms|           1.0|0.05263158| 0.5263158|
|tt0021525|    The Virtuous Sin|           1.0|0.05263158| 0.5263158|
|tt0024770|    The White Sister|     0.8944272|0.11111111| 0.5027692|
|tt0030848|          Test Pilot|     0.8944272|0.11111111| 0.5027692|
|tt0028944|      The Good Earth|     0.8944272|0.11111111| 0.5027692|
|tt0030099|     Eli Sjursdotter|           1.0|       0.0|       0.5|
|tt0026095|          La bandera|           1.0|       0.0|       0.5|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [47]:
porownanie("tt0114709",10,df_clean)  # Toy Story

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt0114709|           Toy Story|           1.0|       1.0|       1.0|
|tt0120363|         Toy Story 2|           1.0|0.33333334| 0.6666667|
|tt0120623|        A Bug's Life|           1.0| 0.1764706| 0.5882353|
|tt0317219|                Cars|           1.0|0.11111111| 0.5555556|
|tt0198781|      Monsters, Inc.|           1.0|0.05263158| 0.5263158|
|tt0356634|            Garfield|           1.0|0.05263158| 0.5263158|
|tt0455499|Garfield: A Tail ...|           1.0|0.05263158| 0.5263158|
|tt0338348|   The Polar Express|           1.0|0.05263158| 0.5263158|
|tt0429589|       The Ant Bully|           1.0|0.05263158| 0.5263158|
|tt1049413|                  Up|           1.0|0.05263158| 0.5263158|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [48]:
porownanie("tt0081505",10,df_clean)  # The Shining

+----------+--------------------+--------------+---+----------+
|        id|               tytul|cos_similarity|IOU|prediction|
+----------+--------------------+--------------+---+----------+
| tt0081505|         The Shining|           1.0|1.0|       1.0|
| tt0084899|           White Dog|           1.0|0.0|       0.5|
| tt0070396|Al otro lado del ...|           1.0|0.0|       0.5|
| tt0067178|They Have Changed...|           1.0|0.0|       0.5|
| tt9619984|             Bonikka|           1.0|0.0|       0.5|
| tt0095012|           The Devil|           1.0|0.0|       0.5|
|tt11383980|House of Seven Be...|           1.0|0.0|       0.5|
| tt0071872|The Dead, the Dev...|           1.0|0.0|       0.5|
| tt0078132|The Night Before ...|           1.0|0.0|       0.5|
| tt0085185|Atrapados en el m...|           1.0|0.0|       0.5|
+----------+--------------------+--------------+---+----------+
only showing top 10 rows

                id  score
69365    tt0081505   17.0
195397   tt0285531    7.0


In [49]:
porownanie("tt0111161",10,df_clean)  # The Shawshank Redemption (this id is also queried in an earlier cell)

+---------+--------------------+--------------+-----------+----------+
|       id|               tytul|cos_similarity|        IOU|prediction|
+---------+--------------------+--------------+-----------+----------+
|tt0111161|The Shawshank Red...|           1.0|        1.0|       1.0|
|tt0093354|    The Kitchen Toto|           1.0|0.055555556| 0.5277778|
|tt0095827|    Pascali's Island|           1.0| 0.05263158| 0.5263158|
|tt0105323|    Scent of a Woman|           1.0| 0.05263158| 0.5263158|
|tt0101921|Fried Green Tomatoes|           1.0| 0.05263158| 0.5263158|
|tt0111667|             The War|           1.0| 0.05263158| 0.5263158|
|tt0105107|        Passion Fish|           1.0| 0.05263158| 0.5263158|
|tt0106464|              Bopha!|           1.0| 0.05263158| 0.5263158|
|tt0094884|     Clean and Sober|           1.0| 0.05263158| 0.5263158|
|tt0097722|          Lean on Me|           1.0| 0.05263158| 0.5263158|
+---------+--------------------+--------------+-----------+----------+
only s

In [50]:
porownanie("tt0468569",10,df_clean)  # The Dark Knight

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt0468569|     The Dark Knight|           1.0|       1.0|       1.0|
|tt0372784|       Batman Begins|           1.0|0.53846157| 0.7692308|
|tt1345836|The Dark Knight R...|     0.6708204|0.42857143| 0.5496959|
|tt0963178|   The International|           1.0|0.05263158| 0.5263158|
|tt0433387|         Harsh Times|           1.0|0.05263158| 0.5263158|
|tt0119099|              Fallen|           1.0|0.05263158| 0.5263158|
|tt1289406|         Harry Brown|           1.0|0.05263158| 0.5263158|
|tt0381849|        3:10 to Yuma|           1.0|0.05263158| 0.5263158|
|tt0367878|         Os Imortais|           1.0|       0.0|       0.5|
|tt0214555| Bullets Over Summer|           1.0|       0.0|       0.5|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [51]:
porownanie("tt0167260",10,df_clean)  # a Lord of the Rings film (title is truncated in the output)

+----------+--------------------+--------------+----------+----------+
|        id|               tytul|cos_similarity|       IOU|prediction|
+----------+--------------------+--------------+----------+----------+
| tt0167260|The Lord of the R...|           1.0|       1.0|       1.0|
| tt0167261|The Lord of the R...|           1.0| 0.8181818| 0.9090909|
| tt0120737|The Lord of the R...|           1.0| 0.6666667| 0.8333334|
|tt21811588|The Lord of the R...|           0.8|0.54545456| 0.6727273|
|tt21811606|The Lord of the R...|           0.8|       0.5|      0.65|
|tt21811594|The Lord of the R...|           0.8|0.45454547| 0.6272727|
| tt0360717|           King Kong|           1.0| 0.1764706| 0.5882353|
| tt0320661|   Kingdom of Heaven|           1.0|0.05263158| 0.5263158|
| tt0116040|            Daylight|           1.0|0.05263158| 0.5263158|
| tt1320185|          Chandrahas|           1.0|       0.0|       0.5|
+----------+--------------------+--------------+----------+----------+
only s

Inne próby

In [56]:
porownanie("tt0000005",10,test)  # an arbitrary pick (Blacksmith Scene)

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt0000005|    Blacksmith Scene|           1.0|       1.0|       1.0|
|tt0312728|Glenroy Bros., No. 2|           1.0|     0.125|    0.5625|
|tt0219560|The Boxing Cats (...|     0.8660254|0.16666667|0.51634604|
|tt0241524|        A Hand Shake|     0.8164966|       0.2| 0.5082483|
|tt0295663|Tommy Atkins in t...|           1.0|       0.0|       0.5|
|tt0227175|Off to Bloomingda...|           1.0|       0.0|       0.5|
|tt0258104|     Rêve et réalité|           1.0|       0.0|       0.5|
|tt0230510|Maude's Naughty L...|           1.0|       0.0|       0.5|
|tt0203653|      Lone Fisherman|           1.0|       0.0|       0.5|
|tt0312017|What Happened on ...|           1.0|       0.0|       0.5|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [57]:
porownanie("tt0054215",10,test)  # Psycho

+---------+--------------------+--------------+-----------+----------+
|       id|               tytul|cos_similarity|        IOU|prediction|
+---------+--------------------+--------------+-----------+----------+
|tt0054215|              Psycho|           1.0|        1.0|       1.0|
|tt0058403|    The Night Walker|           1.0|0.055555556| 0.5277778|
|tt0064904|Hatchet for the H...|           1.0|        0.0|       0.5|
|tt0064888|The House That Sc...|           1.0|        0.0|       0.5|
|tt0063330|  Muñecos infernales|           1.0|        0.0|       0.5|
|tt0214860| Klokker i måneskinn|           1.0|        0.0|       0.5|
|tt0055106|Werewolf in a Gir...|     0.8944272|        0.0| 0.4472136|
|tt0061005|           She Beast|     0.8944272|        0.0| 0.4472136|
|tt0373080|          Last Night|     0.8944272|        0.0| 0.4472136|
|tt0061036|           The Witch|     0.8944272|        0.0| 0.4472136|
+---------+--------------------+--------------+-----------+----------+
only s

In [58]:
porownanie("tt0081505",10,test)  # The Shining

+----------+--------------------+--------------+---+----------+
|        id|               tytul|cos_similarity|IOU|prediction|
+----------+--------------------+--------------+---+----------+
| tt0081505|         The Shining|           1.0|1.0|       1.0|
| tt0078132|The Night Before ...|           1.0|0.0|       0.5|
|tt11383980|House of Seven Be...|           1.0|0.0|       0.5|
| tt0071872|The Dead, the Dev...|           1.0|0.0|       0.5|
| tt9619984|             Bonikka|           1.0|0.0|       0.5|
| tt0095012|           The Devil|           1.0|0.0|       0.5|
| tt0067178|They Have Changed...|           1.0|0.0|       0.5|
| tt0085185|Atrapados en el m...|           1.0|0.0|       0.5|
| tt0434915|           Ek Paheli|     0.8944272|0.0| 0.4472136|
| tt0123796|   Desolation Angels|     0.8944272|0.0| 0.4472136|
+----------+--------------------+--------------+---+----------+
only showing top 10 rows

              id  score
69365  tt0081505   17.0
8589   tt0253514    7.0
72225 

In [59]:
porownanie("tt3850590",10,test)  # Krampus

+----------+--------------------+--------------+---+----------+
|        id|               tytul|cos_similarity|IOU|prediction|
+----------+--------------------+--------------+---+----------+
| tt3850590|             Krampus|           1.0|1.0|       1.0|
| tt2952044|A Villa in Los An...|           1.0|0.0|       0.5|
| tt2182256|           Premature|           1.0|0.0|       0.5|
| tt2185022|     Love on a Leash|           1.0|0.0|       0.5|
|tt10509128|               Bekçi|           1.0|0.0|       0.5|
|tt14331014|    Control the Hunt|           1.0|0.0|       0.5|
| tt5688932| Sorry to Bother You|           1.0|0.0|       0.5|
| tt7447510|       Io resto qua!|           1.0|0.0|       0.5|
| tt2659706|        Doce Amianto|           1.0|0.0|       0.5|
|tt11646756|    30 Chua Phai Tet|           1.0|0.0|       0.5|
+----------+--------------------+--------------+---+----------+
only showing top 10 rows

                id  score
48936    tt3850590   18.0
111006   tt3460062    8.0


In [60]:
porownanie("tt4963900",10,test)  # Cold War Secrets

+----------+--------------------+--------------+---+----------+
|        id|               tytul|cos_similarity|IOU|prediction|
+----------+--------------------+--------------+---+----------+
| tt4963900|Cold War Secrets:...|           1.0|1.0|       1.0|
|tt10075830|      M for Malaysia|           1.0|0.0|       0.5|
|tt10081252|Stalins Rache: Di...|           1.0|0.0|       0.5|
|tt10164744|  Spreeland. Fontane|           1.0|0.0|       0.5|
|tt10081632|Christa de Caroug...|           1.0|0.0|       0.5|
|tt10011682|  Assholes: A Theory|           1.0|0.0|       0.5|
|tt10081592|Vietnam's Subsidy...|           1.0|0.0|       0.5|
|tt10049110|           Die Wiese|           1.0|0.0|       0.5|
|tt10130954|   Riders of Destiny|           1.0|0.0|       0.5|
|tt10137106|    Brothers in Arms|           1.0|0.0|       0.5|
+----------+--------------------+--------------+---+----------+
only showing top 10 rows

                id  score
116358   tt4963900   16.0
90489   tt13449438    6.0


In [61]:
 porownanie("tt3486354",10,test)  # Star Wars: The Last Jedi

+----------+--------------------+--------------+---+----------+
|        id|               tytul|cos_similarity|IOU|prediction|
+----------+--------------------+--------------+---+----------+
| tt3486354|Star Wars: The La...|           1.0|1.0|       1.0|
|tt10540158|          05.12.2020|           0.8|0.0|       0.4|
| tt1496814|    Nevous Guard 2.0|           0.8|0.0|       0.4|
| tt0430399|        Neverthought|           0.8|0.0|       0.4|
| tt1057489|The Book of Tomorrow|           0.8|0.0|       0.4|
|tt10757744|              Eshana|           0.8|0.0|       0.4|
| tt0120198|Star Trek: The Ex...|           0.8|0.0|       0.4|
|tt12486004|  Marcel and Ardvark|           0.8|0.0|       0.4|
| tt0892866|         Bottle Neck|           0.8|0.0|       0.4|
| tt1453958|             Soulbot|           0.8|0.0|       0.4|
+----------+--------------------+--------------+---+----------+
only showing top 10 rows

               id  score
47289   tt3486354   10.0
26292   tt1365484    8.0
108

In [62]:
 porownanie("tt15677078",10,test)  # Godzilla vs. Kong

+----------+--------------------+--------------+----------+----------+
|        id|               tytul|cos_similarity|       IOU|prediction|
+----------+--------------------+--------------+----------+----------+
|tt15677078|Godzilla vs. Kong...|           1.0|       1.0|       1.0|
|tt15677080|Godzilla vs. Kong...|           1.0|0.45454547|0.72727275|
|tt15677020|Godzilla vs. Kong...|           1.0| 0.2857143|0.64285713|
|tt10873794|Godzilla King of ...|           1.0|0.18181819| 0.5909091|
|tt10873836|Godzilla: King of...|           1.0|0.18181819| 0.5909091|
| tt8565252|The 15:17 To Pari...|           1.0|0.15384616| 0.5769231|
|tt15677046|Godzilla vs. Kong...|           1.0|0.14285715| 0.5714286|
|tt10229142|         High Octane|           1.0|       0.0|       0.5|
|tt10283468|City of Imaginati...|           1.0|       0.0|       0.5|
|tt10202426|     I want to dance|           1.0|       0.0|       0.5|
+----------+--------------------+--------------+----------+----------+
only s

In [125]:
 porownanie("tt1468843",10,test)  # one of the Pokémon films

+----------+--------------------+--------------+----------+----------+
|        id|               tytul|cos_similarity|       IOU|prediction|
+----------+--------------------+--------------+----------+----------+
| tt1468843|Pokémon: Arceus a...|           1.0|       1.0|       1.0|
| tt0287635|Pokemon 4Ever: Ce...|           1.0|      0.25|     0.625|
| tt0874648|Pokémon Mystery D...|           0.8|0.33333334|0.56666666|
| tt1059944|             Minushi|           1.0|       0.0|       0.5|
| tt0306741|Doraemon: Nobita ...|           1.0|       0.0|       0.5|
| tt0170180|Lupin III: Dead o...|           1.0|       0.0|       0.5|
| tt0173840|Final Fantasy: Th...|           1.0|       0.0|       0.5|
| tt0406672|Kureyon Shin-chan...|           1.0|       0.0|       0.5|
|tt14142856|Bakuso Kyodai Let...|           1.0|       0.0|       0.5|
| tt0259974|  Digimon: The Movie|           1.0|       0.0|       0.5|
+----------+--------------------+--------------+----------+----------+
only s

In [63]:
porownanie("tt0281471",10,test)  # The Mummy (the cartoon)

+---------+--------------------+--------------+----------+----------+
|       id|               tytul|cos_similarity|       IOU|prediction|
+---------+--------------------+--------------+----------+----------+
|tt0281471|           The Mummy|           1.0|       1.0|       1.0|
|tt0976192|The Spectacular S...|           1.0|0.05263158| 0.5263158|
|tt0472252|Transformers: Cyb...|           1.0|       0.0|       0.5|
|tt0230804|   Sonic Underground|           1.0|       0.0|       0.5|
|tt0251439|              Trigun|           1.0|       0.0|       0.5|
|tt0421480|  Virus Buster Serge|           1.0|       0.0|       0.5|
|tt0211606|Saber Marionette ...|           1.0|       0.0|       0.5|
|tt0414763|         Rave Master|           1.0|       0.0|       0.5|
|tt0147746|       Batman Beyond|           1.0|       0.0|       0.5|
|tt0437711|         Dark Oracle|           1.0|       0.0|       0.5|
+---------+--------------------+--------------+----------+----------+
only showing top 10 

In [64]:
porownanie("tt0834001",10,test)  # Underworld

+---------+--------------------+--------------+---+----------+
|       id|               tytul|cos_similarity|IOU|prediction|
+---------+--------------------+--------------+---+----------+
|tt0834001|Underworld: Rise ...|           1.0|1.0|       1.0|
|tt0482930|   Dragon Tiger Gate|           1.0|0.0|       0.5|
|tt0212311|Los lobos de Wash...|     0.8944272|0.0| 0.4472136|
|tt0338960|         Ek Tha Raja|     0.8944272|0.0| 0.4472136|
|tt0340068|De la mano de un ...|     0.8944272|0.0| 0.4472136|
|tt0495626|In the Name of th...|     0.8944272|0.0| 0.4472136|
|tt0349592|        Huli sa akto|     0.8944272|0.0| 0.4472136|
|tt0275611|   Witness to a Kill|     0.8944272|0.0| 0.4472136|
|tt0404826|           Bob Steel|     0.8944272|0.0| 0.4472136|
|tt1330525|       Beast Stalker|     0.8944272|0.0| 0.4472136|
+---------+--------------------+--------------+---+----------+
only showing top 10 rows

              id  score
81549  tt0834001   18.0
14907  tt0482930    8.0
5225   tt0130827    

In [65]:
porownanie("tt7424200",10,test)  # Teen Titans

+----------+--------------------+--------------+----------+----------+
|        id|               tytul|cos_similarity|       IOU|prediction|
+----------+--------------------+--------------+----------+----------+
| tt7424200|Teen Titans GO! T...|           1.0|       1.0|       1.0|
| tt3513498|The Lego Movie 2:...|           1.0|0.11111111| 0.5555556|
|tt11177804|Legend of Deifica...|           1.0|       0.0|       0.5|
|tt14901058|Mortal Kombat Leg...|           1.0|       0.0|       0.5|
|tt12862032|Scrolls of Auristria|           1.0|       0.0|       0.5|
|tt13516524|    The Poor Hoodman|           1.0|       0.0|       0.5|
|tt12042710|Enchantimals: Spr...|           1.0|       0.0|       0.5|
|tt12876132|Deathstroke: Knig...|           1.0|       0.0|       0.5|
|tt13898164|Gekijouban Macros...|           1.0|       0.0|       0.5|
| tt1446192|Rise of the Guard...|           1.0|       0.0|       0.5|
+----------+--------------------+--------------+----------+----------+
only s

In [68]:
porownanie("tt0816671",10,test)  # Teen Titans here as well, yet they did not show up in the cos / IoU results

+----------+--------------------+--------------+---+----------+
|        id|               tytul|cos_similarity|IOU|prediction|
+----------+--------------------+--------------+---+----------+
| tt0816671|Teen Titans: Trou...|           1.0|1.0|       1.0|
| tt0874648|Pokémon Mystery D...|           1.0|0.0|       0.5|
| tt1314193|The Grim Adventur...|           1.0|0.0|       0.5|
| tt0468445|The Amazing Screw...|           1.0|0.0|       0.5|
| tt0461571|Lupin III: Da Cap...|           1.0|0.0|       0.5|
|tt22753230|      Kingdom Hearts|           1.0|0.0|       0.5|
| tt0312250|Welcome to Elting...|           1.0|0.0|       0.5|
| tt0142232|Dragon Ball GT: A...|           1.0|0.0|       0.5|
| tt0189974|Lupin III: Island...|           1.0|0.0|       0.5|
| tt1543920|     Turtles Forever|           1.0|0.0|       0.5|
+----------+--------------------+--------------+---+----------+
only showing top 10 rows

                id  score
15937    tt0816671   18.0
124057   tt7424200   10.0


In [70]:
porownanie("tt3213366",10,test)  # Seven, but not THAT Seven

+----------+-------------+--------------+-----------+----------+
|        id|        tytul|cos_similarity|        IOU|prediction|
+----------+-------------+--------------+-----------+----------+
| tt3213366|        Seven|           1.0|        1.0|       1.0|
| tt8194358|       Dodger|           1.0|0.071428575|0.53571427|
| tt4468148|The Hypnotist|     0.8944272| 0.14285715| 0.5186422|
|tt10010034|      Anxiety|           1.0|        0.0|       0.5|
|tt10010330|         Akif|           1.0|        0.0|       0.5|
|tt10010626|   Bellingham|           1.0|        0.0|       0.5|
|tt10120498|        Jakab|           1.0|        0.0|       0.5|
|tt10019100|         Pick|           1.0|        0.0|       0.5|
|tt10011248|        Lilac|           1.0|        0.0|       0.5|
|tt10016554|        Tesfa|           1.0|        0.0|       0.5|
+----------+-------------+--------------+-----------+----------+
only showing top 10 rows

                id  score
46201    tt3213366   17.0
51037    tt4