In [1]:
# Array / dataframe helpers
# NOTE(review): np, pd, px, iplot and datetime are not referenced in the cells
# visible in this part of the notebook — confirm they are still needed.
import numpy as np
import pandas as pd

# Visualization
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
# Switch plotly's offline mode on for this notebook session.
init_notebook_mode(connected=True)

from datetime import datetime

In [2]:
# Get (or create) the Spark session, asking for a 10g driver heap
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.config("spark.driver.memory", "10g")
spark = session_builder.getOrCreate()

24/08/01 09:55:05 WARN Utils: Your hostname, ubuntu20 resolves to a loopback address: 127.0.1.1; using 192.168.0.234 instead (on interface wlp0s20f3)
24/08/01 09:55:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/01 09:55:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the anime catalogue with an explicit schema (no type inference)
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

# (column name, Spark type) pairs in CSV column order; every field is nullable.
anime_columns = [
    ("anime_id", IntegerType),
    ("Name", StringType),
    ("English name", StringType),
    ("Other name", StringType),
    ("Score", FloatType),
    ("Genres", StringType),
    ("Synopsis", StringType),
    ("Type", StringType),
    ("Episodes", FloatType),
    ("Aired", StringType),
    ("Premiered", StringType),
    ("Status", StringType),
    ("Producers", StringType),
    ("Licensors", StringType),
    ("Studios", StringType),
    ("Source", StringType),
    ("Duration", StringType),
    ("Rating", StringType),
    ("Rank", FloatType),
    ("Popularity", IntegerType),
    ("Favorites", IntegerType),
    ("Scored By", FloatType),
    ("Members", FloatType),
    ("Image URL", StringType),
]
schema = StructType([StructField(name, dtype(), True) for name, dtype in anime_columns])

# multiLine plus the quote/escape settings let quoted fields span several lines.
df_anime = spark.read.csv(
    '../dataset/myanimelist-dataset/processed-dataset/anime-dataset-2023.csv',
    header=True,
    schema=schema,
    multiLine=True,
    quote='\"',
    escape='\"',
)

In [4]:
# Load the user-details CSV with an explicit schema
# (column name, Spark type) pairs in CSV column order; every field is nullable.
user_columns = [
    ("Mal ID", IntegerType),
    ("Username", StringType),
    ("Gender", StringType),
    ("Birthday", StringType),
    ("Location", StringType),
    ("Joined", StringType),
    ("Days Watched", FloatType),
    ("Mean Score", FloatType),
    ("Watching", FloatType),
    ("Completed", FloatType),
    ("On Hold", FloatType),
    ("Dropped", FloatType),
    ("Plan to Watch", FloatType),
    ("Total Entries", FloatType),
    ("Rewatched", FloatType),
    ("Episodes Watched", FloatType),
]
schema = StructType([StructField(name, dtype(), True) for name, dtype in user_columns])

df_user = spark.read.csv(
    "../dataset/myanimelist-dataset/processed-dataset/users-details-2023.csv",
    header=True,
    schema=schema,
)

In [5]:
# Load the user-score CSV with an explicit schema
# (column name, Spark type) pairs in CSV column order; every field is nullable.
score_columns = [
    ("user_id", IntegerType),
    ("Username", StringType),
    ("anime_id", IntegerType),
    ("Anime Title", StringType),
    ("rating", IntegerType),
]
schema = StructType([StructField(name, dtype(), True) for name, dtype in score_columns])

df_score = spark.read.csv(
    "../dataset/myanimelist-dataset/processed-dataset/users-score-2023.csv",
    header=True,
    schema=schema,
)

In [6]:
import pyspark.sql.functions as f

In [7]:
# Keep only rows whose Genres and Studios strings do not contain 'UNKNOWN'.
has_known_genre = ~f.col('Genres').contains('UNKNOWN')
has_known_studio = ~f.col('Studios').contains('UNKNOWN')
df_anime = df_anime.where(has_known_genre & has_known_studio)
df_anime.show()

+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-----------+-------------+--------------------+------+----------+---------+---------+---------+--------------------+
|anime_id|                Name|        English name|                    Other name|Score|              Genres|            Synopsis| Type|Episodes|               Aired|  Premiered|          Status|           Producers|           Licensors|         Studios|     Source|     Duration|              Rating|  Rank|Popularity|Favorites|Scored By|  Members|           Image URL|
+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-

In [8]:
# Narrow the catalogue to the id plus the text columns used further down.
feature_source_cols = ['anime_id', 'Name', 'Genres', 'Synopsis', 'Studios']
df_anime_reduced = df_anime.select(*feature_source_cols)
df_anime_reduced.show(n=10, truncate=False)

+--------+-------------------------------+------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.types import ArrayType

In [10]:
# Tokenise titles into runs of letters, digits, "-" and "_".
# gaps=False: the pattern describes the tokens themselves, not the separators.
nameRegexTokenizer = RegexTokenizer(
    inputCol="Name",
    outputCol="Name_tokens",
    pattern="[a-zA-Z0-9-_]+",
    gaps=False,
)
df_anime_reduced = nameRegexTokenizer.transform(df_anime_reduced)

In [11]:
# Preview the first 10 rows, now including the Name_tokens column.
df_anime_reduced.show(10)

+--------+--------------------+--------------------+--------------------+----------------+--------------------+
|anime_id|                Name|              Genres|            Synopsis|         Studios|         Name_tokens|
+--------+--------------------+--------------------+--------------------+----------------+--------------------+
|       1|        Cowboy Bebop|Action, Award Win...|Crime is timeless...|         Sunrise|     [cowboy, bebop]|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|Another day, anot...|           Bones|[cowboy, bebop, t...|
|       6|              Trigun|Action, Adventure...|Vash the Stampede...|        Madhouse|            [trigun]|
|       7|  Witch Hunter Robin|Action, Drama, My...|Robin Sena is a p...|         Sunrise|[witch, hunter, r...|
|       8|      Bouken Ou Beet|Adventure, Fantas...|It is the dark ce...|  Toei Animation|  [bouken, ou, beet]|
|      15|        Eyeshield 21|              Sports|Shy, reserved, an...|          Gallop|     [eyeshiel

In [14]:
# udf to remove "-" and "_" from the tokens
import re


def _strip_hyphens(tokens):
    """Remove every "-" and "_" from each token and drop tokens left empty.

    Args:
        tokens: list of token strings, or None for a null array.

    Returns:
        A new list of cleaned tokens in the original order, or None when
        ``tokens`` is None (a null array stays null instead of raising).
    """
    if tokens is None:
        return None
    # '[-_]' matches only the two characters we want; the earlier class
    # '[-|_]' also matched a literal '|'.
    stripped = (re.sub(r'[-_]', '', word) for word in tokens)
    # A token such as "-" or "__" collapses to "", which would otherwise be
    # hashed as a feature shared by unrelated rows.
    return [word for word in stripped if word]


remove_hyphen_udf = f.udf(_strip_hyphens, ArrayType(StringType()))

In [15]:
# Clean "-" / "_" out of the title tokens, overwriting Name_tokens.
df_anime_reduced = df_anime_reduced.withColumn(
    'Name_tokens',
    remove_hyphen_udf(df_anime_reduced['Name_tokens']),
)

In [16]:
# Preview the first 10 rows after remove_hyphen_udf was applied to Name_tokens.
df_anime_reduced.show(10)

+--------+--------------------+--------------------+--------------------+----------------+--------------------+
|anime_id|                Name|              Genres|            Synopsis|         Studios|         Name_tokens|
+--------+--------------------+--------------------+--------------------+----------------+--------------------+
|       1|        Cowboy Bebop|Action, Award Win...|Crime is timeless...|         Sunrise|     [cowboy, bebop]|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|Another day, anot...|           Bones|[cowboy, bebop, t...|
|       6|              Trigun|Action, Adventure...|Vash the Stampede...|        Madhouse|            [trigun]|
|       7|  Witch Hunter Robin|Action, Drama, My...|Robin Sena is a p...|         Sunrise|[witch, hunter, r...|
|       8|      Bouken Ou Beet|Adventure, Fantas...|It is the dark ce...|  Toei Animation|  [bouken, ou, beet]|
|      15|        Eyeshield 21|              Sports|Shy, reserved, an...|          Gallop|     [eyeshiel

                                                                                

In [17]:
# Drop English stop words from the title tokens.
# loadDefaultStopWords() is a static method that only *returns* the word list;
# calling it and discarding the result configures nothing, so the list is
# passed explicitly through stopWords=.
nameRemover = StopWordsRemover(
    inputCol="Name_tokens",
    outputCol="Name_tokens_removed",
    stopWords=StopWordsRemover.loadDefaultStopWords('english'),
)
df_anime_reduced = nameRemover.transform(df_anime_reduced)

In [18]:
# Put each title next to its tokens after stop-word removal.
name_preview = df_anime_reduced.select('Name', 'Name_tokens_removed')
name_preview.show(n=20, truncate=False)

+-------------------------------+--------------------------------+
|Name                           |Name_tokens_removed             |
+-------------------------------+--------------------------------+
|Cowboy Bebop                   |[cowboy, bebop]                 |
|Cowboy Bebop: Tengoku no Tobira|[cowboy, bebop, tengoku, tobira]|
|Trigun                         |[trigun]                        |
|Witch Hunter Robin             |[witch, hunter, robin]          |
|Bouken Ou Beet                 |[bouken, ou, beet]              |
|Eyeshield 21                   |[eyeshield, 21]                 |
|Hachimitsu to Clover           |[hachimitsu, clover]            |
|Hungry Heart: Wild Striker     |[hungry, heart, wild, striker]  |
|Initial D Fourth Stage         |[initial, d, fourth, stage]     |
|Monster                        |[monster]                       |
|Naruto                         |[naruto]                        |
|One Piece                      |[one, piece]                 

In [19]:
# Tokenise synopses with the same token pattern used for titles
# (gaps=False: the pattern matches tokens rather than separators).
regexTokenizer = RegexTokenizer(
    inputCol="Synopsis",
    outputCol="Synopsis_tokens",
    pattern="[a-zA-Z0-9-_]+",
    gaps=False,
)
df_anime_reduced = regexTokenizer.transform(df_anime_reduced)

In [20]:
# Preview the first 10 rows, now including the Synopsis_tokens column.
df_anime_reduced.show(10)

+--------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+
|anime_id|                Name|              Genres|            Synopsis|         Studios|         Name_tokens| Name_tokens_removed|     Synopsis_tokens|
+--------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+
|       1|        Cowboy Bebop|Action, Award Win...|Crime is timeless...|         Sunrise|     [cowboy, bebop]|     [cowboy, bebop]|[crime, is, timel...|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|Another day, anot...|           Bones|[cowboy, bebop, t...|[cowboy, bebop, t...|[another, day, an...|
|       6|              Trigun|Action, Adventure...|Vash the Stampede...|        Madhouse|            [trigun]|            [trigun]|[vash, the, stamp...|
|       7|  Witch Hunter Robin|Action, Drama, My...|Robin Sena is a p...|   

In [21]:
# Clean "-" / "_" out of the synopsis tokens, overwriting Synopsis_tokens.
df_anime_reduced = df_anime_reduced.withColumn(
    'Synopsis_tokens',
    remove_hyphen_udf(df_anime_reduced['Synopsis_tokens']),
)

In [22]:
# Preview the first 10 rows after remove_hyphen_udf was applied to Synopsis_tokens.
df_anime_reduced.show(10)

+--------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+
|anime_id|                Name|              Genres|            Synopsis|         Studios|         Name_tokens| Name_tokens_removed|     Synopsis_tokens|
+--------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+
|       1|        Cowboy Bebop|Action, Award Win...|Crime is timeless...|         Sunrise|     [cowboy, bebop]|     [cowboy, bebop]|[crime, is, timel...|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|Another day, anot...|           Bones|[cowboy, bebop, t...|[cowboy, bebop, t...|[another, day, an...|
|       6|              Trigun|Action, Adventure...|Vash the Stampede...|        Madhouse|            [trigun]|            [trigun]|[vash, the, stamp...|
|       7|  Witch Hunter Robin|Action, Drama, My...|Robin Sena is a p...|   

In [23]:
# Drop English stop words from the synopsis tokens.
# loadDefaultStopWords() is a static method that only *returns* the word list;
# calling it and discarding the result configures nothing, so the list is
# passed explicitly through stopWords=.
remover = StopWordsRemover(
    inputCol="Synopsis_tokens",
    outputCol="Synopsis_tokens_removed",
    stopWords=StopWordsRemover.loadDefaultStopWords('english'),
)
df_anime_reduced = remover.transform(df_anime_reduced)

In [24]:
# Preview the first 10 rows, now including the Synopsis_tokens_removed column.
df_anime_reduced.show(10)

+--------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+-----------------------+
|anime_id|                Name|              Genres|            Synopsis|         Studios|         Name_tokens| Name_tokens_removed|     Synopsis_tokens|Synopsis_tokens_removed|
+--------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+-----------------------+
|       1|        Cowboy Bebop|Action, Award Win...|Crime is timeless...|         Sunrise|     [cowboy, bebop]|     [cowboy, bebop]|[crime, is, timel...|   [crime, timeless,...|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|Another day, anot...|           Bones|[cowboy, bebop, t...|[cowboy, bebop, t...|[another, day, an...|   [another, day, an...|
|       6|              Trigun|Action, Adventure...|Vash the Stampede...|        Madhouse|            [trigun]

In [25]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# NLTK's Porter stemmer, used by the UDF defined below.
ps = PorterStemmer()


@f.udf(returnType=ArrayType(StringType()))
def stemming(tokens):
    """Return the Porter stem of every token, in the original order."""
    return [ps.stem(token) for token in tokens]


# Stem the stop-word-filtered synopsis tokens, then keep only the id, the
# raw Name/Genres/Studios columns and the two processed token columns.
df_anime_reduced = (
    df_anime_reduced
    .withColumn('Synopsis_tokens_stemmed', stemming(f.col('Synopsis_tokens_removed')))
    .select('anime_id', 'Name', 'Genres', 'Name_tokens_removed', 'Studios', 'Synopsis_tokens_stemmed')
)

In [26]:
# Preview the first 10 rows of the trimmed frame with Synopsis_tokens_stemmed.
df_anime_reduced.show(10)

+--------+--------------------+--------------------+--------------------+----------------+-----------------------+
|anime_id|                Name|              Genres| Name_tokens_removed|         Studios|Synopsis_tokens_stemmed|
+--------+--------------------+--------------------+--------------------+----------------+-----------------------+
|       1|        Cowboy Bebop|Action, Award Win...|     [cowboy, bebop]|         Sunrise|   [crime, timeless,...|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|[cowboy, bebop, t...|           Bones|   [anoth, day, anot...|
|       6|              Trigun|Action, Adventure...|            [trigun]|        Madhouse|   [vash, stamped, m...|
|       7|  Witch Hunter Robin|Action, Drama, My...|[witch, hunter, r...|         Sunrise|   [robin, sena, pow...|
|       8|      Bouken Ou Beet|Adventure, Fantas...|  [bouken, ou, beet]|  Toei Animation|   [dark, centuri, p...|
|      15|        Eyeshield 21|              Sports|     [eyeshield, 21]|       

In [27]:
# Turn the ", "-separated Genres string into an array-of-strings column.
genres_array = f.split(f.col('Genres'), ', ').cast(ArrayType(StringType()))
df_anime_reduced = df_anime_reduced.withColumn("Genres", genres_array)
df_anime_reduced.show(n=5)

+--------+--------------------+--------------------+--------------------+--------------+-----------------------+
|anime_id|                Name|              Genres| Name_tokens_removed|       Studios|Synopsis_tokens_stemmed|
+--------+--------------------+--------------------+--------------------+--------------+-----------------------+
|       1|        Cowboy Bebop|[Action, Award Wi...|     [cowboy, bebop]|       Sunrise|   [crime, timeless,...|
|       5|Cowboy Bebop: Ten...|    [Action, Sci-Fi]|[cowboy, bebop, t...|         Bones|   [anoth, day, anot...|
|       6|              Trigun|[Action, Adventur...|            [trigun]|      Madhouse|   [vash, stamped, m...|
|       7|  Witch Hunter Robin|[Action, Drama, M...|[witch, hunter, r...|       Sunrise|   [robin, sena, pow...|
|       8|      Bouken Ou Beet|[Adventure, Fanta...|  [bouken, ou, beet]|Toei Animation|   [dark, centuri, p...|
+--------+--------------------+--------------------+--------------------+--------------+--------

In [28]:
# List the rows whose Studios string contains a comma (several studios).
lists_several_studios = f.col('Studios').contains(',')
df_anime_reduced.where(lists_several_studios).show()

+--------+--------------------+--------------------+--------------------+--------------------+-----------------------+
|anime_id|                Name|              Genres| Name_tokens_removed|             Studios|Synopsis_tokens_stemmed|
+--------+--------------------+--------------------+--------------------+--------------------+-----------------------+
|      30|Neon Genesis Evan...|[Action, Avant Ga...|[neon, genesis, e...|Gainax, Tatsunoko...|   [fifteen, year, c...|
|      31|Neon Genesis Evan...|     [Drama, Sci-Fi]|[neon, genesis, e...|Gainax, Productio...|   [year, 2015, deca...|
|      32|Neon Genesis Evan...|[Avant Garde, Dra...|[neon, genesis, e...|Gainax, Productio...|   [shinji, ikari, l...|
|      45|Rurouni Kenshin: ...|[Action, Adventur...|[rurouni, kenshin...| Gallop, Studio Deen|   [final, year, bak...|
|      62|       D.C.: Da Capo|    [Drama, Romance]|    [d, c, da, capo]|        feel., Zexcs|   [hatsunejima, abo...|
|     112|Chou Henshin Cosp...|[Action, Adventur

In [29]:
# Turn the ", "-separated Studios string into an array-of-strings column.
studios_array = f.split(f.col('Studios'), ', ').cast(ArrayType(StringType()))
df_anime_reduced = df_anime_reduced.withColumn("Studios", studios_array)
df_anime_reduced.show(n=5)

+--------+--------------------+--------------------+--------------------+----------------+-----------------------+
|anime_id|                Name|              Genres| Name_tokens_removed|         Studios|Synopsis_tokens_stemmed|
+--------+--------------------+--------------------+--------------------+----------------+-----------------------+
|       1|        Cowboy Bebop|[Action, Award Wi...|     [cowboy, bebop]|       [Sunrise]|   [crime, timeless,...|
|       5|Cowboy Bebop: Ten...|    [Action, Sci-Fi]|[cowboy, bebop, t...|         [Bones]|   [anoth, day, anot...|
|       6|              Trigun|[Action, Adventur...|            [trigun]|      [Madhouse]|   [vash, stamped, m...|
|       7|  Witch Hunter Robin|[Action, Drama, M...|[witch, hunter, r...|       [Sunrise]|   [robin, sena, pow...|
|       8|      Bouken Ou Beet|[Adventure, Fanta...|  [bouken, ou, beet]|[Toei Animation]|   [dark, centuri, p...|
+--------+--------------------+--------------------+--------------------+-------

In [30]:
# Feature bag = genres followed by the stop-word-filtered title tokens.
# Two earlier variants (also appending Synopsis_tokens_stemmed, or appending
# Studios) were tried and are intentionally left out here.
# NOTE: the column keeps its original spelling 'intial_feature' because the
# HashingTF cell refers to it by that exact name.
feature_parts = [f.col('Genres'), f.col('Name_tokens_removed')]
df_anime_reduced = df_anime_reduced.withColumn('intial_feature', f.concat(*feature_parts))


df_anime_reduced.select('intial_feature').show(truncate=False)

+-------------------------------------------------------------+
|intial_feature                                               |
+-------------------------------------------------------------+
|[Action, Award Winning, Sci-Fi, cowboy, bebop]               |
|[Action, Sci-Fi, cowboy, bebop, tengoku, tobira]             |
|[Action, Adventure, Sci-Fi, trigun]                          |
|[Action, Drama, Mystery, Supernatural, witch, hunter, robin] |
|[Adventure, Fantasy, Supernatural, bouken, ou, beet]         |
|[Sports, eyeshield, 21]                                      |
|[Comedy, Drama, Romance, hachimitsu, clover]                 |
|[Comedy, Slice of Life, Sports, hungry, heart, wild, striker]|
|[Action, Drama, initial, d, fourth, stage]                   |
|[Drama, Mystery, Suspense, monster]                          |
|[Action, Adventure, Fantasy, naruto]                         |
|[Action, Adventure, Fantasy, one, piece]                     |
|[Sports, tennis, oujisama]             

In [31]:
# Hash each feature bag into a term-frequency vector (rawFeatures).
hashingTF = HashingTF(
    inputCol="intial_feature",
    outputCol="rawFeatures",
)
featurizedData = hashingTF.transform(df_anime_reduced)

In [32]:
# Preview the first 5 rows, now including the hashed rawFeatures column.
featurizedData.show(5)

+--------+--------------------+--------------------+--------------------+----------------+-----------------------+--------------------+--------------------+
|anime_id|                Name|              Genres| Name_tokens_removed|         Studios|Synopsis_tokens_stemmed|      intial_feature|         rawFeatures|
+--------+--------------------+--------------------+--------------------+----------------+-----------------------+--------------------+--------------------+
|       1|        Cowboy Bebop|[Action, Award Wi...|     [cowboy, bebop]|       [Sunrise]|   [crime, timeless,...|[Action, Award Wi...|(262144,[15444,61...|
|       5|Cowboy Bebop: Ten...|    [Action, Sci-Fi]|[cowboy, bebop, t...|         [Bones]|   [anoth, day, anot...|[Action, Sci-Fi, ...|(262144,[15444,61...|
|       6|              Trigun|[Action, Adventur...|            [trigun]|      [Madhouse]|   [vash, stamped, m...|[Action, Adventur...|(262144,[15444,42...|
|       7|  Witch Hunter Robin|[Action, Drama, M...|[witch

                                                                                

In [33]:
# Fit IDF weights on the hashed term frequencies, then rescale them into
# the final "features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

                                                                                

In [34]:
# Preview the first 5 rows, now including the IDF-rescaled features column.
rescaledData.show(5)

24/08/01 09:55:22 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


+--------+--------------------+--------------------+--------------------+----------------+-----------------------+--------------------+--------------------+--------------------+
|anime_id|                Name|              Genres| Name_tokens_removed|         Studios|Synopsis_tokens_stemmed|      intial_feature|         rawFeatures|            features|
+--------+--------------------+--------------------+--------------------+----------------+-----------------------+--------------------+--------------------+--------------------+
|       1|        Cowboy Bebop|[Action, Award Wi...|     [cowboy, bebop]|       [Sunrise]|   [crime, timeless,...|[Action, Award Wi...|(262144,[15444,61...|(262144,[15444,61...|
|       5|Cowboy Bebop: Ten...|    [Action, Sci-Fi]|[cowboy, bebop, t...|         [Bones]|   [anoth, day, anot...|[Action, Sci-Fi, ...|(262144,[15444,61...|(262144,[15444,61...|
|       6|              Trigun|[Action, Adventur...|            [trigun]|      [Madhouse]|   [vash, stamped, m

                                                                                

In [35]:
import numpy as np 

def cos_sim(u, v):
    """Cosine similarity of two vectors.

    Args:
        u: vector object exposing ``dot(other)`` and ``norm(p)``.
        v: vector object exposing ``dot(other)`` and ``norm(p)``.

    Returns:
        ``u.dot(v) / (u.norm(2) * v.norm(2))`` as a Python float, or ``0.0``
        when either vector has zero L2 norm.
    """
    denominator = u.norm(2) * v.norm(2)
    # A zero-norm vector would make the ratio a division by zero (NaN or an
    # error, depending on the numeric type); treat it as "no similarity".
    if denominator == 0:
        return 0.0
    return float(u.dot(v) / denominator)

# Expose cos_sim as a Spark UDF producing a float column.
compute_sim = f.udf(cos_sim, returnType=FloatType())

In [36]:
# Keep just the id and its feature vector for the pairwise comparison.
df_anime_reduced = rescaledData.select(f.col('anime_id'), f.col('features'))

In [37]:
# Preview the first 5 (anime_id, features) rows.
df_anime_reduced.show(5)

24/08/01 09:55:23 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


+--------+--------------------+
|anime_id|            features|
+--------+--------------------+
|       1|(262144,[15444,61...|
|       5|(262144,[15444,61...|
|       6|(262144,[15444,42...|
|       7|(262144,[4973,954...|
|       8|(262144,[4973,382...|
+--------+--------------------+
only showing top 5 rows



In [38]:
# Pair every row with every row (self-pairs included). The right-hand copy
# gets the names anime_id_2 / features2 so both sides stay addressable.
right_side = df_anime_reduced.select(
    f.col('anime_id').alias('anime_id_2'),
    f.col('features').alias('features2'),
)
df_anime_reduced = df_anime_reduced.crossJoin(right_side)

In [39]:
# Preview the first 5 rows of the cross-joined pairs.
df_anime_reduced.show(5)

24/08/01 09:55:23 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/08/01 09:55:24 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


+--------+--------------------+----------+--------------------+
|anime_id|            features|anime_id_2|           features2|
+--------+--------------------+----------+--------------------+
|       1|(262144,[15444,61...|         1|(262144,[15444,61...|
|       1|(262144,[15444,61...|         5|(262144,[15444,61...|
|       1|(262144,[15444,61...|         6|(262144,[15444,42...|
|       1|(262144,[15444,61...|         7|(262144,[4973,954...|
|       1|(262144,[15444,61...|         8|(262144,[4973,382...|
+--------+--------------------+----------+--------------------+
only showing top 5 rows



In [40]:
# Total number of pairs produced by the cross join (triggers a full job).
df_anime_reduced.count()

141610000

In [41]:
# Discard pairs of a title with itself, then count what remains.
is_distinct_pair = f.col('anime_id') != f.col('anime_id_2')
df_anime_reduced = df_anime_reduced.where(is_distinct_pair)
df_anime_reduced.count()

                                                                                

141598100

In [42]:
# Score every remaining pair with the cosine-similarity UDF.
computed_df = df_anime_reduced.withColumn(
    'cos_sim',
    compute_sim(df_anime_reduced['features'], df_anime_reduced['features2']),
)

In [43]:
# Preview the first 5 scored pairs, including the new cos_sim column.
computed_df.show(5)

24/08/01 09:55:27 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/08/01 09:55:28 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


+--------+--------------------+----------+--------------------+-----------+
|anime_id|            features|anime_id_2|           features2|    cos_sim|
+--------+--------------------+----------+--------------------+-----------+
|       1|(262144,[15444,61...|         5|(262144,[15444,61...|  0.6602944|
|       1|(262144,[15444,61...|         6|(262144,[15444,42...| 0.03865153|
|       1|(262144,[15444,61...|         7|(262144,[4973,954...|0.008507955|
|       1|(262144,[15444,61...|         8|(262144,[4973,382...|        0.0|
|       1|(262144,[15444,61...|        15|(262144,[87466,16...|        0.0|
+--------+--------------------+----------+--------------------+-----------+
only showing top 5 rows



                                                                                

In [61]:
# Look up the ids of every title whose Name contains 'K-On'.
k_on_titles = df_anime.where(f.col('Name').contains('K-On'))
k_on_titles.select('anime_id', 'Name', 'English name').show(truncate=False)

+--------+------------------+-----------------+
|anime_id|Name              |English name     |
+--------+------------------+-----------------+
|5680    |K-On!             |K-ON!            |
|6862    |K-On!: Live House!|K-ON! Live House!|
|7017    |K-On!: Ura-On!    |K-On!: Ura-On!   |
|7791    |K-On!!            |K-ON! Season 2   |
|9203    |K-On!!: Ura-On!!  |K-ON!!: Ura-On!! |
|9617    |K-On! Movie       |K-ON! The Movie  |
|9734    |K-On!!: Keikaku!  |K-On!!: Plan!    |
+--------+------------------+-----------------+



In [62]:
# Id of the title to recommend for.
anime_id = 5680

# Show the catalogue row of the chosen title.
print("Chosen anime: ")
chosen_anime = (
    df_anime
    .where(f.col('anime_id') == anime_id)
    .select('anime_id', 'Name', 'Score', 'Genres', 'Studios')
)
chosen_anime.show(truncate=False)

Chosen anime: 
+--------+-----+-----+------+---------------+
|anime_id|Name |Score|Genres|Studios        |
+--------+-----+-----+------+---------------+
|5680    |K-On!|7.85 |Comedy|Kyoto Animation|
+--------+-----+-----+------+---------------+



In [63]:
# All pairs whose left side is the chosen title, highest cos_sim first.
recommendation_result = (
    computed_df
    .where(f.col('anime_id') == anime_id)
    .orderBy(f.col('cos_sim').desc())
)
recommendation_result.show(n=20)

24/08/01 09:57:59 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/08/01 09:57:59 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
[Stage 78:>                                                         (0 + 1) / 1]

+--------+--------------------+----------+--------------------+------------+
|anime_id|            features|anime_id_2|           features2|     cos_sim|
+--------+--------------------+----------+--------------------+------------+
|    5680|(262144,[4915,204...|      7791|(262144,[4915,833...|   0.8679531|
|    5680|(262144,[4915,204...|      9617|(262144,[4915,833...|   0.8217346|
|    5680|(262144,[4915,204...|      9734|(262144,[4915,937...|   0.7107434|
|    5680|(262144,[4915,204...|      7017|(262144,[4915,265...|   0.6638239|
|    5680|(262144,[4915,204...|      9203|(262144,[4915,265...|   0.6638239|
|    5680|(262144,[4915,204...|      6862|(262144,[4915,330...|  0.63034767|
|    5680|(262144,[4915,204...|      3470|(262144,[4915,205...| 0.021373251|
|    5680|(262144,[4915,204...|       527|(262144,[4915,154...| 0.018533273|
|    5680|(262144,[4915,204...|     34009|(262144,[4915,335...| 0.017481092|
|    5680|(262144,[4915,204...|     37286|(262144,[4915,172...| 0.015772797|

                                                                                

In [64]:
# Drop the feature vectors; keep only the candidate id and its score.
recommendation_result_reduced = recommendation_result.select(
    f.col('anime_id_2'),
    f.col('cos_sim'),
)
recommendation_result_reduced.show()

24/08/01 09:58:04 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/08/01 09:58:05 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
[Stage 80:>                                                         (0 + 1) / 1]

+----------+------------+
|anime_id_2|     cos_sim|
+----------+------------+
|      7791|   0.8679531|
|      9617|   0.8217346|
|      9734|   0.7107434|
|      7017|   0.6638239|
|      9203|   0.6638239|
|      6862|  0.63034767|
|      3470| 0.021373251|
|       527| 0.018533273|
|     34009| 0.017481092|
|     37286| 0.015772797|
|     52022| 0.015639778|
|     10165| 0.015531302|
|     14045|  0.01530865|
|     41916| 0.015303672|
|     43807| 0.014930413|
|       918|0.0148495445|
|      9969|0.0148495445|
|     28977|0.0148495445|
|     34096|0.0148495445|
|     31174| 0.014507501|
+----------+------------+
only showing top 20 rows



                                                                                

In [65]:
# Attach catalogue details to each candidate and rank by cos_sim, descending.
candidate_matches_catalogue = recommendation_result_reduced.anime_id_2 == df_anime.anime_id
df_anime_joined = (
    recommendation_result_reduced
    .join(df_anime, on=candidate_matches_catalogue)
    .orderBy(f.col('cos_sim').desc())
    .select('anime_id', 'cos_sim', 'Name', 'Score', 'Genres', 'Studios')
)

df_anime_joined.show(truncate=False)

24/08/01 09:58:09 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/08/01 09:58:09 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
[Stage 83:>                                                         (0 + 1) / 1]

+--------+------------+------------------+-----+----------------------------------+------------------------------+
|anime_id|cos_sim     |Name              |Score|Genres                            |Studios                       |
+--------+------------+------------------+-----+----------------------------------+------------------------------+
|7791    |0.8679531   |K-On!!            |8.17 |Award Winning, Comedy             |Kyoto Animation               |
|9617    |0.8217346   |K-On! Movie       |8.35 |Award Winning, Comedy             |Kyoto Animation               |
|9734    |0.7107434   |K-On!!: Keikaku!  |7.85 |Comedy, Slice of Life             |Kyoto Animation               |
|7017    |0.6638239   |K-On!: Ura-On!    |6.36 |Comedy                            |Kyoto Animation               |
|9203    |0.6638239   |K-On!!: Ura-On!!  |6.54 |Comedy                            |Kyoto Animation               |
|6862    |0.63034767  |K-On!: Live House!|7.84 |Comedy                          

                                                                                

In [66]:
number_of_recommendations = 20
# Studios string of the chosen title: first element of the collected list.
studio_values = chosen_anime.select(f.collect_list('Studios')).first()[0]
studio = studio_values[0]

# Ranked candidates whose Studios string contains the chosen title's string.
from_same_studio = f.col('Studios').contains(studio)
df_anime_joined.where(from_same_studio).show(int(number_of_recommendations), truncate=False)

24/08/01 09:58:14 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/08/01 09:58:14 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
[Stage 89:>                                                         (0 + 1) / 1]

+--------+------------+-------------------------------+-----+------------------------------------+-----------------------------+
|anime_id|cos_sim     |Name                           |Score|Genres                              |Studios                      |
+--------+------------+-------------------------------+-----+------------------------------------+-----------------------------+
|7791    |0.8679531   |K-On!!                         |8.17 |Award Winning, Comedy               |Kyoto Animation              |
|9617    |0.8217346   |K-On! Movie                    |8.35 |Award Winning, Comedy               |Kyoto Animation              |
|9734    |0.7107434   |K-On!!: Keikaku!               |7.85 |Comedy, Slice of Life               |Kyoto Animation              |
|7017    |0.6638239   |K-On!: Ura-On!                 |6.36 |Comedy                              |Kyoto Animation              |
|9203    |0.6638239   |K-On!!: Ura-On!!               |6.54 |Comedy                              

                                                                                

In [67]:
# Ranked candidates whose Studios string does NOT contain the chosen title's.
from_other_studio = ~f.col('Studios').contains(studio)
df_anime_joined.where(from_other_studio).show(int(number_of_recommendations), truncate=False)

24/08/01 09:58:59 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/08/01 09:58:59 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
[Stage 92:>                                                         (0 + 1) / 1]

+--------+------------+----------------+-----+----------------------------------+------------------------------+
|anime_id|cos_sim     |Name            |Score|Genres                            |Studios                       |
+--------+------------+----------------+-----+----------------------------------+------------------------------+
|3470    |0.021373251 |Special A       |7.51 |Comedy, Romance                   |Gonzo, AIC                    |
|527     |0.018533273 |Pokemon         |7.37 |Action, Adventure, Comedy, Fantasy|OLM                           |
|34009   |0.017481092 |To Be Hero      |6.96 |Comedy                            |Haoliners Animation League    |
|37286   |0.015772797 |It's My Life    |5.67 |Comedy, Fantasy                   |Creators in Pack              |
|52022   |0.015639778 |In the Film     |6.38 |Comedy                            |Uguisu Kobo                   |
|14045   |0.01530865  |Mangirl!        |5.8  |Comedy, Slice of Life             |Doga Kobo      

                                                                                