# MyAnimeList

### Prerequisites:

In [None]:
# Install (or upgrade to the latest release of) the third-party packages
# listed below, using IPython's %pip line magic.
%pip install --upgrade pyspark
%pip install --upgrade numpy
%pip install --upgrade pandas
# Write the versions reported by `pip freeze` to requirements.txt.
%pip freeze > requirements.txt


In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Reuse the already-running SparkContext if there is one. Calling the bare
# SparkContext() constructor a second time in the same kernel (e.g. when this
# cell is re-run) raises "ValueError: Cannot run multiple SparkContexts at
# once"; getOrCreate() makes the cell safe to execute repeatedly.
sc = SparkContext.getOrCreate()

# The session builder picks up the active context created/reused above.
spark = SparkSession\
    .builder\
    .getOrCreate()

# Load the anime catalogue.
#   header=True      -> the first line supplies the column names
#   inferSchema=True -> column types are inferred from the data
#   sep=","          -> comma-separated fields
# NOTE(review): no quote/escape/multiLine options are set, so Spark's CSV
# defaults apply. If free-text columns contain embedded newlines or doubled
# quotes, rows may be split incorrectly -- confirm against the file.
anime_df = spark.read.csv(
    "./csv/AnimeList.csv",
    header=True,
    inferSchema=True,
    sep=",",
)

In [None]:
from pyspark.sql.functions import split, regexp_replace, col
from pyspark.sql.types import FloatType

# Columns removed from the DataFrame before any further processing.
unused_columns = [
    "title",
    "title_english",
    "title_japanese",
    "title_synonyms",
    "image_url",
    "background",
    "broadcast",
    "related",
    "opening_theme",
    "ending_theme",
    "studio",
    "premiere",
    "producer",
    "licensor",
    "rank",
]

# Columns that are cast to FloatType below.
numeric_columns = [
    "episodes",
    "score",
    "scored_by",
    "popularity",
    "members",
    "favorites",
]

anime_df = anime_df.drop(*unused_columns)

# Replace null genres with an empty string so the split below always has a
# string to work on.
# NOTE(review): splitting an empty string yields a one-element array holding
# "", so rows without a genre carry an empty token -- confirm this is intended.
anime_df = anime_df.fillna({"genre": ""})

# Strip every space from the genre string, then split it on commas into an
# array of genre tokens.
genre_without_spaces = regexp_replace('genre', ' ', '')
anime_df = anime_df.withColumn("genre", split(genre_without_spaces, ","))

# Cast each numeric column to float, overwriting the column in place.
for col_name in numeric_columns:
    as_float = col(col_name).cast(FloatType())
    anime_df = anime_df.withColumn(col_name, as_float)

In [78]:
from pyspark.ml.feature import CountVectorizer

# Fit a CountVectorizer on the "genre" token arrays, encode every row into
# the "genre_fv" vector column, and print those vectors without truncation.
genre_vectorizer = CountVectorizer(inputCol="genre", outputCol="genre_fv")
genre_model = genre_vectorizer.fit(anime_df)
genre_vectors = genre_model.transform(anime_df).select("genre_fv")
genre_vectors.show(truncate=False)

+-----------------------------------------------+
|genre_fv                                       |
+-----------------------------------------------+
|(181,[0,7,8,13],[1.0,1.0,1.0,1.0])             |
|(181,[0,7,8,10,22],[1.0,1.0,1.0,1.0,1.0])      |
|(181,[0,10,16,19],[1.0,1.0,1.0,1.0])           |
|(181,[0,2,4,8,16],[1.0,1.0,1.0,1.0,1.0])       |
|(181,[0,4,7,8],[1.0,1.0,1.0,1.0])              |
|(181,[6,10,19],[1.0,1.0,1.0])                  |
|(181,[0,8,10,16,19],[1.0,1.0,1.0,1.0,1.0])     |
|(181,[1,2,4,8,10,13],[1.0,1.0,1.0,1.0,1.0,1.0])|
|(181,[0,8,9,11,39],[1.0,1.0,1.0,1.0,1.0])      |
|(181,[0,8,10,19,28],[1.0,1.0,1.0,1.0,1.0])     |
|(181,[0,4,8,40],[1.0,1.0,1.0,1.0])             |
|(181,[0,8,10,19],[1.0,1.0,1.0,1.0])            |
|(181,[0,4,8,40],[1.0,1.0,1.0,1.0])             |
|(181,[0,8,9,10,19],[1.0,1.0,1.0,1.0,1.0])      |
|(181,[0,1,3,7,13,23],[1.0,1.0,1.0,1.0,1.0,1.0])|
|(181,[85,155],[1.0,1.0])                       |
|(181,[4,8,9,10,19],[1.0,1.0,1.0,1.0,1.0])      |
