# Import libraries

## PYSPARK

In [1]:
import os
import sys

# Tell PySpark which interpreter to launch for its Python workers.
# Use the interpreter running this kernel rather than a bare "python":
# a bare name is resolved through PATH and may pick a different installation
# (or none) when the kernel runs inside a venv/conda environment, which leads
# to driver/worker Python version mismatches.
# Fall back to "python" only if the running interpreter's path is unknown.
os.environ["PYSPARK_PYTHON"] = sys.executable or "python"

In [2]:
from pyspark.sql import SparkSession

# Obtain a Spark session (reusing an already-running one if present) whose
# master is "local[*]", i.e. Spark runs inside this machine using all cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

## SQLITE3

In [3]:
import sqlite3

# Load databases

## CSV

In [4]:
# Read the anime CSV into a DataFrame. The first line is treated as the
# header, and a double quote is used as the escape character inside quoted
# fields.
csv_anime = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("escape", "\"")
    .load("../csv/anime_cleaned.csv")
)

## SQLITE

In [5]:
# Open the SQLite database file and create one cursor from that connection;
# the SQLITE benchmark cells run their queries through `sqlite_cur`.
sqlite_conn = sqlite3.connect("../prisma/dev.db")
sqlite_cur = sqlite_conn.cursor()

# Benchmarks

## Count the number of genres

### SQLITE

In [9]:
# The aggregate query yields exactly one row with one column: unpack it directly.
(n_genres,) = sqlite_cur.execute("SELECT COUNT(id) FROM genre").fetchone()
print(f"There are {n_genres} genres in the database.")

There are 43 genres in the database.


### CSV

In [10]:
# Count the distinct genre names found in the CSV.
# Each row's "genre" value is split on ", " into individual names; rows with
# no genre value are skipped before splitting.
n_genres = (
    csv_anime.rdd
    .map(lambda anime: anime["genre"])
    .filter(lambda genre_list: genre_list is not None)
    .flatMap(lambda genre_list: genre_list.split(", "))
    .distinct()
    .count()
)
print(f"There are {n_genres} genres in the database.")

There are 43 genres in the database.


## Count the number of studios

### SQLITE

In [11]:
# The aggregate query yields exactly one row with one column: unpack it directly.
(n_studios,) = sqlite_cur.execute("SELECT COUNT(id) FROM studio").fetchone()
print(f"There are {n_studios} studios in the database.")

There are 475 studios in the database.


### CSV

In [12]:
# Count the distinct studio names found in the CSV.
# Each row's "studio" value is split on ", " into individual names; rows with
# no studio value are skipped before splitting.
n_studios = (
    csv_anime.rdd
    .map(lambda anime: anime["studio"])
    .filter(lambda studio_list: studio_list is not None)
    .flatMap(lambda studio_list: studio_list.split(", "))
    .distinct()
    .count()
)
print(f"There are {n_studios} studios in the database.")

There are 475 studios in the database.
