In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from dotenv import load_dotenv
import datetime

In [2]:
# Load variables from a .env file into the process environment; the connection
# settings are read back with os.getenv() in a later cell.
load_dotenv()
# File name of the PostgreSQL JDBC driver jar. It is a relative path, so the jar
# is presumably expected in the kernel's working directory — TODO confirm.
jdbc_driver_path = "postgresql-42.7.4.jar"

In [3]:
# Create (or reuse) the notebook's Spark session. The JDBC driver jar is placed
# on the driver classpath, and Arrow is enabled for Spark <-> pandas conversion.
spark = (
    SparkSession.builder
    .appName("Solution")
    .config("spark.driver.extraClassPath", jdbc_driver_path)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

In [4]:
# --- PostgreSQL connection settings -----------------------------------------
# All four values come from the process environment (populated from .env by
# load_dotenv() in an earlier cell). os.getenv() returns None for an unset
# variable, which would otherwise be silently formatted into the URL as the
# literal text "None" and only fail later inside the first JDBC read. Check up
# front and report every missing name at once.
_REQUIRED_ENV_VARS = ("HOST", "DB_NAME", "USER", "PASSWORD")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the environment or in the .env file."
    )

jdbcHostname = os.getenv("HOST")
jdbcDatabase = os.getenv("DB_NAME")
# NOTE(review): USER is normally already set by the operating system on
# Unix-like machines, and load_dotenv() does not override existing variables
# by default, so a USER entry in .env is likely ignored there — confirm the
# intended database user is actually the one picked up.
jdbcUsername = os.getenv("USER")
jdbcPassword = os.getenv("PASSWORD")
jdbcPort = 5432  # PostgreSQL's default port
jdbcDriver = "org.postgresql.Driver"

# Properties handed to spark.read.jdbc(); the keys are JDBC connection properties.
connProperties = {
    "user": jdbcUsername,
    "password": jdbcPassword,
    "driver": jdbcDriver,
}

jdbcUrl = f"jdbc:postgresql://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}"


In [5]:
def read_table(table_name):
    """Load one PostgreSQL table into a Spark DataFrame over JDBC.

    The bare table name is passed as ``table``. Spark places that value
    verbatim in the FROM clause of the queries it generates, so an unaliased
    ``(SELECT * FROM t)`` becomes a derived table without an alias, which
    PostgreSQL releases before 16 reject ("subquery in FROM must have an
    alias"). Reading the table by name returns the same rows and columns and
    works on every server version.

    Args:
        table_name: Name of the table to read, resolved by the database as it
            would be in a plain ``SELECT * FROM <table_name>``.

    Returns:
        A DataFrame backed by the JDBC source, using the notebook-level
        ``jdbcUrl`` and ``connProperties``.
    """
    return spark.read.jdbc(url=jdbcUrl, table=table_name, properties=connProperties)


# One DataFrame per source table; the variable names are used by the cells below.
work_df = read_table("work")
product_size_df = read_table("product_size")
subject_df = read_table("subject")
artist_df = read_table("artist")
canvas_size_df = read_table("canvas_size")
image_link_df = read_table("image_link")
museum_hours_df = read_table("museum_hours")
museum_df = read_table("museum")

### 1. Fetch all the paintings that are not displayed in any museum

In [6]:
# Paintings with no museum assigned: rows of `work` whose museum_id is NULL.
not_displayed_paintings = work_df.where("museum_id IS NULL")
# Print the total first, then preview the first rows.
print(not_displayed_paintings.count())
not_displayed_paintings.show()

10223
+-------+--------------------+---------+-------+---------+
|work_id|                name|artist_id|  style|museum_id|
+-------+--------------------+---------+-------+---------+
| 125752|Arabian Horses at...|      757|Baroque|     NULL|
| 125818|Count Halm on His...|      757|Baroque|     NULL|
| 125763|Napoleon Before t...|      757|Baroque|     NULL|
| 125774|Peasants Resting ...|      757|Baroque|     NULL|
| 125785|Portrait Oberleut...|      757|Baroque|     NULL|
| 125796|The Rescue of Cou...|      757|Baroque|     NULL|
| 125807|     The Stable Yard|      757|Baroque|     NULL|
|  24532|Jacob A. Stamler ...|      563|   NULL|     NULL|
| 124470| Kaleda off Le Havre|      563|   NULL|     NULL|
| 124479|R. Bell &amp; Co....|      563|   NULL|     NULL|
| 124488|Steam Sailing Shi...|      563|   NULL|     NULL|
| 124497|The American Ship...|      563|   NULL|     NULL|
| 124506|The Atalanta Runn...|      563|   NULL|     NULL|
| 124515|The Auxiliary Ste...|      563|   NULL|  

### 2.  Are there museums without any paintings?

In [7]:
# Left-anti join: keep only the museums whose museum_id matches no row in
# `work`; the result carries the museum columns only.
museum_without_paintings = museum_df.join(work_df, on="museum_id", how="left_anti")
# Print the total first, then preview the rows (if any).
print(museum_without_paintings.count())
museum_without_paintings.show()

0
+---------+----+-------+----+-----+------+-------+-----+---+
|museum_id|name|address|city|state|postal|country|phone|url|
+---------+----+-------+----+-----+------+-------+-----+---+
+---------+----+-------+----+-----+------+-------+-----+---+

