## Imports


In [1]:
# Standard library
from os import getenv
from pathlib import Path

# Third-party
import findspark
from dotenv import load_dotenv
from IPython.core.interactiveshell import InteractiveShell

# Echo every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Load settings from the .env file into the process environment.
load_dotenv()

# Locate the Spark installation and make pyspark importable from this kernel.
findspark.init()

In [2]:
# Root directory that the notebook's file paths are built from.
CWD: Path = Path("/app/")
# Example input file of the coding challenge, resolved against CWD.
EXAMPLE_INPUT_PATH: Path = CWD.joinpath("./coding_challenge_files/example_input.txt")

## TASK PART TWO: MYSQL DB


### Database Config


First we set up the connector driver path


In [3]:
# Table this part of the task works with.
TABLE_NAME: str = "INSTRUMENT_PRICE_MODIFIER"
# File name of the MySQL JDBC driver jar and the full path it is read from.
MYSQL_CONNECTOR_FILENAME: str = "mysql-connector-j-8.2.0.jar"
MYSQL_CONNECTOR_PATH: str = "/app/mysql_connector/" + MYSQL_CONNECTOR_FILENAME

We check that the connector path is correct

In [4]:
# Confirm the driver jar is present at the configured path; the result is displayed.
connector_present: bool = Path(MYSQL_CONNECTOR_PATH).exists()
connector_present

True

Then we register the connector driver jar with findspark so that pyspark can recognize it


In [5]:
# Register the MySQL connector jar through findspark so that pyspark can pick up the JDBC driver.
findspark.add_jars(MYSQL_CONNECTOR_PATH)

We set up the relevant credentials for the database connection


In [6]:
# database connection info
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set. ``getenv`` would return
            ``None`` in that case, which either crashes later with an opaque
            ``TypeError`` (port) or silently ends up in the connection URL.
    """
    value = getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "check the .env file loaded by load_dotenv()."
        )
    return value


# Keyword arguments for the database connection, all read from the environment.
DB_CON_DICT = dict(
    user=_require_env("MYSQL_ROOT_USER"),
    password=_require_env("MYSQL_ROOT_PASSWORD"),
    host=_require_env("HOST"),
    # The port arrives as text; int() raises ValueError if it is not numeric.
    port=int(_require_env("MYSQL_DOCKER_PORT")),
    database=_require_env("MYSQL_DATABASE"),
)

# NOTE: displaying the dict writes the password into the saved cell output;
# clear or redact the output before sharing the notebook.
DB_CON_DICT

{'user': 'root',
 'password': 'example',
 'host': 'db',
 'port': 3306,
 'database': 'mydb'}

We also set up the pyspark-specific format we need for the database connection


In [7]:
# MySQL connection properties in the form used for the pyspark connection:
# driver class, JDBC URL assembled from DB_CON_DICT, and the login credentials.
MYSQL_PROPERTIES = dict(
    driver="com.mysql.cj.jdbc.Driver",
    url="jdbc:mysql://{host}:{port}/{database}".format_map(DB_CON_DICT),  # type: ignore
    user=DB_CON_DICT["user"],  # type: ignore
    password=DB_CON_DICT["password"],  # type: ignore
)

MYSQL_PROPERTIES

{'driver': 'com.mysql.cj.jdbc.Driver',
 'url': 'jdbc:mysql://db:3306/mydb',
 'user': 'root',
 'password': 'example'}

We test the database connection


In [8]:
from tests import test_mysql_conx

In [9]:
# Test the database connection by passing the DB_CON_DICT entries as keyword arguments
# to the helper from the local tests module; the recorded run printed "Connection Success".
test_mysql_conx(**DB_CON_DICT) # type: ignore

Connection Success


We create a test table with some sample values for the pyspark database test

In [10]:
from tests import table_preparation

In [11]:
# Helper from the local tests module; per the recorded output it drops, recreates
# and fills 'test_table' with sample data.
table_preparation()

Table 'test_table' deleted successfully.
Table 'test_table' created successfully.
Sample data inserted successfully.


We test the pyspark session against this test_table

In [12]:
from tests import test_pyspark_db_conx

In [13]:
# Helper from the local tests module; per the recorded output it prints the MySQL
# properties and the driver path check, then shows the rows of the test table.
# NOTE(review): the printed properties include the database password in plain text —
# consider redacting the output before sharing.
test_pyspark_db_conx()

MYSQL_PROPERTIES={'driver': 'com.mysql.cj.jdbc.Driver', 'url': 'jdbc:mysql://db:3306/mydb', 'user': 'root', 'password': 'example'}
MYSQL driver path existence: True


24/01/11 12:55:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


+---+-----+
| id| name|
+---+-----+
|  1| John|
|  2|Alice|
|  3|  Bob|
+---+-----+



We clean up the test_table

In [14]:
from tests import drop_table

In [15]:
# Helper from the local tests module; the recorded output reports 'test_table' as deleted.
drop_table()

Table 'test_table' deleted successfully.


### `So as part of your task we would like you to set up a database with only one table, called INSTRUMENT_PRICE_MODIFIER with the following columns:`

- ID (primary key)
- NAME (instrument name as read from the input file)
- MULTIPLIER - double value
