## Imports


In [1]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
from dotenv import load_dotenv
from os import getenv
from pathlib import Path
from tests import test_mysql_conx, test_pyspark_con

import findspark
findspark.init()

load_dotenv()

False

In [2]:
# MYSQL_CONNECTOR_PATH = "usr/share/java/mysql-connector-java-8.2.0.jar"
MYSQL_CONNECTOR_FILENAME = "mysql-connector-j-8.2.0.jar"
MYSQL_CONNECTOR_PATH = f"./app/mysql_connector/{MYSQL_CONNECTOR_FILENAME}"
Path(MYSQL_CONNECTOR_PATH).exists()

True

Move the connector driver in the proper place in order to be recognizable by pyspark

In [3]:
findspark.add_jars(MYSQL_CONNECTOR_PATH)

In [4]:
CWD: Path = Path("./app/")
EXAMPLE_INPUT_PATH: Path = CWD / Path("./coding_challenge_files/example_input.txt")
TABLE_NAME = "instruments"

## Testing Database connection


First we test the connection to our mysql database


In [5]:
# database connection info
DB_CON_DICT = dict(
    user=getenv("MYSQL_ROOT_USER"),
    password=getenv("MYSQL_ROOT_PASSWORD"),
    host=getenv("HOST"),
    port=int(getenv("MYSQL_DOCKER_PORT")),
    database=getenv("MYSQL_DATABASE"),
)

# test database connection
test_mysql_conx(**DB_CON_DICT)

Connection Success


## Testing Pyspark installation


Then we test if pyspark is properly installed


In [6]:
test_pyspark_con()

24/01/10 16:40:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

+-----+
|value|
+-----+
|Hello|
|World|
+-----+



## Importing example txt data into mysql


Then we will import the example_input.txt file as a table in our database in order to simulate a real world scenario.


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Create a Spark session
spark = (
    SparkSession.builder.appName("TxtToMySQL")
    .config("spark.jars", MYSQL_CONNECTOR_PATH)
    .getOrCreate()
)

# Specify the path to your .txt file
txt_file_path: str = f"{EXAMPLE_INPUT_PATH}"

# Define the schema with StringType for DATE initially
schema = StructType(
    [
        StructField(name="INSTRUMENT_NAME", dataType=StringType(), nullable=True),
        StructField(name="DATE", dataType=StringType(), nullable=True),
        StructField(name="VALUE", dataType=DoubleType(), nullable=True),
    ]
)

In [8]:
# Read the .txt file into a PySpark DataFrame
extr = spark.read.option("delimiter", ",").csv(
    txt_file_path, header=False, schema=schema
)

# transform to dataframe
df = extr.toDF("INSTRUMENT_NAME", "DATE", "VALUE")

In [9]:
df.printSchema()

df.show(5)

root
 |-- INSTRUMENT_NAME: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- VALUE: double (nullable = true)

+---------------+-----------+------+
|INSTRUMENT_NAME|       DATE| VALUE|
+---------------+-----------+------+
|    INSTRUMENT1|01-Jan-1996|2.4655|
|    INSTRUMENT1|02-Jan-1996|2.4685|
|    INSTRUMENT1|03-Jan-1996| 2.473|
|    INSTRUMENT1|04-Jan-1996|2.4845|
|    INSTRUMENT1|05-Jan-1996|2.4868|
+---------------+-----------+------+
only showing top 5 rows



We need to properly handle the Date type from string to actual Date


In [10]:
# Convert the DATE column to a DateType using to_date function and an appropriate date format
from pyspark.sql.functions import to_date
date_format_str = "dd-MMM-yyyy"

# Convert the string to a DateType using to_date function
col_date_str = "DATE"
col_transformed_to_date = "DATE"  # "transformed_date"
col_formatted_Date = "DATE"  # "formatted_date"

df = df.withColumn(col_transformed_to_date, to_date(df[col_date_str], date_format_str))

df.printSchema()

df.show(5)

root
 |-- INSTRUMENT_NAME: string (nullable = true)
 |-- DATE: date (nullable = true)
 |-- VALUE: double (nullable = true)

+---------------+----------+------+
|INSTRUMENT_NAME|      DATE| VALUE|
+---------------+----------+------+
|    INSTRUMENT1|1996-01-01|2.4655|
|    INSTRUMENT1|1996-01-02|2.4685|
|    INSTRUMENT1|1996-01-03| 2.473|
|    INSTRUMENT1|1996-01-04|2.4845|
|    INSTRUMENT1|1996-01-05|2.4868|
+---------------+----------+------+
only showing top 5 rows



Save the df into an sql table


In [11]:
# Configure MySQL connection properties
mysql_properties = {
    "driver": "com.mysql.cj.jdbc.Driver",
    "url": "jdbc:mysql://{host}:{port}/{database}".format(**DB_CON_DICT),
    "user": DB_CON_DICT["user"],
    "password": DB_CON_DICT["password"],
}
mysql_properties

{'driver': 'com.mysql.cj.jdbc.Driver',
 'url': 'jdbc:mysql://db:3306/mydb',
 'user': 'root',
 'password': 'example'}

In [12]:
# Write the DataFrame to MySQL
df.write.jdbc(
    url=mysql_properties["url"],
    table=TABLE_NAME,
    mode="overwrite",  # or "append" if needed
    properties=mysql_properties,
)

                                                                                

In [20]:
# Stop the Spark session
spark.stop()