## Imports

In [1]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
from dotenv import load_dotenv
from os import getenv
from pathlib import Path
from tests import test_mysql_conx, test_pyspark_con
load_dotenv()

False

In [2]:
CWD: Path = Path("./app/")
EXAMPLE_INPUT_PATH: Path = CWD / Path('./coding_challenge_files/example_input.txt')
TABLE_NAME = 'instruments'

## Testing Database connection

First we test the connection to our mysql database

In [3]:
# database connection info
DB_CON_DICT = dict(
    user=getenv("MYSQL_ROOT_USER"),
    password=getenv("MYSQL_ROOT_PASSWORD"),
    host=getenv("HOST"),
    port=int(getenv("MYSQL_DOCKER_PORT")),
    database=getenv("MYSQL_DATABASE")    
)

# test database connection
test_mysql_conx(**DB_CON_DICT)

Connection Success


## Testing Pyspark installation

Then we test if pyspark is properly installed

In [4]:
test_pyspark_con()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/10 13:08:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-----+
|value|
+-----+
|Hello|
|World|
+-----+



## Importing example txt data into mysql

Then we will import the example_input.txt file as a table in our database in order to simulate a real world scenario.

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType, DoubleType, DateType)
# Create a Spark session
spark = SparkSession.builder.appName("TxtToMySQL").getOrCreate()

# Specify the path to your .txt file
txt_file_path: str = f'{EXAMPLE_INPUT_PATH}'

# Define the schema with StringType for DATE initially
schema = StructType([
    StructField(name="INSTRUMENT_NAME", dataType=StringType(), nullable=True),
    StructField(name="DATE", dataType=StringType(), nullable=True),
    StructField(name="VALUE", dataType=DoubleType(), nullable=True),
])

In [34]:
# Read the .txt file into a PySpark DataFrame
extr = spark.read.option("delimiter", ",").csv(txt_file_path, header=False, schema=schema)

In [35]:
df = extr.toDF("INSTRUMENT_NAME", "DATE", "VALUE")

In [36]:
df.show()

+---------------+-----------+------+
|INSTRUMENT_NAME|       DATE| VALUE|
+---------------+-----------+------+
|    INSTRUMENT1|01-Jan-1996|2.4655|
|    INSTRUMENT1|02-Jan-1996|2.4685|
|    INSTRUMENT1|03-Jan-1996| 2.473|
|    INSTRUMENT1|04-Jan-1996|2.4845|
|    INSTRUMENT1|05-Jan-1996|2.4868|
|    INSTRUMENT1|08-Jan-1996|2.4825|
|    INSTRUMENT1|09-Jan-1996| 2.487|
|    INSTRUMENT1|10-Jan-1996|2.4865|
|    INSTRUMENT1|11-Jan-1996|2.4845|
|    INSTRUMENT1|21-Feb-1996| 2.537|
|    INSTRUMENT1|22-Feb-1996|  2.54|
|    INSTRUMENT1|23-Feb-1996| 2.549|
|    INSTRUMENT1|26-Feb-1996| 2.538|
|    INSTRUMENT1|27-Feb-1996| 2.543|
|    INSTRUMENT1|28-Feb-1996| 2.547|
|    INSTRUMENT1|29-Feb-1996|2.5565|
|    INSTRUMENT1|01-Mar-1996|2.5685|
|    INSTRUMENT1|04-Mar-1996|2.5667|
|    INSTRUMENT1|05-Mar-1996|2.5695|
|    INSTRUMENT1|06-Mar-1996| 2.569|
+---------------+-----------+------+
only showing top 20 rows



In [37]:
df.printSchema()

root
 |-- INSTRUMENT_NAME: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- VALUE: double (nullable = true)



We need to properly handle the Date type from string to actual Date


In [38]:
# Convert the DATE column to a DateType using to_date function and an appropriate date format
from pyspark.sql.functions import to_date


date_format = "%d-%b-%Y"
df = df.withColumn("DATE", to_date(df["DATE"], date_format))

Now there is the proper Date type

In [41]:
df.show()

IllegalArgumentException: All week-based patterns are unsupported since Spark 3.0, detected: Y, Please use the SQL function EXTRACT instead

In [39]:
df.printSchema()

root
 |-- INSTRUMENT_NAME: string (nullable = true)
 |-- DATE: date (nullable = true)
 |-- VALUE: double (nullable = true)



In [6]:


# Configure MySQL connection properties
mysql_properties = {
    "driver": "com.mysql.cj.jdbc.Driver",
    "url": "jdbc:mysql://{host}:{port}/{database}".format(**DB_CON_DICT), # type: ignore
    "user": DB_CON_DICT['user'],
    "password": DB_CON_DICT['password'],
}

# Write the DataFrame to MySQL
df.write.jdbc(url=mysql_properties["url"],
              table=TABLE_NAME,
              mode="overwrite",  # or "append" if needed
              properties=mysql_properties)

# Stop the Spark session
spark.stop()

IllegalArgumentException: requirement failed: The number of columns doesn't match.
Old column names (1): _c0
New column names (3): INSTRUMENT_NAME, DATE, VALUE