## Imports


In [1]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
from dotenv import load_dotenv
from os import getenv
from pathlib import Path
from tests import test_mysql_conx, test_pyspark_con

import findspark

findspark.init()

load_dotenv()

False

In [2]:
# MYSQL_CONNECTOR_PATH = "usr/share/java/mysql-connector-java-8.2.0.jar"
MYSQL_CONNECTOR_FILENAME = "mysql-connector-j-8.2.0.jar"
MYSQL_CONNECTOR_PATH = f"./app/mysql_connector/{MYSQL_CONNECTOR_FILENAME}"
Path(MYSQL_CONNECTOR_PATH).exists()

True

Move the connector driver in the proper place in order to be recognizable by pyspark


In [3]:
findspark.add_jars(MYSQL_CONNECTOR_PATH)

In [4]:
CWD: Path = Path("./app/")
EXAMPLE_INPUT_PATH: Path = CWD / Path("./coding_challenge_files/example_input.txt")
TABLE_NAME = "instruments"

Defining mysql credentials


In [18]:
# database connection info
DB_CON_DICT = dict(
    user=getenv("MYSQL_ROOT_USER"),
    password=getenv("MYSQL_ROOT_PASSWORD"),
    host=getenv("HOST"),
    port=int(getenv("MYSQL_DOCKER_PORT")),
    database=getenv("MYSQL_DATABASE"),
)

# Configure MySQL connection properties
MYSQL_PROPERTIES = {
    "driver": "com.mysql.cj.jdbc.Driver",
    "url": "jdbc:mysql://{host}:{port}/{database}".format(**DB_CON_DICT),
    "user": DB_CON_DICT["user"],
    "password": DB_CON_DICT["password"],
}
MYSQL_PROPERTIES

{'driver': 'com.mysql.cj.jdbc.Driver',
 'url': 'jdbc:mysql://db:3306/mydb',
 'user': 'root',
 'password': 'example'}

## Testing Database connection


First we test the connection to our mysql database


In [17]:
# test database connection
test_mysql_conx(**DB_CON_DICT)

Connection Success


## Testing Pyspark installation


Then we test if pyspark is properly installed


In [6]:
test_pyspark_con()

24/01/10 16:40:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

+-----+
|value|
+-----+
|Hello|
|World|
+-----+



## TASK PART ONE

Read time series from the file provided and pass all of them to the "calculation module".

Calculation engine needs to calculate:

1. For INSTRUMENT1 – mean

1. For INSTRUMENT2 – mean for November 2014

1. For INSTRUMENT3 – any other statistical calculation that we can compute "on-the-fly" as we read the file (it's up to you)

1. For any other instrument from the input file - sum of the newest 10 elements (in terms of the date).

### Read time series from the file provided and pass all of them to the "calculation module".

First we read and transform the data to a pyspark dataframe

In [48]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Create a Spark session
spark = (
    SparkSession.builder.appName("Calc_Engine")
    .config("spark.jars", MYSQL_CONNECTOR_PATH)
    .getOrCreate()
)

# Specify the path to the .txt file
txt_file_path: str = f"{EXAMPLE_INPUT_PATH}"

# Define the schema with StringType for DATE initially
schema = StructType(
    [
        StructField(name="INSTRUMENT_NAME", dataType=StringType(), nullable=True),
        StructField(name="DATE", dataType=StringType(), nullable=True),
        StructField(name="VALUE", dataType=DoubleType(), nullable=True),
    ]
)

# Read the .txt file into a PySpark DataFrame
extr = spark.read.option("delimiter", ",").csv(
    txt_file_path, header=False, schema=schema
)

# transform to dataframe
df = extr.toDF("INSTRUMENT_NAME", "DATE", "VALUE")

df.printSchema()

df.show(5)

root
 |-- INSTRUMENT_NAME: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- VALUE: double (nullable = true)

+---------------+-----------+------+
|INSTRUMENT_NAME|       DATE| VALUE|
+---------------+-----------+------+
|    INSTRUMENT1|01-Jan-1996|2.4655|
|    INSTRUMENT1|02-Jan-1996|2.4685|
|    INSTRUMENT1|03-Jan-1996| 2.473|
|    INSTRUMENT1|04-Jan-1996|2.4845|
|    INSTRUMENT1|05-Jan-1996|2.4868|
+---------------+-----------+------+
only showing top 5 rows



24/01/10 17:21:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Then we properly handle the date column as a date datatype

In [49]:
# Convert the DATE column to a DateType using to_date function and an appropriate date format
from pyspark.sql.functions import to_date

date_format_str = "dd-MMM-yyyy"

# Convert the string to a DateType using to_date function
col_date_str = "DATE"
col_transformed_to_date = "DATE"  # "transformed_date"
col_formatted_Date = "DATE"  # "formatted_date"

df = df.withColumn(col_transformed_to_date, to_date(df[col_date_str], date_format_str))

df.printSchema()

df.show(5)

root
 |-- INSTRUMENT_NAME: string (nullable = true)
 |-- DATE: date (nullable = true)
 |-- VALUE: double (nullable = true)

+---------------+----------+------+
|INSTRUMENT_NAME|      DATE| VALUE|
+---------------+----------+------+
|    INSTRUMENT1|1996-01-01|2.4655|
|    INSTRUMENT1|1996-01-02|2.4685|
|    INSTRUMENT1|1996-01-03| 2.473|
|    INSTRUMENT1|1996-01-04|2.4845|
|    INSTRUMENT1|1996-01-05|2.4868|
+---------------+----------+------+
only showing top 5 rows



We sort the Data by Date

In [51]:
from pyspark.sql.functions import col

df = df.orderBy(col("DATE").desc())

df.show(10)

+---------------+----------+-----------+
|INSTRUMENT_NAME|      DATE|      VALUE|
+---------------+----------+-----------+
|    INSTRUMENT1|2014-12-19|   3.475244|
|    INSTRUMENT2|2014-12-19|9.226391955|
|    INSTRUMENT3|2014-12-19|     119.37|
|    INSTRUMENT1|2014-12-18|   3.460937|
|    INSTRUMENT2|2014-12-18|9.223690651|
|    INSTRUMENT3|2014-12-18|    119.275|
|    INSTRUMENT1|2014-12-17|   3.404217|
|    INSTRUMENT2|2014-12-17|9.222419168|
|    INSTRUMENT3|2014-12-17|   117.2525|
|    INSTRUMENT1|2014-12-16|   3.371051|
+---------------+----------+-----------+
only showing top 10 rows

