In [None]:
# Name of the triaxial file to be processed.
# This file should exist in the folder lakehouse/default/Files/Triaxials.
triaxial_filename = "SRK-169-TX ConsolData.xlsx" # default

# Name of the triaxial sample to process. A later cell matches this value
# against the `sheet_name` column of the bronze table.
triaxial_sample_name = "SRK-169-3095-Rem-CU"     # default

In [51]:
# Helper functions used throughout the notebook
from uuid import uuid4
import pandas as pd


def _extract_test_information(nombre_ensayo):
    """
    SRK-169-3107-Undis-CU:

        - **SRK-169-3107** es el código que identifica al **ensayo**

        - **Undis** significa que es una muestra "indisturbada" ("Undisturbed" perhaps?)
        - **Rem** es otra posibilidad y significa "remoldeada" ("Remodeled" perhaps?)
            - Tanto "Undis" como "Rem" pertenecen al **estado de la muestra** (`Sample` table, `State` column)

        - **CU** es el tipo de ensayo, significa "consolidado no drenado" ("Consolidated Un-drained" perhaps?)
        - **CD** es el otro tipo, supongo que significa "consolidado drenado" ("Consolidated Drained" perhaps?)
            - Pertenecen a la columna `consolidation_tupe` de la tabla `triaxial`
    """
    partes = nombre_ensayo.split("-")

    code = "-".join(partes[:3])
    consolation_tupe = str(partes[3])
    sample_state = str(partes[4])

    return code, consolation_tupe, sample_state


def extract_ensayo(ensayo_nombre, fuente_id):
    """
    Build the single-row record describing a test (ensayo).

    Args:
        ensayo_nombre: Sample name, parsed by `_extract_test_information`.
        fuente_id: Id of the source file this test belongs to.

    Returns:
        Tuple ``(ensayo_id, dataframe)`` where ``ensayo_id`` is a freshly
        generated hex UUID and ``dataframe`` is a one-row pandas frame with
        the columns `id`, `code`, `consolidation_tupe`, `sample_state` and
        `fuente_id`.
    """
    new_ensayo_id = uuid4().hex
    test_code, consolidation, state = _extract_test_information(ensayo_nombre)

    # One list per column, each holding the value of the single row.
    record = {
        "id": [new_ensayo_id],
        "code": [test_code],
        "consolidation_tupe": [consolidation],
        "sample_state": [state],
        "fuente_id": [fuente_id],
    }

    return new_ensayo_id, pd.DataFrame(record)


def extract_fuente(file_name):
    """
    Build the single-row record describing a source file (fuente).

    Args:
        file_name: Name of the source file.

    Returns:
        Tuple ``(fuente_id, dataframe)`` where ``fuente_id`` is a freshly
        generated hex UUID and ``dataframe`` is a one-row pandas frame with
        the columns `id` and `name`.
    """
    new_fuente_id = uuid4().hex
    record = {"id": [new_fuente_id], "name": [file_name]}

    return new_fuente_id, pd.DataFrame(record)


def _extract_single_test(df, test_number, ensayo_id):
    pwp = f"pwp_{test_number}"
    time = f"time_{test_number}"
    pore = f"pore_vol_{test_number}"

    return (
        df[[time, pore, pwp]]
        .rename(columns={time: "time", pore: "pore_vol", pwp: "pwp"})
        .dropna()
        .assign(test_number=test_number, ensayo_id=ensayo_id)
    )


def extract_test_data(full_data, ensayo_id, test_numbers=(1, 2, 3)):
    """
    Return the measurements of all requested tests as ONE long dataframe.

    Each test number is extracted with `_extract_single_test` and the
    per-test frames are stacked in the order given, with a fresh 0..n-1 index.

    Args:
        full_data: Raw pandas frame containing the numbered test columns.
        ensayo_id: Id of the test (ensayo) the measurements belong to.
        test_numbers: Test numbers to extract. Defaults to ``(1, 2, 3)``.

    Returns:
        A single pandas frame with the rows of every requested test.

    Raises:
        ValueError: If ``test_numbers`` is empty (raised by ``pd.concat``).
        KeyError: If the columns of a requested test number are missing.
    """
    per_test_frames = [
        _extract_single_test(full_data, number, ensayo_id)
        for number in test_numbers
    ]

    return pd.concat(per_test_frames, ignore_index=True)


def init_tables(spark_session):
    """
    Create the Delta tables used by this notebook if they do not exist yet.

    The tables `fuentes`, `ensayos` and `triaxial_tests` are created, in that
    order, through ``DeltaTable.createIfNotExists``.

    Args:
        spark_session: Spark session used to create the tables.
    """
    from delta.tables import DeltaTable
    from pyspark.sql.types import StringType, LongType, DoubleType

    # Table name -> ordered (column name, Spark data type) pairs.
    table_schemas = {
        "fuentes": [
            ("id", StringType()),
            ("name", StringType()),
        ],
        "ensayos": [
            ("id", StringType()),
            ("code", StringType()),
            ("sample_state", StringType()),
            ("consolidation_tupe", StringType()),
            ("fuente_id", StringType()),
        ],
        "triaxial_tests": [
            ("id", StringType()),
            ("test_number", LongType()),
            ("time", DoubleType()),
            ("pore_vol", DoubleType()),
            ("pwp", DoubleType()),
            ("ensayo_id", StringType()),
        ],
    }

    for table_name, columns in table_schemas.items():
        builder = DeltaTable.createIfNotExists(spark_session).tableName(table_name)
        for column_name, column_type in columns:
            builder = builder.addColumn(column_name, dataType=column_type)
        builder.execute()


StatementMeta(, ee209495-229b-48b1-897a-d304ab449f71, 59, Finished, Available, Finished)

In [52]:
# Load the raw triaxial data from the bronze layer into a pandas dataframe

FILENAME, ENSAYO = triaxial_filename, triaxial_sample_name

spark_df = spark.read.table("bronze_shortcut.data_triaxials")
# Keep only the rows whose sheet name equals the requested sample name
FULL_DF = spark_df.filter(spark_df["sheet_name"] == ENSAYO).toPandas()


StatementMeta(, ee209495-229b-48b1-897a-d304ab449f71, 60, Finished, Available, Finished)

In [53]:
# Split the raw data into the source, test (ensayo) and measurement frames

# One-row frame describing the source file, plus its newly generated id
fuente_id, fuente_df = extract_fuente(FILENAME)
# One-row frame describing the test, linked to the source through fuente_id
ensayo_id, ensayo_df = extract_ensayo(ENSAYO, fuente_id)
# Long frame with the measurements of every numbered test, linked through ensayo_id
all_tests = extract_test_data(FULL_DF, ensayo_id)


StatementMeta(, ee209495-229b-48b1-897a-d304ab449f71, 61, Finished, Available, Finished)

In [54]:
# Save the results as tables in the lakehouse
from delta.tables import DeltaTable

# Make sure the target Delta tables exist, then get handles to them
init_tables(spark)
fuentes_old = DeltaTable.forName(spark, "fuentes")
ensayos_old = DeltaTable.forName(spark, "ensayos")
triaxials_old = DeltaTable.forName(spark, "triaxial_tests")

# Convert the pandas dataframes to Spark dataframes
spark_fuente = spark.createDataFrame(fuente_df)
spark_triaxales = spark.createDataFrame(all_tests)

# Insert the source file only if no source with the same name exists yet
fuentes_old.alias("old").merge(
    spark_fuente.alias("new"),
    "old.name = new.name"
).whenNotMatchedInsertAll().execute()

# If the source already existed, the id generated in this run was NOT stored.
# Look up the id that is actually persisted so the test (ensayo) row never
# points to a source id that is missing from the `fuentes` table.
source_name = fuente_df["name"].iloc[0]
fuentes_saved = spark.read.table("fuentes")
fuente_row = (
    fuentes_saved
    .where(fuentes_saved["name"] == source_name)
    .select("id")
    .first()
)
if fuente_row is None:
    raise RuntimeError(f"Source {source_name!r} was not found in table 'fuentes' after the merge")
spark_ensayo = spark.createDataFrame(ensayo_df.assign(fuente_id=fuente_row["id"]))

# Insert the test only if the same code / state / consolidation type is not stored yet
ensayos_old.alias("old").merge(
    spark_ensayo.alias("new"),
    """
    old.code = new.code
    AND
    old.sample_state = new.sample_state
    AND
    old.consolidation_tupe = new.consolidation_tupe
    """
).whenNotMatchedInsertAll().execute()

# The measurements carry the ensayo id generated in this run. That id is in the
# persisted `ensayos` table only when the merge above inserted the test, so a
# semi join against the TABLE (not the in-memory row) keeps the measurements
# exactly when they are new and drops them when the test was already stored.
ensayos_saved = spark.read.table("ensayos")
filtered_triaxials = spark_triaxales.join(
    ensayos_saved,
    spark_triaxales["ensayo_id"] == ensayos_saved["id"],
    "left_semi",
)
filtered_triaxials.write.mode("append").saveAsTable("triaxial_tests")


StatementMeta(, ee209495-229b-48b1-897a-d304ab449f71, 62, Finished, Available, Finished)

In [58]:
%%sql

-- Preview of the stored data: source file, test (ensayo) and the measurements
-- of test number 1. Only tests that also have data for test numbers 2 and 3
-- are listed; EXISTS is used for that check so the rows of test 1 are not
-- repeated once per combination of test 2 and test 3 rows.
SELECT src.name AS archivo, en.code AS cod_ensayo,
       en.consolidation_tupe, en.sample_state,
       t1.time AS time_1, t1.pore_vol AS pore_vol_1, t1.pwp AS pwp_1
FROM dbo.fuentes AS src
INNER JOIN dbo.ensayos AS en ON en.fuente_id = src.id
INNER JOIN dbo.triaxial_tests AS t1 ON t1.ensayo_id = en.id AND t1.test_number = 1
WHERE EXISTS (
          SELECT 1 FROM dbo.triaxial_tests AS t2
          WHERE t2.ensayo_id = en.id AND t2.test_number = 2
      )
  AND EXISTS (
          SELECT 1 FROM dbo.triaxial_tests AS t3
          WHERE t3.ensayo_id = en.id AND t3.test_number = 3
      )
LIMIT 10;


StatementMeta(, ee209495-229b-48b1-897a-d304ab449f71, 69, Finished, Available, Finished)

<Spark SQL result set with 10 rows and 7 fields>

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 1 fields>

In [None]:
import json

# Hand the file name and sample name back as a JSON string so that another
# item (notebook), if any, can use them
params = dict(
    triaxial_filename=triaxial_filename,
    triaxial_sample_name=triaxial_sample_name,
)
result = json.dumps(params)

mssparkutils.notebook.exit(result)