In [1]:
# 1. Definir el esquema de las tablas de datos crudos de los triaxiales

from pyspark.sql.types import DoubleType, StructType, StructField, StringType, IntegerType


# Indice de datos triaxiales
triaxial_index_schema = StructType([
    StructField("Code", IntegerType(), nullable=False),
    StructField("Specimen", IntegerType(), nullable=False),
    StructField("Test", StringType(), nullable=False),
    StructField("Material_group", StringType(), nullable=False),
    StructField("Test_pit", StringType(), nullable=False),
    StructField("Deph", StringType(), nullable=False),
    StructField("CD_CU", StringType(), nullable=False),
    StructField("Sample_type", StringType(), nullable=False),
    StructField("Confinment", IntegerType(), nullable=True),
    StructField("Sheetname", StringType(), nullable=False),
    StructField("First_row", IntegerType(), nullable=True),
    StructField("Last_row", IntegerType(), nullable=True),

    StructField("Index_filename", StringType(), nullable=False),
    StructField("Data_filename", StringType(), nullable=False)
])

# Datos del ensayo triaxial
triaxial_data_schema = StructType([
    StructField("Deflection_mm", DoubleType(), nullable=False),
    StructField("Loadcell_N", DoubleType(), nullable=False),
    StructField("PWP_kPa", DoubleType(), nullable=False),
    StructField("Load_N", DoubleType(), nullable=False),
    StructField("Deflection_cm", DoubleType(), nullable=False),
    StructField("Axial_strain_%", DoubleType(), nullable=False),
    StructField("Corrected_area", DoubleType(), nullable=False),
    StructField("Deviator_stress", DoubleType(), nullable=False),
    StructField("Filter_correction", DoubleType(), nullable=False),
    StructField("Membrane_correction", DoubleType(), nullable=False),
    StructField("Corrected_dev_str", DoubleType(), nullable=False),
    StructField("PWP", DoubleType(), nullable=False),
    StructField("Cell_pressure", DoubleType(), nullable=False),
    StructField("delta_PWP", DoubleType(), nullable=False),

    StructField("Data_sheetname", StringType(), nullable=False),
    StructField("Data_filename", StringType(), nullable=False)
])


StatementMeta(, f2dad900-6af7-4b88-83e7-957ba259e6f6, 3, Finished, Available, Finished)

In [29]:
# Collección de funciones utiles para el notebook
from pathlib import Path

import pandas as pd


def cargar_indice_triaxial(index_file_path, data_file_path):
    """Lee el archivo indice de datos triaxiales y lo carga en un pandas' dataframe.
    Como el archivo de indice es tan sencillo y estandar las columnas ya están cargadas"""
    df = pd.read_excel(index_file_path)

    # Agregamos el nombre de los archivos al dataframe
    df["Index_filename"] = Path(index_file_path).name
    df["Data_filename"] = Path(data_file_path).name

    return df


def cargar_datos_triaxiales(file_path):
    """Lee el archivo con los datos triaxiales y devuelve como un solo pandas' dataframe
    donde cada hoja esta distiguida por un columna extra.
    Posee otra columna extra con el nombre del archivo original"""

    # Lee las hojas dentro del excel
    excel_file = pd.ExcelFile(file_path)

    # Los nombres de las columnas del archivo, modificadas para matchear con las del esquema
    cols = [
       "Deflection_mm", "Loadcell_N", "PWP_kPa", "Load_N", "Deflection_cm", "Axial_strain_%",
       "Corrected_area", "Deviator_stress", "Filter_correction", "Membrane_correction", 
       "Corrected_dev_str", "PWP", "Cell_pressure", "delta_PWP"
    ]

    # Lee todas las hojas y las carga en un dict indexado por el nombre de la hoja (sheetname)
    datos_por_hoja = {
        hoja: pd.read_excel(file_path, sheet_name=hoja, skiprows=1, names=cols)
        for hoja in excel_file.sheet_names
    }

    # Agrega a cada dataframe la columna con el nombre de la hoja a la que pertencen los datos
    # y la columna con el nombre del archivo donde pertenecen
    nombre_arch = Path(file_path).name
    for hoja, df in datos_por_hoja.items():
        df["Data_sheetname"] = hoja
        df["Data_filename"] = nombre_arch
        df.reset_index(inplace=True)
        print(f"Columns -> {df.columns}")
    
    # Une a todos los dataframes en uno solo
    ldf = [d for d in datos_por_hoja.values()]
    return pd.concat(ldf, axis=0, ignore_index=True)


# FIXME: Como manejamos las columnas con resultados extras? y las tablas multiples?

print("==>")
data_df = cargar_datos_triaxiales("/lakehouse/default/Files/Triaxials/SRK-169-TX-ExcelData.xlsx")
print(data_df.head())
print("<==")


StatementMeta(, f2dad900-6af7-4b88-83e7-957ba259e6f6, 31, Finished, Available, Finished)

Columns -> Index(['level_0', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5',
       'level_6', 'level_7', 'level_8', 'Deflection_mm', 'Loadcell_N',
       'PWP_kPa', 'Load_N', 'Deflection_cm', 'Axial_strain_%',
       'Corrected_area', 'Deviator_stress', 'Filter_correction',
       'Membrane_correction', 'Corrected_dev_str', 'PWP', 'Cell_pressure',
       'delta_PWP', 'Data_sheetname', 'Data_filename'],
      dtype='object')
Columns -> Index(['level_0', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5',
       'level_6', 'Deflection_mm', 'Loadcell_N', 'PWP_kPa', 'Load_N',
       'Deflection_cm', 'Axial_strain_%', 'Corrected_area', 'Deviator_stress',
       'Filter_correction', 'Membrane_correction', 'Corrected_dev_str', 'PWP',
       'Cell_pressure', 'delta_PWP', 'Data_sheetname', 'Data_filename'],
      dtype='object')
Columns -> Index(['index', 'Deflection_mm', 'Loadcell_N', 'PWP_kPa', 'Load_N',
       'Deflection_cm', 'Axial_strain_%', 'Corrected_area', 'Deviator_stress',

In [25]:
# 2. Cargar datos en dataframes

TRIAXIAL_INDEX_PATH = "/lakehouse/default/Files/Triaxials/Sampleslist using lab numbers .xlsx"
TRIAXIAL_DATA_PATH = "/lakehouse/default/Files/Triaxials/SRK-169-TX-ExcelData.xlsx"

index_df = cargar_indice_triaxial(TRIAXIAL_INDEX_PATH, TRIAXIAL_DATA_PATH)
data_df = cargar_datos_triaxiales(TRIAXIAL_DATA_PATH)


# 3. Guardar los datos en tablas

index_spark = spark.createDataFrame(index_df, schema=triaxial_index_schema)
data_spark = spark.createDataFrame(data_df, schema=triaxial_data_schema)

(
    index_spark
    .write.format("delta").mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("triaxial_index")
)

(
    data_spark
    .write.format("delta").mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("triaxial_data")
)


# TODO: Otra forma posible, pero necesita mas refinamiento
#
#from delta.tables import DeltaTable
#
#datos_nuevos = spark.createDataFrame(TABLE_RESULT)
#datos_actuales = DeltaTable.forName(spark, "data_triaxials")
#
#datos_actuales.alias("data_actual").merge(
#    datos_nuevos.alias("data_nueva"),
#    """
#    data_actual.archivo    = data_nueva.archivo
#    AND
#    data_actual.sheet_name = data_nueva.sheet_name
#    """
#).whenNotMatchedInsertAll().execute()


StatementMeta(, f2dad900-6af7-4b88-83e7-957ba259e6f6, 27, Finished, Available, Finished)

==>
    level_0    level_1     level_2    level_3   level_4   level_5    level_6  \
0       NaN        NaN         NaN        NaN       NaN       NaN        NaN   
1         0  65.391698   291.77086          0         0         0  19.207572   
2  0.024271  75.098903  294.757696   9.707205  0.002427  0.024464  19.212272   
3  0.041348  83.828181  298.156849  18.436483  0.004135  0.041678   19.21558   
4  0.061908  90.416316   301.05056  25.024618  0.006191  0.062401  19.219565   

     level_7 level_8 Deflection_mm  ... Deviator_stress Filter_correction  \
0        NaN     NaN           NaN  ...             NaN                ea   
1          0       0             0  ...        98.22914                 0   
2   5.052606       0      0.008311  ...       96.923736          0.024464   
3   9.594549       0      0.014159  ...       95.036615          0.041678   
4  13.020387       0      0.021199  ...       93.282503          0.062401   

  Membrane_correction Corrected_dev_str        PWP C

'\nindex_spark = spark.createDataFrame(index_df, schema=triaxial_index_schema)\ndata_spark = spark.createDataFrame(data_df, schema=triaxial_data_schema)\n\n(\n    index_spark\n    .write.format("delta").mode("overwrite")\n    .option("overwriteSchema", "true")\n    .saveAsTable("triaxial_index")\n)\n\n(\n    data_spark\n    .write.format("delta").mode("overwrite")\n    .option("overwriteSchema", "true")\n    .saveAsTable("triaxial_data")\n)\n'

In [None]:
%%sql

SELECT * AS cant_filas FROM triaxial_index;
SELECT count(1) AS cant_filas FROM triaxial_data;


StatementMeta(, f2dad900-6af7-4b88-83e7-957ba259e6f6, -1, Cancelled, , Cancelled)

In [None]:
import json

# return filename to be used for another item (notebook)

params = {
    'triaxial_filename': triaxial_filename,
    'triaxial_sample_name': triaxial_sample_name
}
result = json.dumps(params)

mssparkutils.notebook.exit(result)

StatementMeta(, f2dad900-6af7-4b88-83e7-957ba259e6f6, -1, Cancelled, , Cancelled)