In [39]:
# Welcome to your new notebook
# Type here in the cell editor to add code!
#spark.sql("SHOW TABLES").show()


StatementMeta(, b5e42823-4cd8-4bb3-9e3a-d5a486420597, 41, Finished, Available, Finished)

In [37]:
spark.sql("""
CREATE TABLE IF NOT EXISTS silver_processed.dbo.Project (
    ProjectID STRING NOT NULL,
    Client STRING,
    Country STRING,
    Region STRING
) USING DELTA
""")

StatementMeta(, b5e42823-4cd8-4bb3-9e3a-d5a486420597, 39, Finished, Available, Finished)

DataFrame[]

In [1]:
spark.sql("""
CREATE TABLE IF NOT EXISTS silver_processed.dbo.Program (
    ProjectID STRING NOT NULL,
    ProgramID STRING NOT NULL,
    CompanyName STRING,
    Year INT,
    StartDate DATE,
    EndDate DATE,
    ReportTitle STRING,
    ReportDate DATE,
    Author STRING,
    AddedBy STRING,
    Comment STRING
) USING DELTA
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS silver_processed.dbo.Location (
    LocationID STRING NOT NULL,
    ProgramID STRING NOT NULL,
    LocationType STRING,
    LocationFlag STRING,
    LocationDescription STRING,
    StartDate DATE,
    EndDate DATE,
    Easting DOUBLE,
    Northing DOUBLE,
    CoordinateCollectionType STRING,
    CollarElevation_masl DOUBLE,
    ElevationCollectionType STRING,
    Azimuth DOUBLE,
    Dip DOUBLE,
    Comment STRING
) USING DELTA
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS silver_processed.dbo.Sample (
    SampleID STRING NOT NULL,
    LocationID STRING NOT NULL,
    SampleType STRING,
    MaterialType STRING,
    State STRING,
    DepthFrom_m FLOAT,
    DepthTo_m FLOAT,
    MiddleDepth_m FLOAT,
    Laboratory STRING,
    Comment STRING
) USING DELTA
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS silver_processed.dbo.LabTest (
    LabTestID STRING NOT NULL,
    SampleID STRING NOT NULL,
    TestType STRING,
    TestDate DATE,
    ReceivedDate DATE,
    Comment STRING
) USING DELTA
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS silver_processed.dbo.Triaxial (
    LabTestID STRING NOT NULL,
    TriaxialType STRING,
    Direction STRING,
    ConsolidationType STRING,
    AxialStrain_Percent FLOAT,
    VolumetricStrain_Percent FLOAT,
    MajorPrincipalEffectiveStress_kPa FLOAT,
    MinorPrincipalEffectiveStress_kPa FLOAT,
    ExcessPorePressure_kPa FLOAT,
    VoidRatio FLOAT
) USING DELTA
""")

StatementMeta(, 7ea257fc-94d8-4833-b5f3-038cd82c12de, 3, Finished, Available, Finished)

DataFrame[]

# Carga manual Project

### Via spark dataframe

In [2]:
from pyspark.sql import SparkSession

def validate_and_insert(new_data_df, table_name):
    """
    Valida que los ProjectID sean únicos antes de insertar
    y muestra los IDs duplicados encontrados
    
    Args:
        new_data_df: DataFrame con los nuevos datos a insertar
        table_name: Nombre completo de la tabla (ej. "silver_processed.dbo.Project")
    """
    # Verificar si la tabla existe
    if spark.catalog.tableExists(table_name):
        # Obtener IDs existentes
        existing_ids = spark.sql(f"SELECT ProjectID FROM {table_name}")
        
        # Encontrar duplicados
        duplicate_df = new_data_df.join(existing_ids, "ProjectID")
        
        if duplicate_df.count() > 0:
            # Obtener lista de IDs duplicados
            duplicate_ids = [row['ProjectID'] for row in duplicate_df.select("ProjectID").distinct().collect()]
            raise ValueError(
                f"Error: Se encontraron {len(duplicate_ids)} ProjectIDs duplicados:\n"
                f"{', '.join(duplicate_ids)}"
            )
    
    # Insertar datos (si no hay duplicados o la tabla no existe)
    new_data_df.write.format("delta").mode("append").saveAsTable(table_name)
    print("Datos insertados correctamente.")

# Datos de ejemplo (con un duplicado intencional)
project_data = [
    ("P001", "Contoso Corp", "México", "Norte"),
    ("P002", "Fabrikam", "Brasil", "Sudamérica"),
    ("P001", "Contoso Dupe", "México", "Norte"),
    ("P004", "Tailspin Toys", "España", "Europa")
]

# Crear DataFrame
project_df = spark.createDataFrame(project_data, ["ProjectID", "Client", "Country", "Region"])

# Llamar a la función
try:
    validate_and_insert(project_df, "silver_processed.dbo.Project")
except ValueError as e:
    print(e)
    print("\nSolución:")
    print("1. Elimina los registros duplicados de tus datos de entrada")
    print("2. Cambia los ProjectIDs duplicados por valores únicos")
    print("3. Si es intencional, considera usar mode('overwrite')")

StatementMeta(, a0c6bdd4-d6aa-4f1e-afc9-b477aea44422, 3, Finished, Available, Finished)

Error: Se encontraron 3 ProjectIDs duplicados:
P001, P002, P004

Solución:
1. Elimina los registros duplicados de tus datos de entrada
2. Cambia los ProjectIDs duplicados por valores únicos
3. Si es intencional, considera usar mode('overwrite')


### Via SQL

In [4]:
%%sql
INSERT INTO silver_processed.dbo.Project
VALUES 
    ('P010', 'CloudTech Innovations', 'Alemania', 'Europa Central'),
    ('P011', 'GreenEnergy Solutions', 'Peru', 'Sudamérica');

StatementMeta(, 1c721fc6-8b32-4839-98a2-f4f6349d0181, 6, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

### Via archivo

In [7]:
import pandas as pd
from pyspark.sql import SparkSession

excel_path = '/lakehouse/default/Files/Projects raw.xlsx'
df_pandas = pd.read_excel(excel_path, sheet_name=0)

spark = SparkSession.builder.getOrCreate()
df_spark = spark.createDataFrame(df_pandas)

df_spark.write.format("delta").mode("append").saveAsTable("silver_processed.dbo.Project")


StatementMeta(, 1c721fc6-8b32-4839-98a2-f4f6349d0181, 9, Finished, Available, Finished)

In [8]:
%%sql
SELECT * FROM Silver_processed.dbo.Project

StatementMeta(, 1c721fc6-8b32-4839-98a2-f4f6349d0181, 10, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 4 fields>

# Carga manual Program

### Via spark dataframe

In [5]:
from pyspark.sql import Row
from pyspark.sql.functions import col, to_date

def validate_and_insert_program(new_program_df, program_table, project_table):
    """
    Valida:
    1. ProgramIDs sean únicos
    2. ProjectIDs existan en la tabla Project
    antes de insertar datos en la tabla Program
    
    Args:
        new_program_df: DataFrame con los nuevos programas
        program_table: Nombre de la tabla Program (ej. "silver_processed.dbo.Program")
        project_table: Nombre de la tabla Project (ej. "silver_processed.dbo.Project")
    """
    errors = []
    
    # 1. Validar ProgramIDs únicos (si la tabla existe)
    if spark.catalog.tableExists(program_table):
        existing_program_ids = spark.sql(f"SELECT ProgramID FROM {program_table}")
        duplicate_programs = new_program_df.join(existing_program_ids, "ProgramID")
        
        if duplicate_programs.count() > 0:
            duplicate_ids = [row['ProgramID'] for row in duplicate_programs.select("ProgramID").distinct().collect()]
            errors.append(f"Error: {len(duplicate_ids)} ProgramIDs duplicados: {', '.join(duplicate_ids)}")
    
    # 2. Validar ProjectIDs existentes
    if spark.catalog.tableExists(project_table):
        existing_project_ids = spark.sql(f"SELECT DISTINCT ProjectID FROM {project_table}")
        missing_projects = new_program_df.join(existing_project_ids, "ProjectID", "left_anti")
        
        if missing_projects.count() > 0:
            missing_ids = [row['ProjectID'] for row in missing_projects.select("ProjectID").distinct().collect()]
            errors.append(f"Error: {len(missing_ids)} ProjectIDs no existen: {', '.join(missing_ids)}")
    
    # Si hay errores, mostrarlos y abortar
    if errors:
        raise ValueError("\n".join(errors))
    
    # Insertar datos si todo está bien
    new_program_df.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable(program_table)
    print("Datos de Program insertados correctamente.")

# Preparar datos de Program
program_data = [
    ("P001", "PRG101", "Contoso Corp", 2023, "2023-01-15", "2023-12-20", "Reporte Anual 2023", "2023-12-15", "Juan Pérez", "Admin", "Sin comentarios"),
    ("P002", "PRG102", "Fabrikam", 2024, "2024-02-01", "2024-11-30", "Innovación 2024", "2024-01-10", "María Gómez", "Admin", "Prioridad alta"),
    ("P003", "PRG103", "AdventureWorks", 2023, "2023-03-10", "2023-10-05", "Sostenibilidad", "2023-09-28", "Carlos Ruiz", "Analyst", "Revisar datos"),
    ("P005", "PRG104", "Tailspin Toys", 2024, "2024-01-20", "2024-06-30", "Expansión Europa", "2024-01-15", "Laura Díaz", "Manager", "En progreso")  # P005 no existe en Project
]

# Crear DataFrame
program_df = spark.createDataFrame(program_data, ["ProjectID", "ProgramID", "CompanyName", "Year", "StartDate", "EndDate", "ReportTitle", "ReportDate", "Author", "AddedBy", "Comment"])

# Convertir tipos de datos
program_df = program_df \
    .withColumn("Year", col("Year").cast("INT")) \
    .withColumn("StartDate", to_date(col("StartDate"))) \
    .withColumn("EndDate", to_date(col("EndDate"))) \
    .withColumn("ReportDate", to_date(col("ReportDate")))

# Ejecutar validación e inserción
try:
    validate_and_insert_program(
        program_df,
        "silver_processed.dbo.Program",
        "silver_processed.dbo.Project"
    )
except ValueError as e:
    print("Error al insertar programas:")
    print(e)
    print("\nRecomendaciones:")
    print("- Para ProgramIDs duplicados: usar valores únicos")
    print("- Para ProjectIDs faltantes: verificar en tabla Project")

StatementMeta(, a0c6bdd4-d6aa-4f1e-afc9-b477aea44422, 6, Finished, Available, Finished)

Error al insertar programas:
Error: 4 ProgramIDs duplicados: PRG101, PRG102, PRG103, PRG104

Recomendaciones:
- Para ProgramIDs duplicados: usar valores únicos
- Para ProjectIDs faltantes: verificar en tabla Project o agregar primero los proyectos


### Via SQL

In [None]:
%%sql
INSERT INTO silver_processed.dbo.Program
VALUES 
    ('P007', 'PRG107', 'CloudTech Innovations', 2024, '2024-05-01', '2024-11-30', 
     'Migración a Azure', '2024-04-15', 'Alan Turing', 'Tech Lead', 'Proyecto crítico'),
     
    ('P008', 'PRG108', 'GreenEnergy Solutions', 2024, '2024-06-10', '2024-12-31', 
     'Energías Renovables', '2024-05-20', 'Marie Curie', 'Directora', 'Financiamiento aprobado');

StatementMeta(, ed164837-4919-4e61-954d-e39ffd162154, 11, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

### Via Archivo

In [29]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

excel_path = '/synfs/notebook/b5e42823-4cd8-4bb3-9e3a-d5a486420597/temp_mnt/Files/Programs/Programs raw.xlsx'
df_pandas = pd.read_excel(excel_path, sheet_name=0)

spark = SparkSession.builder.getOrCreate()
df_spark = spark.createDataFrame(df_pandas)
df_spark = df_spark.withColumn("Year", col("Year").cast("integer")) \
                   .withColumn("StartDate", to_date("StartDate")) \
                   .withColumn("EndDate", to_date("EndDate")) \
                   .withColumn("ReportDate", to_date("ReportDate"))


df_spark.write.format("delta").mode("append").saveAsTable("silver_processed.dbo.Program")

StatementMeta(, b5e42823-4cd8-4bb3-9e3a-d5a486420597, 31, Finished, Available, Finished)

In [1]:
%%sql
SELECT * FROM Silver_processed.dbo.Program

StatementMeta(, 1c721fc6-8b32-4839-98a2-f4f6349d0181, 2, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 11 fields>

In [36]:
%%sql
CREATE OR REPLACE TABLE Silver_processed.dbo.program AS SELECT * FROM silver_processed.dbo.program LIMIT 0

StatementMeta(, b5e42823-4cd8-4bb3-9e3a-d5a486420597, 38, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

## Carga manual Location

In [2]:
from pyspark.sql.functions import col, to_date, lit

def validate_and_insert_location(new_location_df, location_table, program_table):
    """
    Valida:
    1. LocationIDs sean únicos
    2. ProgramIDs existan en la tabla Program
    antes de insertar datos en la tabla Location
    
    Args:
        new_location_df: DataFrame con las nuevas ubicaciones
        location_table: Nombre de la tabla Location (ej. "silver_processed.dbo.Location")
        program_table: Nombre de la tabla Program (ej. "silver_processed.dbo.Program")
    """
    errors = []
    
    # 1. Validar LocationIDs únicos (si la tabla existe)
    if spark.catalog.tableExists(location_table):
        existing_location_ids = spark.sql(f"SELECT LocationID FROM {location_table}")
        duplicate_locations = new_location_df.join(existing_location_ids, "LocationID")
        
        if duplicate_locations.count() > 0:
            duplicate_ids = [row['LocationID'] for row in duplicate_locations.select("LocationID").distinct().collect()]
            errors.append(f"Error: {len(duplicate_ids)} LocationIDs duplicados: {', '.join(duplicate_ids)}")
    
    # 2. Validar ProgramIDs existentes
    if spark.catalog.tableExists(program_table):
        existing_program_ids = spark.sql(f"SELECT DISTINCT ProgramID FROM {program_table}")
        missing_programs = new_location_df.join(existing_program_ids, "ProgramID", "left_anti")
        
        if missing_programs.count() > 0:
            missing_ids = [row['ProgramID'] for row in missing_programs.select("ProgramID").distinct().collect()]
            errors.append(f"Error: {len(missing_ids)} ProgramIDs no existen: {', '.join(missing_ids)}")
    
    # Si hay errores, mostrarlos y abortar
    if errors:
        raise ValueError("\n".join(errors))
    
    # Insertar datos si todo está bien
    new_location_df.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable(location_table)
    print("Datos de Location insertados correctamente.")

# Datos de ejemplo para Location
location_data = [
    ("TP 1", "PRG101", "Mina", "A", "Zona principal de extracción", "2023-01-20", "2023-12-15", 456123.5, 789456.2, "GPS", 2450.5, "Barométrico", 45.2, -12.3, "Sin comentarios"),
    ("TP 2", "PRG101", "Pozo", "B", "Pozo de exploración", "2023-02-15", None, 456150.3, 789480.1, "GPS", 2430.0, "Barométrico", 90.0, -45.0, "Profundidad estimada 200m"),
    ("TP 3", "PRG102", "Campamento", "A", "Campamento base", "2024-02-05", None, 457000.0, 790000.0, "Manual", 2400.0, "GPS", 0.0, 0.0, "Área administrativa"),
    ("TP 5", "PRG102", "Campamento", "B", "Campamento base", "2024-08-05", None, 497000.0, 890000.0, "Manual", None, None, None, None, "Sin Comentarios"),
]

# Crear DataFrame
location_df = spark.createDataFrame(location_data, [
    "LocationID", "ProgramID", "LocationType", "LocationFlag", "LocationDescription",
    "StartDate", "EndDate", "Easting", "Northing", "CoordinateCollectionType",
    "CollarElevation_masl", "ElevationCollectionType", "Azimuth", "Dip", "Comment"
])

# Convertir tipos de datos
location_df = location_df \
    .withColumn("StartDate", to_date(col("StartDate"))) \
    .withColumn("EndDate", to_date(col("EndDate"))) \
    .withColumn("Easting", col("Easting").cast("double")) \
    .withColumn("Northing", col("Northing").cast("double")) \
    .withColumn("CollarElevation_masl", col("CollarElevation_masl").cast("double")) \
    .withColumn("Azimuth", col("Azimuth").cast("double")) \
    .withColumn("Dip", col("Dip").cast("double"))

# Ejecutar validación e inserción
try:
    validate_and_insert_location(
        location_df,
        "silver_processed.dbo.Location",
        "silver_processed.dbo.Program"
    )
except ValueError as e:
    print("Error al insertar ubicaciones:")
    print(e)
    print("\nRecomendaciones:")
    print("- Para LocationIDs duplicados: usar valores únicos")
    print("- Para ProgramIDs faltantes: verificar en tabla Program o agregar primero los programas")

StatementMeta(, 62117117-a304-4a7d-a9c0-35afc976f7c1, 4, Finished, Available, Finished)

Datos de Location insertados correctamente.


In [3]:
%%sql
SELECT * FROM Silver_processed.dbo.Location

StatementMeta(, 62117117-a304-4a7d-a9c0-35afc976f7c1, 5, Finished, Available, Finished)

<Spark SQL result set with 7 rows and 15 fields>

In [12]:
%%sql
SELECT DISTINCT t.LabTestID, t.TriaxialType, t.AxialStrain_Percent, lt.SampleID, s.LocationID, loc.ProgramID FROM Silver_processed.dbo.triaxial t
INNER JOIN Silver_processed.dbo.labtest lt ON lt.LabTestID = t.LabTestID
INNER JOIN Silver_processed.dbo.sample s ON s.SampleID = lt.SampleID
INNER JOIN Silver_processed.dbo.location loc ON loc.LocationID = s.LocationID


StatementMeta(, fe351a97-65e3-48d9-b47f-cf3e81188849, 13, Finished, Available, Finished)

<Spark SQL result set with 1000 rows and 6 fields>

In [15]:
# 1. Primero crear el esquema gold si no existe
spark.sql("CREATE SCHEMA IF NOT EXISTS gold")

# 2. Ejecutar tu consulta y obtener el DataFrame
result_df = spark.sql("""
SELECT DISTINCT 
    t.LabTestID, 
    t.TriaxialType, 
    t.AxialStrain_Percent, 
    lt.SampleID, 
    s.LocationID, 
    loc.ProgramID 
FROM Silver_processed.dbo.triaxial t
INNER JOIN Silver_processed.dbo.labtest lt ON lt.LabTestID = t.LabTestID
INNER JOIN Silver_processed.dbo.sample s ON s.SampleID = lt.SampleID
INNER JOIN Silver_processed.dbo.location loc ON loc.LocationID = s.LocationID
""")

# 3. Guardar en el esquema gold
result_df.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("gold.triaxial_enriched")  # Nota: quitamos .dbo para Fabric

StatementMeta(, fe351a97-65e3-48d9-b47f-cf3e81188849, 17, Finished, Available, Finished)

In [17]:
%%sql
SELECT * FROM Silver_processed.gold.triaxial_enriched

StatementMeta(, fe351a97-65e3-48d9-b47f-cf3e81188849, 18, Finished, Available, Finished)

<Spark SQL result set with 1000 rows and 6 fields>