# Silver layer

In [45]:
# Run parameters for this notebook.
# NOTE(review): presumably this is the parameter cell that a calling pipeline overrides — confirm.
in_parameter_id_run = 13  # stamped as run_id on the log rows and on the silver rows
in_parameter_process_date = "2025-12-05 13:01:03"  # "yyyy-MM-dd HH:mm:ss"; its first 10 characters are the process date
out_parameter_count_processed = 0  # placeholder; replaced by the row count before the notebook exits

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 47, Finished, Available, Finished)

In [38]:
# Values written to the "code" column of the log rows:
# "03" tags the per-column missing-value counts, "04" tags records with wrong data.
v_code_missing_values, v_code_wrong_data = "03", "04"

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 40, Finished, Available, Finished)

In [39]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 41, Finished, Available, Finished)

## 1. Load control tables

Clear the logs table

In [None]:
%%sql
-- Removes every row from control.logs, including entries appended by earlier runs.
-- NOTE(review): section 4 appends to this table per run_id; confirm that wiping the history is intended.
TRUNCATE TABLE control.logs

Load validation rules

In [40]:
# Bring the validation rules to the driver as a pandas DataFrame so that
# individual settings can be looked up with plain pandas indexing.
validation_rules = spark.read.table("control.validation_rules")
df_control = validation_rules.toPandas()

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 42, Finished, Available, Finished)

Identify columns

In [41]:
# Mission columns grouped by the kind of validation applied to them.
columns_string = [
    "company",
    "location",
    "rocket",
    "mission",
    "rocket_status",
    "mission_status",
]
columns_decimal = ["price"]
columns_date = ["date"]
# String sizes, the decimal precision/scale and the price default are not
# hard-coded here: the next cells read them from df_control (control.validation_rules).

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 43, Finished, Available, Finished)

Get column size for string columns

In [42]:
# Pair each string column with the maximum length configured for it in the
# validation rules. A column without a matching rule row raises IndexError.
columns_string_size = [
    (name, int(df_control.loc[df_control["column_name_new"] == name, "column_size"].values[0]))
    for name in columns_string
]

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 44, Finished, Available, Finished)

Get column size for decimal columns

In [43]:
# Precision and scale of the price decimal, both taken from the first "price" rule row.
price_rule = df_control.loc[
    df_control["column_name_new"] == "price", ["column_size", "column_size_scale"]
].values[0]
precision_decimal, scale_decimal = int(price_rule[0]), int(price_rule[1])

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 45, Finished, Available, Finished)

Get price default value

In [44]:
# Value substituted for missing prices, taken from the first "price" rule row.
is_price_rule = df_control["column_name_new"] == "price"
price_default = float(df_control.loc[is_price_rule, "default_value"].values[0])

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 46, Finished, Available, Finished)

## 2. Extract bronze data

In [46]:
# Read the bronze missions table and keep only the rows that belong to this run.
df = spark.read.format("delta").table("bronze.missions")
# DataFrames are immutable: filter() returns a new DataFrame, so its result must be
# assigned back. Without the assignment the whole bronze table is processed on every run.
df = df.filter(
    (col("process_date") == in_parameter_process_date[:10])  # date part only, "yyyy-MM-dd"
    & (col("run_id") == in_parameter_id_run)
)
# Keep the business columns only; the process metadata is re-created before loading silver.
df = df.select("company", "location", "date", "rocket", "mission", "rocket_status", "price", "mission_status")

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 48, Finished, Available, Finished)

## 3. Validation

### 3.1. Missing values

In [47]:
import pandas as pd

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 49, Finished, Available, Finished)

In [48]:
# One aggregate per column: when() is non-null only for null cells and count()
# counts non-null values, so each output column holds the null count of that column.
null_count_exprs = [count(when(col(name).isNull(), name)).alias(name) for name in df.columns]
df_null_counts = df.select(*null_count_exprs)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 50, Finished, Available, Finished)

In [49]:
# Turn the single-row aggregate into one log row per column:
# (run_id, missing-values code, "Column: <name>, count: <n>").
null_counts_pdf = df_null_counts.toPandas()
list_null_counts = [
    (in_parameter_id_run, v_code_missing_values, f"Column: {name}, count: {null_counts_pdf[name].values[0]}")
    for name in null_counts_pdf.columns
]
# Back to Spark so the rows can be unioned with the invalid-data log rows later on.
df_null_counts = spark.createDataFrame(
    pd.DataFrame(list_null_counts, columns=["run_id", "code", "description"])
)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 51, Finished, Available, Finished)

### 3.2. Uppercase

In [50]:
# Upper-case the string columns in a single projection; every other column
# passes through untouched and the column order is preserved.
df = df.select(
    [upper(col(name)).alias(name) if name in columns_string else col(name) for name in df.columns]
)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 52, Finished, Available, Finished)

### 3.3. Trimming

In [51]:
# Strip leading and trailing whitespace from every string column.
trimmed_columns = {name: trim(col(name)) for name in columns_string}
for name, trimmed in trimmed_columns.items():
    df = df.withColumn(name, trimmed)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 53, Finished, Available, Finished)

### 3.4. Identify country

In [52]:
# The country is the text after the last comma of the location.
# element_at(..., -1) reads the last array element directly (null when location is null),
# replacing the manual size() - 1 indexing. trim() removes the blank that follows the comma
# in values such as "..., FLORIDA, USA"; trimming the whole location string cannot reach it,
# so the country used to be stored as " USA".
df = df.withColumn("country", trim(element_at(split(col("location"), ","), -1)))

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 54, Finished, Available, Finished)

### 3.5. Complete price

In [53]:
# Replace missing prices with the default configured in the validation rules.
price_is_missing = col("price").isNull()
price_completed = when(price_is_missing, lit(price_default)).otherwise(col("price"))
df = df.withColumn("price", price_completed)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 55, Finished, Available, Finished)

### 3.6. Data integrity

Decimal

In [55]:
# Flag values that cannot be cast to the configured decimal type and build the
# message that describes the offending value ("" when the value is fine).
decimal_type = DecimalType(precision_decimal, scale_decimal)
for name in columns_decimal:
    flag = f"{name}_is_decimal"
    message = concat(lit("Column: "), lit(name), lit(", value: "), col(name))
    df = df.withColumn(flag, col(name).cast(decimal_type).isNotNull())
    df = df.withColumn(f"{name}_wrong", when(~col(flag), message).otherwise(lit("")))

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 57, Finished, Available, Finished)

Date

In [56]:
# Flag values that cannot be cast to a date and build the message that
# describes the offending value ("" when the value is fine).
for name in columns_date:
    flag = f"{name}_is_date"
    message = concat(lit("Column: "), lit(name), lit(", value: "), col(name))
    df = (
        df.withColumn(flag, col(name).cast(DateType()).isNotNull())
        .withColumn(f"{name}_wrong", when(~col(flag), message).otherwise(lit("")))
    )

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 58, Finished, Available, Finished)

String

In [57]:
# For every (column, max length) pair: store the actual length, whether it fits,
# and the message that describes the offending value ("" when it fits).
for name, max_size in columns_string_size:
    size_col = f"{name}_size"
    valid_col = f"{name}_size_valid"
    message = concat(lit("Column: "), lit(name), lit(", value: "), col(name))
    df = df.withColumn(size_col, length(col(name)))
    df = df.withColumn(valid_col, col(size_col) <= max_size)
    df = df.withColumn(f"{name}_wrong", when(~col(valid_col), message).otherwise(lit("")))

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 59, Finished, Available, Finished)

Select invalid data

In [58]:
# A record is invalid as soon as any one of its validity flags is false.
validity_flags = [
    "price_is_decimal",
    "date_is_date",
    "company_size_valid",
    "location_size_valid",
    "rocket_size_valid",
    "mission_size_valid",
    "rocket_status_size_valid",
    "mission_status_size_valid",
]
any_flag_false = ~col(validity_flags[0])
for flag in validity_flags[1:]:
    any_flag_false = any_flag_false | ~col(flag)
df_invalid = df.filter(any_flag_false)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 60, Finished, Available, Finished)

In [59]:
# Build one log row per invalid record: run id, the wrong-data code and a
# description listing only the columns that failed validation.
columns_wrong = [f"{c}_wrong" for c in columns_decimal + columns_string + columns_date]
# The *_wrong columns hold "" for columns that passed. concat_ws() skips nulls but
# keeps empty strings, so "" is mapped to null first; otherwise the description is
# padded with dangling ", " separators (e.g. "Column: price, value: X, , , , , , , ").
failed_messages = [when(col(w) != "", col(w)) for w in columns_wrong]
df_invalid = df_invalid.select(
    lit(in_parameter_id_run).alias("run_id"),
    lit(v_code_wrong_data).alias("code"),
    concat_ws(", ", *failed_messages).alias("description"),
)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 61, Finished, Available, Finished)

Select valid data

In [60]:
# Keep the records for which every validity flag is true, then drop the helper columns.
# NOTE(review): a flag that is null (length() of a null string is null) is neither true
# nor false, so such records appear to match neither this filter nor the invalid-data
# filter — confirm that dropping them without a per-record log entry is intended.
validity_flags = [
    "price_is_decimal",
    "date_is_date",
    "company_size_valid",
    "location_size_valid",
    "rocket_size_valid",
    "mission_size_valid",
    "rocket_status_size_valid",
    "mission_status_size_valid",
]
all_flags_true = col(validity_flags[0])
for flag in validity_flags[1:]:
    all_flags_true = all_flags_true & col(flag)
df = df.filter(all_flags_true).select(
    "company", "location", "date", "rocket", "mission", "rocket_status", "price", "mission_status", "country"
)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 62, Finished, Available, Finished)

Transform to final data types

In [61]:
# Convert the validated decimal columns to their final decimal type.
final_decimal_type = DecimalType(precision_decimal, scale_decimal)
for name in columns_decimal:
    df = df.withColumn(name, df[name].cast(final_decimal_type))

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 63, Finished, Available, Finished)

In [62]:
# Convert the validated date columns to their final date type.
final_date_type = DateType()
for name in columns_date:
    df = df.withColumn(name, df[name].cast(final_date_type))

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 64, Finished, Available, Finished)

## 4. Load control data

In [63]:
# Stack the missing-value counts and the invalid-record rows (both are
# run_id / code / description, matched by position), make run_id an integer
# and append everything to the log table.
df_logs = (
    df_null_counts.unionAll(df_invalid)
    .withColumn("run_id", col("run_id").cast(IntegerType()))
)
df_logs.write.saveAsTable("control.logs", format="delta", mode="append")

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 65, Finished, Available, Finished)

## 5. Load silver data

In [64]:
# Stamp every silver row with the process timestamp, its date and the run id.
process_datetime = to_timestamp(lit(in_parameter_process_date), "yyyy-MM-dd HH:mm:ss")
df = (
    df.withColumn("process_datetime", process_datetime)
    .withColumn("process_date", to_date("process_datetime", "yyyy-MM-dd"))
    .withColumn("run_id", lit(in_parameter_id_run))
)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 66, Finished, Available, Finished)

In [65]:
# Append the validated rows to the silver table, partitioned by process date and run.
df.write.saveAsTable(
    "silver.missions",
    format="delta",
    mode="append",
    partitionBy=["process_date", "run_id"],
)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 67, Finished, Available, Finished)

In [66]:
# Number of rows in the DataFrame that was just written to silver.
# NOTE(review): count() is a separate Spark action and df is not cached in the visible
# cells, so this presumably re-evaluates the whole transformation — confirm that is acceptable.
out_parameter_count_processed = df.count()

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 68, Finished, Available, Finished)

In [67]:
# End the notebook run, handing the processed-row count back as the exit value.
mssparkutils.notebook.exit(out_parameter_count_processed)

StatementMeta(, 78129b1d-3ae1-46fe-a480-8bbcc49e02de, 69, Finished, Available, Finished)

ExitValue: 4