# Silver layer - DimOrganization

In [None]:
# Notebook parameters. The values below are placeholders; presumably the
# calling pipeline overrides them at run time (TODO confirm).
# Run identifier stamped onto every control-log row and every silver row.
in_parameter_run_id = 0
# Processing date as text; it is parsed later with the pattern "yyyy-MM-dd".
in_parameter_process_date = ""
# Overwritten with the processed row count and returned as the notebook exit value.
out_parameter_count_processed = 0

In [None]:
# Name shared by the bronze source table, the silver target table and the
# validation-rule lookup (rules are filtered on table_name == v_table_name).
v_table_name: str = "DimOrganization"
# Value written to the "code" column of the control-log rows reporting null counts.
v_code_missing_values: str = "03"

## 1. Load validation rules

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Keep only the rules registered for this table and bring them to the driver
# as a pandas DataFrame so they can be iterated row by row.
rules_for_table = spark.read.table("control.validation_rules").where(col("table_name") == v_table_name)
df_validation_rules = rules_for_table.toPandas()

## 2. Extract data

In [None]:
# Read the bronze Delta table that carries the same name as the silver target.
bronze_table_name = f"bronze.{v_table_name}"
df = spark.read.format("delta").table(bronze_table_name)

## 3. Validation

In [None]:
import pandas as pd
from decimal import Decimal
from datetime import datetime

### 3.1. Missing values

In [None]:
# Discard rows in which every column is null.
df = df.dropna(how="all")

In [None]:
# One aggregate per column: count() skips nulls, and when() without an
# otherwise() yields null for non-matching rows, so each aggregate is the
# number of null values in that column.
null_count_exprs = []
for column_name in df.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))
df_null_counts = df.select(*null_count_exprs)

In [None]:
# Turn the single-row null-count result into one log row per source column,
# shaped as (run_id, code, description), and hand it back to Spark.
null_counts_pdf = df_null_counts.toPandas()
list_null_counts = [
    (
        in_parameter_run_id,
        v_code_missing_values,
        f"Column: {column_name}, count: {null_counts_pdf[column_name].values[0]}",
    )
    for column_name in null_counts_pdf.columns
]
log_rows_pdf = pd.DataFrame(list_null_counts, columns=["run_id", "code", "description"])
df_null_counts = spark.createDataFrame(log_rows_pdf)

In [None]:
# Replace nulls with the per-column default configured in the validation rules.
#
# default_value is parsed into a Python value according to column_type before
# it is used as the replacement literal; column types without a parser keep the
# raw default_value unchanged.
default_value_parsers = {
    "INT": int,
    "TINYINT": int,
    "SMALLINT": int,
    "DECIMAL": Decimal,
    "DATETIME": lambda raw: datetime.strptime(raw, "%Y-%m-%d %H:%M:%S"),
    # .date() matters: a datetime.datetime becomes a timestamp literal, which
    # would widen a DATE column to timestamp in the when/otherwise below.
    # NOTE(review): if the existing silver table was created with the widened
    # timestamp type, confirm the overwrite still matches its schema.
    "DATE": lambda raw: datetime.strptime(raw, "%Y-%m-%d").date(),
}

for _, rule in df_validation_rules.iterrows():
    column_name = rule["column_name_original"]
    parse_default = default_value_parsers.get(rule["column_type"])
    if parse_default is None:
        default_value = rule["default_value"]
    else:
        default_value = parse_default(rule["default_value"])
    df = df.withColumn(
        column_name,
        when(col(column_name).isNull(), default_value).otherwise(col(column_name)),
    )

## 4. Load control data

In [None]:
# Cast run_id to a 32-bit integer, then append the null-count rows to the
# control log table.
run_id_as_int = col("run_id").cast("int")
df_logs = df_null_counts.withColumn("run_id", run_id_as_int)
log_writer = df_logs.write.format("delta").mode("append")
log_writer.saveAsTable("control.logs")

## 5. Load silver data

In [None]:
# Stamp every row with the processing date (parsed from the text parameter
# using the pattern "yyyy-MM-dd") and with the run identifier.
process_date_column = to_date(lit(in_parameter_process_date), "yyyy-MM-dd")
run_id_column = lit(in_parameter_run_id)
df = df.withColumn("process_date", process_date_column).withColumn("run_id", run_id_column)

In [None]:
# Replace the contents of the silver table with the validated data.
silver_table_name = f"silver.{v_table_name}"
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable(silver_table_name)

In [None]:
# Number of rows in the DataFrame that was written to the silver table.
# count() is a Spark action, so it evaluates df again after the write.
out_parameter_count_processed = df.count()

In [None]:
# End the notebook run and hand the processed row count back as the exit value.
# NOTE(review): the documented signature of notebook.exit takes a string —
# confirm that passing the integer count is accepted as intended.
mssparkutils.notebook.exit(out_parameter_count_processed)