In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract, to_date
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns

# Create the Spark session for this notebook (or reuse one that is already active)
spark = (
    SparkSession.builder
    .appName("example_app")
    .getOrCreate()
)

StatementMeta(, edad1196-7451-4e50-9692-aed8be29e5ab, 4, Finished, Available, Finished)

## Data Quality Functions

###### Function 1: Primary key validation (duplicate handling)

In [27]:

from pyspark.sql.window import Window

def handle_duplicates(df, subset_columns, strategy='remove', keep='last'):
    """Remove, flag, or collapse rows that share the same key columns.

    Args:
        df: Spark DataFrame to process.
        subset_columns: A single column name, or a list/tuple of column
            names, that together identify a duplicate.
        strategy: One of
            'remove'    - keep a single row per key (see ``keep``);
            'flag'      - add a boolean ``is_duplicate`` column that is True
                          for every row whose key occurs more than once;
            'aggregate' - group by the key and take ``F.first`` of every
                          non-key column.
        keep: Only used when ``strategy='remove'``. 'first' delegates to
            ``DataFrame.dropDuplicates``; 'last' keeps the row with the
            highest ``monotonically_increasing_id`` within each key group.

    Returns:
        The transformed DataFrame.

    Raises:
        ValueError: If ``strategy`` or ``keep`` is not a supported value.
    """
    # Normalise the key so a bare column name and a list of names behave the
    # same in every branch (dropDuplicates needs a flat list, and the
    # membership test in 'aggregate' must compare whole names, not substrings).
    if isinstance(subset_columns, (list, tuple)):
        key_columns = list(subset_columns)
    else:
        key_columns = [subset_columns]

    if strategy == 'remove':
        if keep == 'first':
            # NOTE(review): Spark does not appear to document which of the
            # duplicate rows dropDuplicates retains - confirm if order matters.
            return df.dropDuplicates(key_columns)
        elif keep == 'last':
            # Descending order so that row_number() == 1 is the *last* row of
            # each key group rather than the first.
            window_spec = Window.partitionBy(*key_columns).orderBy(
                F.monotonically_increasing_id().desc()
            )
            # Helper column name chosen to be unlikely to clash with a real
            # column, since it is overwritten and then dropped.
            rank_column = "_dedup_row_num"
            return df.withColumn(rank_column, F.row_number().over(window_spec)) \
                     .filter(F.col(rank_column) == 1) \
                     .drop(rank_column)
        else:
            raise ValueError("Invalid 'keep' parameter. Use 'first' or 'last'.")

    elif strategy == 'flag':
        window_spec = Window.partitionBy(*key_columns)
        return df.withColumn("is_duplicate", F.count("*").over(window_spec) > 1)

    elif strategy == 'aggregate':
        # Every non-key column is reduced with F.first; swap in other
        # aggregation functions here if a column needs different treatment.
        agg_exprs = [
            F.first(column_name).alias(column_name)
            for column_name in df.columns
            if column_name not in key_columns
        ]
        if not agg_exprs:
            # agg() rejects an empty expression list; with only key columns
            # present the grouped result is simply the distinct keys.
            return df.select(*key_columns).distinct()
        return df.groupBy(*key_columns).agg(*agg_exprs)

    else:
        raise ValueError("Invalid strategy provided. Use 'remove', 'flag', or 'aggregate'.")

StatementMeta(, edad1196-7451-4e50-9692-aed8be29e5ab, 29, Finished, Available, Finished)

## Customer

In [57]:
# Load the customer Parquet data into a Spark DataFrame
customer_path = "Files/SalesLT.Customer_W01_data_delta_bronze"
df_customer = spark.read.parquet(customer_path)

# Shape check: (row count, column count), a divider, then the schema
customer_row_count = df_customer.count()
customer_column_count = len(df_customer.columns)
print((customer_row_count, customer_column_count))
print('----------')
df_customer.printSchema()

StatementMeta(, edad1196-7451-4e50-9692-aed8be29e5ab, 59, Finished, Available, Finished)

(100, 15)
----------
root
 |-- CustomerID: long (nullable = true)
 |-- NameStyle: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- MiddleName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Suffix: string (nullable = true)
 |-- CompanyName: string (nullable = true)
 |-- SalesPerson: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- PasswordHash: string (nullable = true)
 |-- PasswordSalt: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)



In [56]:
# De-duplicate on the CustomerID key, then count the surviving rows.
# The original call used `test`, which is not defined anywhere in this notebook
# and fails with NameError on a fresh kernel; `handle_duplicates` (defined
# above) is the function with the matching (df, column) signature.
# NOTE(review): confirm `test` was an earlier name for handle_duplicates.
df1 = handle_duplicates(df_customer, 'CustomerID')
df1.count()

StatementMeta(, edad1196-7451-4e50-9692-aed8be29e5ab, 58, Finished, Available, Finished)

100