# Static Data Quality Analysis using AWS Deequ

This notebook demonstrates how to perform data quality analysis using AWS Deequ on a static dataset. 


## Setup and Data Preparation

First, import the necessary libraries and set up the environment.


In [0]:
%scala
import spark.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import com.amazon.deequ.{VerificationSuite, VerificationResult}
import com.amazon.deequ.VerificationResult.checkResultsAsDataFrame
import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
import com.amazon.deequ.analyzers._
import com.amazon.deequ.analyzers.runners.AnalysisRunner
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame
import com.amazon.deequ.analyzers.{Analysis, ApproxCountDistinct, Completeness, Compliance, Distinctness}



In [0]:
%scala


// Location of the JSON file that holds the static dataset.
val data_path = "/FileStore/tables/stockTicks.json"

// Read the file into a DataFrame through the generic reader, selecting the JSON source.
val base_df = spark.read.format("json").load(data_path)

// Register the DataFrame as a temp view so later cells can query it with SQL.
base_df.createOrReplaceTempView("static_data")

### Analyze Data Quality with AWS Deequ

In [0]:


%scala
// Run Deequ's constraint suggestion over the `static_data` temp view using the
// default rule set.
val suggestionResult = ConstraintSuggestionRunner()
  .onData(spark.sql("SELECT * FROM static_data"))
  .addConstraintRules(Rules.DEFAULT)
  .run()

// Print every suggestion, column by column, followed by the code Deequ reports
// for the corresponding constraint.
for {
  (column, suggestions) <- suggestionResult.constraintSuggestions
  suggestion <- suggestions
} {
  val summary = s"Constraint suggestion for '$column':\t${suggestion.description}\n"
  val snippet = s"The corresponding scala code is: ${suggestion.codeForConstraint}\n"
  println(summary + snippet)
}


// # %scala
// # // Suggest constraints based on the static dataset
// # val suggestionResult = ConstraintSuggestionRunner()
// #   .onData(spark.sql("SELECT * FROM static_data"))
// #   .addConstraintRules(Rules.DEFAULT)
// #   .run()

// # suggestionResult.constraintSuggestions.foreach { case (column, suggestions) =>
// #   suggestions.foreach { suggestion =>
// #     println(s"Constraint suggestion for '$column':\t${suggestion.description}\n" +
// #     s"The corresponding scala code is: ${suggestion.codeForConstraint}\n")
// #   }
// # }


### Perform Data Quality Checks


In [0]:
%scala

// Timestamp taken right before the Deequ analysis starts.
// NOTE(review): nothing reads this value while the end-of-run timing code below stays disabled.
val startTimeDeequ = System.nanoTime()

// Assemble the analyzers in a fixed order: dataset size, approximate distinct count of
// `symbol`, per-column uniqueness, per-column completeness, and finally a compliance
// predicate on `quantity`.
val analysis = {
  val uniquenessColumns =
    Seq("buysell", "date", "ipaddr", "ordertype", "price", "quantity", "symbol", "time")
  val completenessColumns = Seq("ipaddr", "symbol", "quantity", "price")

  val sizeAndCardinality = Analysis()
    .addAnalyzer(Size())
    .addAnalyzer(ApproxCountDistinct("symbol"))

  val withUniqueness = uniquenessColumns.foldLeft(sizeAndCardinality) { (acc, column) =>
    acc.addAnalyzer(Uniqueness(column))
  }
  val withCompleteness = completenessColumns.foldLeft(withUniqueness) { (acc, column) =>
    acc.addAnalyzer(Completeness(column))
  }

  withCompleteness.addAnalyzer(Compliance("positive quantity", "quantity >= 0"))
}

// Compute all configured metrics over the static DataFrame.
val analysisResult = AnalysisRunner.onData(base_df).addAnalyzers(analysis.analyzers).run()

// // Timestamp taken right after the Deequ analysis finishes
// val endTimeDeequ = System.nanoTime()

// // Compute the elapsed run time (converted to milliseconds)
// val durationDeequ = (endTimeDeequ - startTimeDeequ) / 1e6d
// println(s"Deequ test runtime: $durationDeequ milliseconds")

// Turn the successfully computed metrics into a DataFrame and render it in the notebook.
val metricsDataFrame = successMetricsAsDataFrame(spark, analysisResult)
display(metricsDataFrame)




// # val analysis = Analysis()
// #   .addAnalyzer(Size())
// #   .addAnalyzer(ApproxCountDistinct("title"))
// #   .addAnalyzer(Distinctness("title"))
// #   .addAnalyzer(Completeness("title"))
// #   .addAnalyzer(Completeness("release_date"))
// #   .addAnalyzer(Completeness("rating"))
// #   .addAnalyzer(Completeness("description"))
// #   .addAnalyzer(Compliance("rating", "rating >= 0"))


// # val analysisResult = AnalysisRunner
// #   .onData(base_df)
// #   .addAnalyzers(analysis.analyzers)
// #   .run()

// # // Convert analysis results to DataFrame for visualization
// # val metricsDataFrame = successMetricsAsDataFrame(spark, analysisResult)
// # display(metricsDataFrame)

entity,instance,name,value
Column,quantity,Uniqueness,0.9528301886792452
Column,buysell,Uniqueness,0.0
Column,ipaddr,Uniqueness,1.0
Column,date,Uniqueness,0.0
Column,symbol,Uniqueness,0.864
Column,ordertype,Uniqueness,0.0
Column,time,Uniqueness,0.992
Column,price,Uniqueness,0.9837067209775968


### Verification of Data Quality Constraints

In [0]:

%scala
// Build a single Error-level check and evaluate it against the static DataFrame.
// Constraints are added in this order: row count, completeness of every column,
// uniqueness of every column, a distinctness floor of 0.1 for every column, median
// bounds for `price` and `quantity`, and non-negativity of `price` and `quantity`.
val verificationResult: VerificationResult = {
  val allColumns =
    Seq("buysell", "date", "ipaddr", "ordertype", "price", "quantity", "symbol", "time")

  val sizeCheck: Check =
    Check(CheckLevel.Error, "Data Quality Verification").hasSize(_ == 1000)

  val completeCheck = allColumns.foldLeft(sizeCheck) { (check, column) =>
    check.isComplete(column)
  }
  val uniqueCheck = allColumns.foldLeft(completeCheck) { (check, column) =>
    check.isUnique(column)
  }
  val distinctCheck = allColumns.foldLeft(uniqueCheck) { (check, column) =>
    check.hasDistinctness(Seq(column), _ >= 0.1)
  }

  val fullCheck = distinctCheck
    .hasApproxQuantile("price", 0.5, _ <= 40)
    .hasApproxQuantile("quantity", 0.5, _ <= 2000)
    .isNonNegative("price")
    .isNonNegative("quantity")

  VerificationSuite()
    .onData(base_df)
    .addCheck(fullCheck)
    .run()
}

// Convert the per-constraint results into a DataFrame and render it in the notebook.
val resultDataFrame = checkResultsAsDataFrame(spark, verificationResult)
display(resultDataFrame)


    // .hasDistinctness("date", _ >= 0.1) 
    // .hasDistinctness("ipaddr", _ >= 0.1) 
    // .hasDistinctness("ordertype", _ >= 0.1) 
    // .hasDistinctness("price", _ >= 0.1) 
    // .hasDistinctness("quantity", _ >= 0.1) 
    // .hasDistinctness("symbol", _ >= 0.1) 
    // .hasDistinctness("time", _ >= 0.1) 
    // .isNonNegative("quantity", _ >= 0.1)
    // .hasDistinctness()
    // .hasCompleteness("title", _ >= 0.95)
    // .isUnique("title")
    // .hasCompleteness("ipaddr", _ >= 0.95)
    // // .isContainedIn("buysell", Array("buy", "sell"))
    // .isNonNegative("quantity")
  


// val verificationResult = VerificationSuite()
//   .onData(data)
//   .addCheck(
//     Check(CheckLevel.Error, "unit testing my data")
//       .hasSize(_ == 5) // we expect 5 rows
//       .isComplete("id") // should never be NULL
//       .isUnique("id") // should not contain duplicates
//       .isComplete("productName") // should never be NULL
//       // should only contain the values "high" and "low"
//       .isContainedIn("priority", Array("high", "low"))
//       .isNonNegative("numViews") // should not contain negative values
//       // at least half of the descriptions should contain a url
//       .containsURL("description", _ >= 0.5)
//       // half of the items should have less than 10 views
//       .hasApproxQuantile("numViews", 0.5, _ <= 10))
//     .run()

  // .addAnalyzer(ApproxCountDistinct("title"))
  // .addAnalyzer(Distinctness("title"))
  // .addAnalyzer(Completeness("release_date"))
  // .addAnalyzer(Completeness("rating"))
  // .addAnalyzer(Completeness("description"))
  // .addAnalyzer(Compliance("rating", "rating >= 0"))


  // Convert check results to a Spark data frame for visualization


check,check_level,check_status,constraint,constraint_status,constraint_message
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(buysell),None))",Failure,Value: 0.0 does not meet the constraint requirement!
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(date),None))",Failure,Value: 0.0 does not meet the constraint requirement!
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(ipaddr),None))",Success,
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(ordertype),None))",Failure,Value: 0.0 does not meet the constraint requirement!
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(price),None))",Failure,Value: 0.9837067209775967 does not meet the constraint requirement!
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(quantity),None))",Failure,Value: 0.9528301886792453 does not meet the constraint requirement!
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(symbol),None))",Failure,Value: 0.864 does not meet the constraint requirement!
Data Quality Verification,Error,Error,"UniquenessConstraint(Uniqueness(List(time),None))",Failure,Value: 0.992 does not meet the constraint requirement!
