In [1]:
from jenga.tasks.reviews import VideogameReviewsTask
from jenga.evaluation.schema_stresstest import SchemaStresstest

import tensorflow_data_validation as tfdv

import numpy as np
import pandas as pd

### Instantiate the video game reviews task with a randomly chosen seed

In [2]:
# Draw a fresh random seed for every run. The dtype is passed explicitly because
# the upper bound does not fit into the platform-default integer on systems where
# that default is 32 bits wide (e.g. Windows with NumPy < 2); there the call
# would raise "ValueError: high is out of bounds for int32".
seed = int(np.random.randint(2**32 - 1, dtype=np.int64))
# Show the seed so that this particular run can be reproduced later.
print('seed:', seed)

task = VideogameReviewsTask(seed=seed)

### Create a tfdv schema by first auto-inferring it from training data and then adjusting it

In [3]:
# Compute descriptive statistics over the task's training dataframe and let
# tfdv derive an initial schema from them.
train_data_stats = tfdv.generate_statistics_from_dataframe(task.train_data)
schema = tfdv.infer_schema(train_data_stats)

# Manual adjustment of the inferred schema: set the minimum domain mass
# required for the 'review_date' feature to zero.
review_date_feature = tfdv.get_feature(schema, 'review_date')
review_date_constraints = review_date_feature.distribution_constraints
review_date_constraints.min_domain_mass = 0.0

In [4]:
# Display the schema that was inferred from the training data statistics and
# then adjusted for the 'review_date' feature.
schema

feature {
  name: "marketplace"
  type: BYTES
  domain: "marketplace"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "customer_id"
  type: BYTES
  int_domain {
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "review_id"
  type: BYTES
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "product_id"
  type: BYTES
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "product_parent"
  type: BYTES
  int_domain {
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "product_title"
  type: BYTES
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "product_category"
  type: BYT

### Train the baseline model for the task

In [5]:
# Fit the task's baseline model on the training data and training labels;
# the resulting `model` is the one passed to the schema stress test.
model = task.fit_baseline_model(task.train_data, task.train_labels)

### Run a stress test for the schema and the model with 250 randomly chosen corruptions, and mark performance drops of 3% as failures

In [6]:
# Settings of the stress test: how many randomly chosen corruptions to apply,
# and the performance drop (3%) from which a corruption is marked as a failure.
NUM_CORRUPTIONS = 250
PERFORMANCE_THRESHOLD = .03

stress_test = SchemaStresstest()
results = stress_test.run(
    task,
    model,
    schema,
    num_corruptions=NUM_CORRUPTIONS,
    performance_threshold=PERFORMANCE_THRESHOLD,
)

MissingValues: {'column': 'product_parent', 'fraction': 0.33, 'sampling': 'MAR', 'na_value': ''}
BrokenCharacters: {'column': 'review_headline', 'fraction': 0.73}
MissingValues: {'column': 'review_id', 'fraction': 0.68, 'sampling': 'MNAR', 'na_value': ''}
BrokenCharacters: {'column': 'vine', 'fraction': 0.17}
MissingValues: {'column': 'product_parent', 'fraction': 0.71, 'sampling': 'MNAR', 'na_value': ''}
GaussianNoise: {'column': 'star_rating', 'fraction': 0.17, 'sampling': 'CAR'}
MissingValues: {'column': 'review_id', 'fraction': 0.4, 'sampling': 'MAR', 'na_value': ''}
Scaling: {'column': 'star_rating', 'fraction': 0.54, 'sampling': 'CAR'}
Scaling: {'column': 'star_rating', 'fraction': 0.52, 'sampling': 'CAR'}
GaussianNoise: {'column': 'star_rating', 'fraction': 0.09, 'sampling': 'CAR'}
MissingValues: {'column': 'vine', 'fraction': 0.02, 'sampling': 'MNAR', 'na_value': ''}
BrokenCharacters: {'column': 'vine', 'fraction': 0.68}
MissingValues: {'column': 'review_id', 'fraction': 0.7, '

MissingValues: {'column': 'review_id', 'fraction': 0.13, 'sampling': 'MCAR', 'na_value': ''}
MissingValues: {'column': 'review_body', 'fraction': 0.86, 'sampling': 'MCAR', 'na_value': ''}
BrokenCharacters: {'column': 'review_body', 'fraction': 0.46}
BrokenCharacters: {'column': 'product_title', 'fraction': 0.84}
SwappedValues: {'column': 'title_and_review_text', 'fraction': 0.14, 'sampling': 'CAR', 'swap_with': 'review_body'}
MissingValues: {'column': 'review_id', 'fraction': 0.03, 'sampling': 'MNAR', 'na_value': ''}
MissingValues: {'column': 'customer_id', 'fraction': 0.69, 'sampling': 'MCAR', 'na_value': ''}
SwappedValues: {'column': 'customer_id', 'fraction': 0.19, 'sampling': 'CAR', 'swap_with': 'product_id'}
MissingValues: {'column': 'product_parent', 'fraction': 0.44, 'sampling': 'MNAR', 'na_value': ''}
MissingValues: {'column': 'review_id', 'fraction': 0.74, 'sampling': 'MAR', 'na_value': ''}
MissingValues: {'column': 'title_and_review_text', 'fraction': 0.85, 'sampling': 'MAR',

GaussianNoise: {'column': 'star_rating', 'fraction': 0.89, 'sampling': 'CAR'}
MissingValues: {'column': 'review_headline', 'fraction': 0.51, 'sampling': 'MAR', 'na_value': ''}
MissingValues: {'column': 'star_rating', 'fraction': 0.02, 'sampling': 'MCAR', 'na_value': nan}
SwappedValues: {'column': 'vine', 'fraction': 0.57, 'sampling': 'CAR', 'swap_with': 'marketplace'}
BrokenCharacters: {'column': 'verified_purchase', 'fraction': 0.36}
MissingValues: {'column': 'product_title', 'fraction': 0.82, 'sampling': 'MAR', 'na_value': ''}
MissingValues: {'column': 'customer_id', 'fraction': 0.29, 'sampling': 'MNAR', 'na_value': ''}
BrokenCharacters: {'column': 'marketplace', 'fraction': 0.36}
Scaling: {'column': 'star_rating', 'fraction': 0.49, 'sampling': 'CAR'}
MissingValues: {'column': 'customer_id', 'fraction': 0.01, 'sampling': 'MAR', 'na_value': ''}
MissingValues: {'column': 'review_id', 'fraction': 0.04, 'sampling': 'MNAR', 'na_value': ''}
BrokenCharacters: {'column': 'product_id', 'fract

### Look at the dataframe containing the results

In [7]:
# Display the results returned by the stress test run (one row per corruption,
# with its status, the reported anomalies, and the baseline/corrupted scores).
results

Unnamed: 0,corruption,status,anomalies,baseline_score,corrupted_score
0,"MissingValues: {'column': 'product_parent', 'f...",FP,"{'product_parent': description: ""String values...",0.790875,0.790875
1,BrokenCharacters: {'column': 'review_headline'...,TN,{},0.790875,0.790875
2,"MissingValues: {'column': 'review_id', 'fracti...",TN,{},0.790875,0.790875
3,"BrokenCharacters: {'column': 'vine', 'fraction...",TN,{},0.790875,0.790875
4,"MissingValues: {'column': 'product_parent', 'f...",FP,"{'product_parent': description: ""String values...",0.790875,0.790875
...,...,...,...,...,...
245,"BrokenCharacters: {'column': 'review_body', 'f...",TN,{},0.790875,0.790875
246,BrokenCharacters: {'column': 'review_headline'...,TN,{},0.790875,0.790875
247,"BrokenCharacters: {'column': 'marketplace', 'f...",FP,"{'marketplace': description: ""Examples contain...",0.790875,0.790875
248,"BrokenCharacters: {'column': 'product_parent',...",TN,{},0.790875,0.790875
