# Debugging and Testing Pandas

## Code to Transform Data

### How to do it...

In [1]:
import pandas as pd
import numpy as np
import zipfile
url = 'data/kaggle-survey-2018.zip'

In [2]:
with zipfile.ZipFile(url) as z:
    print(z.namelist())
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

['multipleChoiceResponses.csv', 'freeFormResponses.csv', 'SurveySchema.csv']


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,23850,23851,23852,23853,23854,23855,23856,23857,23858,23859
Time from Start to Finish (seconds),710,434,718,621,731,1142,959,1758,641,751,...,820,683,57,122,348,575,131,370,36,502
Q1,Female,Male,Female,Male,Male,Male,Male,Male,Male,Male,...,Female,Male,Female,Female,Male,Male,Female,Male,Male,Male
Q1_OTHER_TEXT,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Q2,45-49,30-34,30-34,35-39,22-24,25-29,35-39,18-21,25-29,30-34,...,18-21,22-24,18-21,30-34,30-34,45-49,25-29,22-24,25-29,25-29
Q3,United States of America,Indonesia,United States of America,United States of America,India,Colombia,Chile,India,Turkey,Hungary,...,India,Turkey,Turkey,Turkey,Turkey,France,Turkey,Turkey,United Kingdom of Great Britain and Northern I...,Spain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q50_Part_5,,,,Not enough incentives to share my work,Not enough incentives to share my work,,,Not enough incentives to share my work,,,...,,,,,,,,,,
Q50_Part_6,,,,,,I had never considered making my work easier f...,I had never considered making my work easier f...,,,,...,,,,,,,,,,
Q50_Part_7,,,,,,,,,,,...,,,,,,,,,,
Q50_Part_8,,,,,,,,,,,...,,,,,,,,,,


In [4]:
df.dtypes

Time from Start to Finish (seconds)    object
Q1                                     object
Q1_OTHER_TEXT                          object
Q2                                     object
Q3                                     object
                                        ...  
Q50_Part_5                             object
Q50_Part_6                             object
Q50_Part_7                             object
Q50_Part_8                             object
Q50_OTHER_TEXT                         object
Length: 395, dtype: object

In [5]:
df.Q1.value_counts(dropna=False)

Male                       19430
Female                      4010
Prefer not to say            340
Prefer to self-describe       79
Name: Q1, dtype: int64

In [6]:
def tweak_kag(df):
    na_mask = df.Q9.isna()
    hide_mask = df.Q9.str.startswith('I do not').fillna(False)
    df = df[~na_mask & ~hide_mask]


    q1 = (df.Q1
      .replace({'Prefer not to say': 'Another',
               'Prefer to self-describe': 'Another'})
      .rename('Gender')
    )
    q2 = df.Q2.str.slice(0,2).astype(int).rename('Age')
    def limit_countries(val):
        if val in  {'United States of America', 'India', 'China'}:
            return val
        return 'Another'
    q3 = df.Q3.apply(limit_countries).rename('Country')


    q4 = (df.Q4
     .replace({'Master’s degree': 18,
     'Bachelor’s degree': 16,
     'Doctoral degree': 20,
     'Some college/university study without earning a bachelor’s degree': 13,
     'Professional degree': 19,
     'I prefer not to answer': None,
     'No formal education past high school': 12})
     .fillna(11)
     .rename('Edu')
    )


    def only_cs_stat_val(val):
        if val not in {'cs', 'eng', 'stat'}:
            return 'another'
        return val


    q5 = (df.Q5
            .replace({
                'Computer science (software engineering, etc.)': 'cs',
                'Engineering (non-computer focused)': 'eng',
                'Mathematics or statistics': 'stat'})
             .apply(only_cs_stat_val)
             .rename('Studies'))
    def limit_occupation(val):
        if val in {'Student', 'Data Scientist', 'Software Engineer', 'Not employed',
                  'Data Engineer'}:
            return val
        return 'Another'


    q6 = df.Q6.apply(limit_occupation).rename('Occupation')


    q8 = (df.Q8
      .str.replace('+', '')
      .str.split('-', expand=True)
      .iloc[:,0]
      .fillna(-1)
      .astype(int)
      .rename('Experience')
    )


    q9 = (df.Q9
     .str.replace('+','')
     .str.replace(',','')
     .str.replace('500000', '500')
     .str.replace('I do not wish to disclose my approximate yearly compensation','')
     .str.split('-', expand=True)
     .iloc[:,0]
     .astype(int)
     .mul(1000)
     .rename('Salary'))
    return pd.concat([q1, q2, q3, q4, q5, q6, q8, q9], axis=1)

In [7]:
tweak_kag(df)

Unnamed: 0,Gender,Age,Country,Edu,Studies,Occupation,Experience,Salary
2,Male,30,Another,16.0,eng,Another,5,10000
3,Female,30,United States of America,18.0,cs,Data Scientist,0,0
5,Male,22,India,18.0,stat,Another,0,0
7,Male,35,Another,20.0,another,Another,10,10000
8,Male,18,India,18.0,another,Another,0,0
...,...,...,...,...,...,...,...,...
23844,Male,30,Another,18.0,cs,Software Engineer,10,90000
23845,Male,22,Another,18.0,stat,Student,0,0
23854,Male,30,Another,20.0,cs,Another,5,10000
23855,Male,45,Another,20.0,cs,Another,5,250000


In [8]:
tweak_kag(df).dtypes

Gender         object
Age             int64
Country        object
Edu           float64
Studies        object
Occupation     object
Experience      int64
Salary          int64
dtype: object

### How it works...

In [9]:
kag = tweak_kag(df)
(kag
    .groupby('Country')
    .apply(lambda g: g.Salary.corr(g.Experience))
)

Country
Another                     0.289827
China                       0.252974
India                       0.167335
United States of America    0.354125
dtype: float64

## Apply Performance

### How to do it...

In [13]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
         return val
     return 'Another'

In [14]:
%%timeit
q3 = df.Q3.apply(limit_countries).rename('Country')

6.35 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit
other_values = df.Q3.value_counts().iloc[3:].index
q3_2 = df.Q3.replace(other_values, 'Another')

30.8 ms ± 6.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit
values = {'United States of America', 'India', 'China'}
q3_3 = df.Q3.where(df.Q3.isin(values), 'Another')

2.53 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
%%timeit
values = {'United States of America', 'India', 'China'}
q3_4 = pd.Series(np.where(df.Q3.isin(values), df.Q3, 'Another'), 
     index=df.index)

3.32 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
q3.equals(q3_2)

NameError: name 'q3' is not defined

In [None]:
q3.equals(q3_3)

In [None]:
q3.equals(q3_4)

### How it works...

### There's more...

In [19]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
         return val
     return 'Another'

In [20]:
q3 = df.Q3.apply(limit_countries).rename('Country')

In [21]:
def debug(something):
    # what is something? A cell, series, dataframe?
    print(type(something), something)
    1/0

In [22]:
q3.apply(debug)

<class 'str'> United States of America


ZeroDivisionError: division by zero

In [28]:
the_item = None
def debug(something):
    global the_item
    the_item = something
    return something

In [29]:
_ = q3.apply(debug)

In [30]:
the_item

'Another'

## Improving Apply Performance with Dask, Pandarell, Swifter, and More

### How to do it...

In [31]:
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [32]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
         return val
     return 'Another'

In [33]:
%%timeit
res_p = df.Q3.parallel_apply(limit_countries).rename('Country')

117 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [41]:
import swifter

In [42]:
%%timeit
res_s = df.Q3.swifter.apply(limit_countries).rename('Country')

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=23859.0, style=ProgressStyle(descripti…


179 ms ± 81.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
import dask

In [44]:
%%timeit
res_d = (dask.dataframe.from_pandas(
       df, npartitions=4)
   .map_partitions(lambda df: df.Q3.apply(limit_countries))
   .rename('Countries')
)

710 ms ± 72.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
np_fn = np.vectorize(limit_countries)

In [39]:
%%timeit
res_v = df.Q3.apply(np_fn).rename('Country')

414 ms ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
from numba import jit

In [50]:
@jit
def limit_countries2(val):
     if val in  ['United States of America', 'India', 'China']:
         return val
     return 'Another'

In [51]:
%%timeit
res_n = df.Q3.apply(limit_countries2).rename('Country')

106 ms ± 45.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### How it works...

## Inspecting Code 

### How to do it...

In [52]:
import zipfile
url = 'data/kaggle-survey-2018.zip'

In [53]:
with zipfile.ZipFile(url) as z:
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

  interactivity=interactivity, compiler=compiler, result=result)


In [54]:
df.Q3.apply?

In [55]:
df.Q3.apply??

In [56]:
import pandas.core.series
pandas.core.series.lib

<module 'pandas._libs.lib' from '/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/_libs/lib.cpython-37m-darwin.so'>

In [57]:
pandas.core.series.lib.map_infer??

### How it works...

### There's more...

## Debugging in Jupyter

### How to do it...

In [58]:
import zipfile
url = 'data/kaggle-survey-2018.zip'

In [59]:
with zipfile.ZipFile(url) as z:
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

In [60]:
def add1(x):
    return x + 1

In [61]:
df.Q3.apply(add1)

TypeError: can only concatenate str (not "int") to str

In [62]:
from IPython.core.debugger import set_trace

In [63]:
def add1(x):
    set_trace()
    return x + 1

In [None]:
df.Q3.apply(add1)

### How it works...

### There's more...

##  Managing data integrity with Great Expectations

### How to do it...

In [64]:
kag = tweak_kag(df)

In [66]:
import great_expectations as ge
kag_ge = ge.from_pandas(kag)

In [67]:
sorted([x for x in set(dir(kag_ge)) - set(dir(kag))
    if not x.startswith('_')])

['autoinspect',
 'batch_id',
 'batch_kwargs',
 'batch_markers',
 'batch_parameters',
 'column_aggregate_expectation',
 'column_map_expectation',
 'column_pair_map_expectation',
 'discard_failing_expectations',
 'edit_expectation_suite',
 'expect_column_bootstrapped_ks_test_p_value_to_be_greater_than',
 'expect_column_chisquare_test_p_value_to_be_greater_than',
 'expect_column_distinct_values_to_be_in_set',
 'expect_column_distinct_values_to_contain_set',
 'expect_column_distinct_values_to_equal_set',
 'expect_column_kl_divergence_to_be_less_than',
 'expect_column_max_to_be_between',
 'expect_column_mean_to_be_between',
 'expect_column_median_to_be_between',
 'expect_column_min_to_be_between',
 'expect_column_most_common_value_to_be_in_set',
 'expect_column_pair_values_A_to_be_greater_than_B',
 'expect_column_pair_values_to_be_equal',
 'expect_column_pair_values_to_be_in_set',
 'expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than',
 'expect_column_proportion_of_u

In [68]:
kag_ge.expect_column_to_exist('Salary')

{
  "result": {},
  "meta": {},
  "expectation_config": {
    "meta": {},
    "expectation_type": "expect_column_to_exist",
    "kwargs": {
      "column": "Salary",
      "result_format": "BASIC"
    }
  },
  "exception_info": null,
  "success": true
}

In [69]:
kag_ge.expect_column_mean_to_be_between(
   'Salary', min_value=10_000, max_value=100_000)

{
  "result": {
    "observed_value": 43869.66102793441,
    "element_count": 15429,
    "missing_count": 0,
    "missing_percent": 0.0
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "expectation_type": "expect_column_mean_to_be_between",
    "kwargs": {
      "column": "Salary",
      "min_value": 10000,
      "max_value": 100000,
      "result_format": "BASIC"
    }
  },
  "exception_info": null,
  "success": true
}

In [70]:
kag_ge.expect_column_values_to_be_between(
   'Salary', min_value=0, max_value=500_000)

{
  "result": {
    "element_count": 15429,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "Salary",
      "min_value": 0,
      "max_value": 500000,
      "result_format": "BASIC"
    }
  },
  "exception_info": null,
  "success": true
}

In [71]:
kag_ge.expect_column_values_to_not_be_null('Salary')

{
  "result": {
    "element_count": 15429,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "expectation_type": "expect_column_values_to_not_be_null",
    "kwargs": {
      "column": "Salary",
      "result_format": "BASIC"
    }
  },
  "exception_info": null,
  "success": true
}

In [72]:
kag_ge.expect_column_values_to_match_regex(
    'Country', r'America|India|Another|China')

{
  "result": {
    "element_count": 15429,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "expectation_type": "expect_column_values_to_match_regex",
    "kwargs": {
      "column": "Country",
      "regex": "America|India|Another|China",
      "result_format": "BASIC"
    }
  },
  "exception_info": null,
  "success": true
}

In [73]:
kag_ge.expect_column_values_to_be_of_type(
   'Salary', type_='int')

{
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_of_type__aggregate",
    "kwargs": {
      "column": "Salary",
      "type_": "int",
      "result_format": "BASIC"
    }
  },
  "exception_info": null,
  "success": true
}

In [74]:
kag_ge.save_expectation_suite('kaggle_expectations.json')

In [75]:
kag_ge.to_csv('kag.csv')
import json
ge.validate(ge.read_csv('kag.csv'), 
    expectation_suite=json.load(
        open('kaggle_expectations.json')))

{
  "evaluation_parameters": {},
  "meta": {
    "great_expectations.__version__": "0.9.2",
    "expectation_suite_name": "default",
    "run_id": "20200224T221709.246886Z",
    "batch_kwargs": {
      "ge_batch_id": "65ebc1d8-5753-11ea-a940-a45e60ecc33f"
    },
    "batch_markers": {},
    "batch_parameters": {}
  },
  "statistics": {
    "evaluated_expectations": 6,
    "successful_expectations": 6,
    "unsuccessful_expectations": 0,
    "success_percent": 100.0
  },
  "results": [
    {
      "result": {},
      "meta": {},
      "expectation_config": {
        "meta": {},
        "expectation_type": "expect_column_to_exist",
        "kwargs": {
          "column": "Salary"
        }
      },
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": true
    },
    {
      "result": {
        "observed_value": 43869.66102793441,
        "element_count": 15429,
        "missing_count"

### How it works...

## Using pytest with pandas

### How to do it...

### How it works...

### There's more...

## Generating Tests with Hypothesis

### How to do it...

### How it works...