### Cargar el dataset original y dividir en train/test

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("../data/insurance.csv")

In [3]:
train, test = train_test_split(data, test_size=0.3, random_state=0)

In [4]:
train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1163,18,female,28.215,0,no,northeast,2200.83085
196,39,female,32.8,0,no,southwest,5649.715
438,52,female,46.75,5,no,southeast,12592.5345
183,44,female,26.41,0,no,northwest,7419.4779
1298,33,male,27.455,2,no,northwest,5261.46945


In [5]:
test.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
578,52,male,30.2,1,no,southwest,9724.53
610,47,female,29.37,1,no,southeast,8547.6913
569,48,male,40.565,2,yes,northwest,45702.02235
1034,61,male,38.38,0,no,northwest,12950.0712
198,51,female,18.05,0,no,northwest,9644.2525


In [9]:
train.to_csv("../data/train.csv", index=False)
test.to_csv("../data/test.csv", index=False)

### Generar expectations a partir del train.csv

In [10]:
import great_expectations as ge
import pandas as pd

In [11]:
data = ge.read_csv("../data/train.csv")

In [12]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,18,female,28.215,0,no,northeast,2200.83085
1,39,female,32.8,0,no,southwest,5649.715
2,52,female,46.75,5,no,southeast,12592.5345
3,44,female,26.41,0,no,northwest,7419.4779
4,33,male,27.455,2,no,northwest,5261.46945


In [13]:
data.region.unique()

array(['northeast', 'southwest', 'southeast', 'northwest'], dtype=object)

In [14]:
data.expect_column_values_to_be_in_set("sex",['female','male'])
data.expect_column_values_to_be_in_set("region",['northeast', 'southwest', 'southeast', 'northwest'])
data.expect_column_values_to_be_in_set("smoker",['no','yes'])

{'success': True,
 'result': {'element_count': 936,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': []}}

In [15]:
data.expect_column_values_to_be_of_type("age","int")
data.expect_column_values_to_be_of_type("sex","string")
data.expect_column_values_to_be_of_type("children","int")
data.expect_column_values_to_be_of_type("region","string")
data.expect_column_values_to_be_of_type("charges","float")

{'success': True,
 'result': {'element_count': 936,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': []}}

In [16]:
data.expect_column_to_exist("age")
data.expect_column_to_exist("sex")
data.expect_column_to_exist("bmi")
data.expect_column_to_exist("smoker")
data.expect_column_to_exist("region")
data.expect_column_to_exist("charges")

{'success': True}

In [17]:
age_partition = ge.dataset.util.continuous_partition_data(data['age'])
data.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than('age', age_partition)

{'success': True,
 'result': {'observed_value': 0.954045954045954,
  'element_count': 936,
  'missing_count': 0,
  'missing_percent': 0.0}}

In [18]:
bmi_partition = ge.dataset.util.continuous_partition_data(data['bmi'])
data.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than('bmi', bmi_partition)

{'success': True,
 'result': {'observed_value': 0.954045954045954,
  'element_count': 936,
  'missing_count': 0,
  'missing_percent': 0.0}}

In [19]:
charges_partition = ge.dataset.util.continuous_partition_data(data['charges'])
data.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than('charges', charges_partition)

{'success': True,
 'result': {'observed_value': 0.9440559440559441,
  'element_count': 936,
  'missing_count': 0,
  'missing_percent': 0.0}}

#### Guardamos las expectations

In [20]:
data.save_expectations_config("expectations.json")

	0 failing expectations
	17 result_format kwargs
	0 include_configs kwargs
	0 catch_exceptions kwargs
If you wish to change this behavior, please set discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, and discard_catch_exceptions_kwargs appropirately.
