# Imports

In [1]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from ydata_profiling import ProfileReport
# from ydata_quality import DataQuality
# from ydata_quality.erroneous_data import ErroneousDataIdentifier
# from ydata_quality.duplicates import DuplicateChecker
# from ydata_quality.missings import MissingsProfiler
import great_expectations as gx

# EDA

## Madrid

### Loading the data

In [2]:
df = pd.read_csv('houses_Madrid.csv')
# df.head()

### Dropping empty columns

In [69]:
empty_cols = [col for col in df.columns if df[col].isnull().all()]
df = df.drop(empty_cols, axis=1)
# df.info()

### Handling boolean columns

In [3]:
# Bool columns
columns_bool = [
    'is_exact_address_hidden',
    'is_renewal_needed',
    'has_parking'
]

# Columns with TRUE/FALSE/blank
columns_true_false_blank = [
    'is_floor_under',
    'is_new_development',
    'has_central_heating',
    'has_individual_heating',
    'has_lift',
    'is_parking_included_in_price',
    'is_orientation_north',
    'is_orientation_west',
    'is_orientation_south',
    'is_orientation_east'
]

# Columns with TRUE/blank
columns_true_blank = [
    'has_ac',
    'has_fitted_wardrobes',
    'is_exterior',
    'has_garden',
    'has_pool',
    'has_terrace',
    'has_balcony',
    'has_storage_room',
    'is_accessible',
    'has_green_zones'
]

In [4]:
# Mapping function for boolean columns
def map_bool(val):
    if val == True:
        return 1
    else:
        return 0

# Mapping function for TRUE/FALSE/blank columns
def map_true_false_blank(val):
    if np.isnan(val):
        return np.nan
    elif val == True:
        return 1
    else:
        return 0

# Mapping function for TRUE/blank columns
def map_true_blank(val):
    if val == True:
        return 1
    else:
        return 0

In [72]:
# print the head of the columns_true_false_blank
# df[columns_true_false_blank].head()

In [5]:
# Apply mappings
for col in columns_bool:
    df[col] = df[col].map(map_bool)

for col in columns_true_false_blank:
    df[col] = df[col].map(map_true_false_blank)

for col in columns_true_blank:
    df[col] = df[col].map(map_true_blank)

In [74]:
# df.info()

### Handling categorical columns

In [75]:
# df['neighborhood_id'].value_counts()

In [6]:
# extract the neighborhood name, and the district name from the neighborhood_id
chop = df['neighborhood_id'].str.extract(r'Neighborhood (\d+): (.*?) \(.*\) - District (\d+): (.*)')
df['neighborhood'] = chop[1]+ ' ' + chop[0]
df['district'] = chop[3] + ' ' + chop[2]
df.drop(axis=1, columns=['neighborhood_id'], inplace=True)
# df[['neighborhood', 'district']].head()

In [7]:
# Categorical columns
columns_categorical = [
    'neighborhood',
    'district',
    'house_type_id',
    'energy_certificate',
]

columns_categorical += columns_bool
columns_categorical += columns_true_false_blank
columns_categorical += columns_true_blank

In [8]:
# Convert to categorical
df[columns_categorical] = df[columns_categorical].astype('category')
# df.info()

### Creating column lists

In [10]:
# create a list of column names that are not numerical
non_numerical_columns = df.select_dtypes(include=['object']).columns.to_list()

# create a list of column names that are int64 or float64 and exclude the 'id' column
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.to_list()
numerical_columns.remove('id')

# create a list of column names that are categorical
categorical_columns = df.select_dtypes(include=['category']).columns.to_list()

In [11]:
id_columns = ['id']

In [12]:
df[numerical_columns].describe()

Unnamed: 0,sq_mt_built,sq_mt_useful,n_rooms,n_bathrooms,n_floors,sq_mt_allotment,rent_price,buy_price,buy_price_by_area,built_year,parking_price
count,21616.0,8228.0,21742.0,21726.0,1437.0,1432.0,21742.0,21742.0,21742.0,10000.0,7719.0
mean,146.920892,103.458192,3.005749,2.091687,3.12874,241.692737,-59170.31,653735.6,4020.523871,1970.0464,2658.000518
std,134.181865,88.259192,1.510497,1.406992,0.907713,247.484853,917116.2,782082.1,1908.418774,69.386705,13360.966258
min,13.0,1.0,0.0,1.0,1.0,1.0,-34590280.0,36000.0,447.0,1723.0,0.0
25%,70.0,59.0,2.0,1.0,2.0,2.0,725.0,198000.0,2551.0,1957.0,0.0
50%,100.0,79.0,3.0,2.0,3.0,232.0,1116.0,375000.0,3720.0,1970.0,0.0
75%,162.0,113.0,4.0,2.0,4.0,354.0,1687.0,763600.0,5000.0,1994.0,0.0
max,999.0,998.0,24.0,16.0,7.0,997.0,2517.0,8800000.0,18889.0,8170.0,600000.0


In [13]:
df_num = df[numerical_columns].copy()
df_num_cat = df[numerical_columns + categorical_columns + id_columns + non_numerical_columns].copy()

## Energy Consumption

### Loading the data

In [14]:
# Loading meta data for each of the houses
meta = pd.read_csv('energy_consumption/residential_meta.csv')
meta.rename(columns={'House':'HouseID'}, inplace=True)
meta.drop(columns=['RUs', 'Cover'], inplace=True)

In [15]:
meta['FirstReading'] = pd.to_datetime(meta['FirstReading'])
meta['LastReading'] = pd.to_datetime(meta['LastReading'])
meta['Duration'] = meta['LastReading'] - meta['FirstReading']
meta['Duration'] = meta['Duration'].dt.days

In [16]:
# Read every csv file in the energy_consumption/Residential folder and before concatenating them, map the meta data to each of the houses using the House ID
# and the name of the Residential csv file which is in the format of Residential_<HouseID>.csv
dfs = []
for file in os.listdir('energy_consumption/Residential'):
    edf = pd.read_csv('energy_consumption/Residential/' + file)
    edf['HouseID'] = file.split('_')[1].split('.')[0]
    edf['HouseID'] = edf['HouseID'].astype('int64')
    edf = edf.merge(meta, on='HouseID')
    dfs.append(edf)

edf = pd.concat(dfs)
edf.rename(columns={'energy_kWh': 'kWh'}, inplace=True)
edf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 592630 entries, 0 to 24672
Data columns (total 22 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date          592630 non-null  object        
 1   hour          592630 non-null  int64         
 2   kWh           587659 non-null  float64       
 3   HouseID       592630 non-null  int64         
 4   FirstReading  592630 non-null  datetime64[ns]
 5   LastReading   592630 non-null  datetime64[ns]
 6   HouseType     592630 non-null  object        
 7   Facing        592630 non-null  object        
 8   Region        592630 non-null  object        
 9   FAGF          592630 non-null  int64         
 10  HP            592630 non-null  int64         
 11  FPG           592630 non-null  int64         
 12  FPE           592630 non-null  int64         
 13  IFRHG         592630 non-null  int64         
 14  NAC           592630 non-null  int64         
 15  FAC           592630 no

### Handling categorical columns

In [17]:
# Categorical columns
e_categorical_columns = [
    'HouseID',
    'HouseType',
    'Facing',
    'Region',
]

In [18]:
# Convert to categorical
edf[e_categorical_columns] = edf[e_categorical_columns].astype('category')

### Creating column lists

In [19]:
e_numerical_columns = ['hour', 'kWh', 'Duration']

In [20]:
e_date_columns = ['date', 'FirstReading', 'LastReading']

In [21]:
e_df_num = edf[e_numerical_columns].copy()
e_df_num_cat = edf[e_numerical_columns + e_categorical_columns + e_date_columns].copy()

In [22]:
e_df_num_cat.head()

Unnamed: 0,hour,kWh,Duration,HouseID,HouseType,Facing,Region,date,FirstReading,LastReading
0,1,1.011,1219,1,bungalow,South,YVR,2012-06-01,2012-06-01,2015-10-03
1,2,0.451,1219,1,bungalow,South,YVR,2012-06-01,2012-06-01,2015-10-03
2,3,0.505,1219,1,bungalow,South,YVR,2012-06-01,2012-06-01,2015-10-03
3,4,0.441,1219,1,bungalow,South,YVR,2012-06-01,2012-06-01,2015-10-03
4,5,0.468,1219,1,bungalow,South,YVR,2012-06-01,2012-06-01,2015-10-03


## Melbourne Housing

### Loading the data

In [52]:
mb_df = pd.read_csv('melb_data.csv')
mb_df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


### Handling categorical columns

In [53]:
# Categorical columns
mb_categorical_columns = [
    'Suburb',
    'Type',
    'Method',
    'SellerG',
    'Postcode',
    'Regionname',
    'CouncilArea',
]

In [54]:
# Convert to categorical
mb_df[mb_categorical_columns] = mb_df[mb_categorical_columns].astype('category')

### Creating column lists

In [55]:
mb_numerical_columns = ['Rooms', 'Price', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']

In [56]:
mb_date_columns = ['Date']

In [57]:
mb_id_columns = ['Address']

In [58]:
mb_df_num = mb_df[mb_numerical_columns].copy()
mb_df_num_cat = mb_df[mb_numerical_columns + mb_categorical_columns + mb_id_columns + mb_date_columns].copy()

In [59]:
mb_df_num_cat.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,...,Propertycount,Suburb,Type,Method,SellerG,Postcode,Regionname,CouncilArea,Address,Date
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,,,-37.7996,...,4019.0,Abbotsford,h,S,Biggin,3067.0,Northern Metropolitan,Yarra,85 Turner St,3/12/2016
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,...,4019.0,Abbotsford,h,S,Biggin,3067.0,Northern Metropolitan,Yarra,25 Bloomburg St,4/02/2016
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,...,4019.0,Abbotsford,h,SP,Biggin,3067.0,Northern Metropolitan,Yarra,5 Charles St,4/03/2017
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,,,-37.7969,...,4019.0,Abbotsford,h,PI,Biggin,3067.0,Northern Metropolitan,Yarra,40 Federation La,4/03/2017
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,...,4019.0,Abbotsford,h,VB,Nelson,3067.0,Northern Metropolitan,Yarra,55a Park St,4/06/2016


## Madrid Idealista source

### Loading the data

In [32]:
im_df = pd.read_csv('idealista_madrid.csv')
im_df.head()

Unnamed: 0,url,listingUrl,title,id,price,baths,rooms,sqft,description,address,typology,advertiserProfessionalName,advertiserName
0,https://www.idealista.com/inmueble/104027174/,https://www.idealista.com/venta-viviendas/madr...,Piso en venta en calle de Villanueva,104027174,1920000,3,3,183,Residencia única con acabados de la más alta c...,"Recoletos, Madrid",Pisos,Promora Madrid,Promora Madrid
1,https://www.idealista.com/inmueble/102321942/,https://www.idealista.com/venta-viviendas/madr...,Piso en venta en calle de Núñez de Balboa,102321942,1995000,3,3,170,"Preciosa reforma a estrenar, con terrazas y ga...","Castellana, Madrid",Pisos,Madrid MMC,Engel & Völkers Madrid
2,https://www.idealista.com/inmueble/103334142/,https://www.idealista.com/venta-viviendas/madr...,Piso en venta en Conde Orgaz-Piovera,103334142,1300000,3,4,270,DE SALAS CONSULTORES INMOBILIARIOS MAV02679 le...,"Hortaleza, Madrid",Pisos,De Salas Consultores Inmobiliarios,De Salas Consultores Inmobiliarios
3,https://www.idealista.com/inmueble/104161987/,https://www.idealista.com/venta-viviendas/madr...,Piso en venta en Nueva España,104161987,1650000,3,3,248,Gilmar Real Estate vende espectacular piso en ...,"Chamartín, Madrid",Pisos,departamento comercial,Gilmar Viso - Chamartín
4,https://www.idealista.com/inmueble/103989666/,https://www.idealista.com/venta-viviendas/madr...,Piso en venta en calle de Claudio Coello,103989666,1590750,3,3,116,Descubre esta impresionante vivienda exterior ...,"Recoletos, Madrid",Pisos,Walter Haus Salamanca,Walter Haus Madrid


In [33]:
im_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   url                         915 non-null    object
 1   listingUrl                  915 non-null    object
 2   title                       915 non-null    object
 3   id                          915 non-null    int64 
 4   price                       915 non-null    int64 
 5   baths                       915 non-null    int64 
 6   rooms                       915 non-null    int64 
 7   sqft                        915 non-null    int64 
 8   description                 914 non-null    object
 9   address                     915 non-null    object
 10  typology                    915 non-null    object
 11  advertiserProfessionalName  915 non-null    object
 12  advertiserName              915 non-null    object
dtypes: int64(5), object(8)
memory usage: 93.1+ KB


### Handling categorical columns

In [34]:
im_categorical_columns = [
    'address',
    'typology',
    'advertiserProfessionalName',
    'advertiserName'
]

In [35]:
# Convert to categorical
im_df[im_categorical_columns] = im_df[im_categorical_columns].astype('category')

### Creating column lists

In [36]:
im_numerical_columns = ['price', 'baths', 'rooms', 'sqft']

In [37]:
im_id_columns = ['id']

In [39]:
im_df_num = im_df[im_numerical_columns].copy()
im_df_num_cat = im_df[im_numerical_columns + id_columns + im_categorical_columns].copy()

In [40]:
im_df_num_cat.head()

Unnamed: 0,price,baths,rooms,sqft,id,address,typology,advertiserProfessionalName,advertiserName
0,1920000,3,3,183,104027174,"Recoletos, Madrid",Pisos,Promora Madrid,Promora Madrid
1,1995000,3,3,170,102321942,"Castellana, Madrid",Pisos,Madrid MMC,Engel & Völkers Madrid
2,1300000,3,4,270,103334142,"Hortaleza, Madrid",Pisos,De Salas Consultores Inmobiliarios,De Salas Consultores Inmobiliarios
3,1650000,3,3,248,104161987,"Chamartín, Madrid",Pisos,departamento comercial,Gilmar Viso - Chamartín
4,1590750,3,3,116,103989666,"Recoletos, Madrid",Pisos,Walter Haus Salamanca,Walter Haus Madrid


# Data Quality

### Great Expectations

In [41]:
path_to_folder = 'C:\\Users\\bened\\Documents\\Fairness in AI\\'

In [42]:
context = gx.data_context.FileDataContext(project_root_dir=path_to_folder)
# print(context)

#### Testing

In [43]:
# Initialize the data source and data asset
data_source = context.sources.add_or_update_pandas(name='houses_madrid')
ds_name = 'houses_madrid'
data_asset = data_source.add_dataframe_asset(name=ds_name)

In [44]:
data_asset = context.get_datasource(ds_name).get_asset(ds_name)
batch_request = data_asset.build_batch_request(dataframe=df_num_cat)

In [45]:
context.add_or_update_expectation_suite('default')

{
  "expectation_suite_name": "default",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.18.12"
  }
}

In [49]:
validator = context.get_validator(batch_request=batch_request, expectation_suite_name='default')
# validator.head()

In [55]:
result_format = {
        "result_format": "COMPLETE",
        "return_unexpected_index_query": True,
    }
result = validator.expect_column_values_to_not_be_null('built_year', result_format=result_format)
# print the size of unexpected index list
print(len(result.result['unexpected_index_list']))

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

11742


#### Generic function 2.0

In [43]:
def create_batch_request(context, ds_name, df):
    data_source = context.sources.add_or_update_pandas(name=ds_name)
    data_asset = data_source.add_dataframe_asset(name=f"{ds_name}_dataframe")
    return data_asset.build_batch_request(dataframe=df)

In [44]:
def initialize_validator(context, batch_request, dq_name):
    context.add_or_update_expectation_suite(dq_name)
    return context.get_validator(batch_request=batch_request, expectation_suite_name=dq_name)

In [45]:
def log_expectation_result(expectation_result, column, expectation_name, expectation_weight, failed_expectations_df, success_count, total_expectations_weight, export):
    if expectation_result.success:
        success_count += expectation_weight
    elif export:
        failed_expectations_df["expectation_failed"] = False
        failed_expectations_df.loc[expectation_result.result['unexpected_index_list'], "expectation_failed"] = True
        failed_expectations_df[f"{column}_{expectation_name}"] = False
        failed_expectations_df.loc[~failed_expectations_df.index.isin(expectation_result.result['unexpected_index_list']), f"{column}_{expectation_name}"] = True
    return success_count, total_expectations_weight

In [46]:
def test_column_expectations(columns, result_format, failed_expectations_df, export):
    success_count = 0
    total_expectations_weight = 0

    for column, expectations in columns.items():
        for expectation_func, expectation_name, expectation_weight in expectations:
            result = expectation_func(column, result_format)
            success_count, total_expectations_weight = log_expectation_result(
                result, column, expectation_name, expectation_weight,
                failed_expectations_df, success_count, total_expectations_weight, export
            )
            total_expectations_weight += expectation_weight

    return success_count, total_expectations_weight

In [47]:
def export_failed_expectations(failed_expectations_df, dq_name):
    export_dir = "failed_expectations_exports"
    os.makedirs(export_dir, exist_ok=True)

    current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_name = f"{dq_name}_failed_expectations_{current_datetime}.csv"
    file_path = os.path.join(export_dir, file_name)

    failed_expectations_df.to_csv(file_path, index=True)

In [70]:
def test_data_quality(df, dq_name, id_columns=[], categorical_columns=[], numeric_columns=[], date_columns=[], strictness=0.85, export=False):
    ds_name = f"{dq_name}_ds"

    batch_request = create_batch_request(context, ds_name, df)
    validator = initialize_validator(context, batch_request, dq_name)

    failed_expectations_df = df.copy()
    failed_expectations_df["expectation_failed"] = False

    result_format = {"result_format": "COMPLETE" if export else "SUMMARY"}

    id_expectations = [
        (lambda col, fmt: validator.expect_column_values_to_not_be_null(col, mostly=1, result_format=fmt), "not_null", 5),
        (lambda col, fmt: validator.expect_column_values_to_be_unique(col, result_format=fmt), "unique", 10),
        (lambda col, fmt: validator.expect_column_value_lengths_to_be_between(col, min_value=4, max_value=16, mostly=strictness, result_format=fmt), "length_between", 4),
        (lambda col, fmt: validator.expect_column_values_to_match_regex(col, regex="^[a-zA-Z0-9]+$", mostly=strictness, result_format=fmt), "match_regex", 3),
    ]

    categorical_expectations = [
        (lambda col, fmt: validator.expect_column_values_to_not_be_null(col, mostly=strictness, result_format=fmt), "not_null", 1),
        (lambda col, fmt: validator.expect_column_value_lengths_to_be_between(col, min_value=1, max_value=15, mostly=strictness, result_format=fmt), "length_between", 3),
    ]

    numeric_expectations = [
        (lambda col, fmt: validator.expect_column_values_to_be_in_type_list(col, type_list=['int64', 'float64', 'int32', 'float32', 'int16', 'float16'], mostly=strictness, result_format=fmt), "type_list", 3),
        (lambda col, fmt: validator.expect_column_values_to_not_be_null(col, mostly=strictness, result_format=fmt), "not_null", 1),
        (lambda col, fmt: validator.expect_column_value_lengths_to_be_between(col, min_value=1, max_value=15, mostly=strictness, result_format=fmt), "length_between", 3),
        (lambda col, fmt: validator.expect_column_values_to_be_between(col, min_value=df[col].mean() - 3.5 * df[col].std(), max_value=df[col].mean() + 3.5 * df[col].std(), mostly=1, result_format=fmt), "outliers_outside_3.5_std", 5),
        (lambda col, fmt: validator.expect_column_values_to_be_between(col, min_value=df[col].mean() - 2.5 * df[col].std(), max_value=df[col].mean() + 2.5 * df[col].std(), mostly=1, result_format=fmt), "outliers_outside_2.5_std", 3),
        (lambda col, fmt: validator.expect_column_values_to_be_between(col, min_value=df[col].mean() - 1.5 * df[col].std(), max_value=df[col].mean() + 1.5 * df[col].std(), mostly=1, result_format=fmt), "outliers_outside_1.5_std", 1),
    ]

    date_expectations = [
        (lambda col, fmt: validator.expect_column_values_to_be_dateutil_parseable(col, mostly=strictness, result_format=fmt), "dateutil_parseable", 5)
    ]

    columns = {col: id_expectations for col in id_columns}
    columns.update({col: categorical_expectations for col in categorical_columns})
    columns.update({col: numeric_expectations for col in numeric_columns})
    columns.update({col: date_expectations for col in date_columns})

    success_count, total_expectations_weight = test_column_expectations(columns, result_format, failed_expectations_df, export)

    validator.save_expectation_suite(discard_failed_expectations=False)

    if export:
        checkpoint = context.add_or_update_checkpoint(
            name=f"{dq_name}_checkpoint",
            validator=validator,
        )
        checkpoint.run()

    score = success_count / total_expectations_weight

    failed_rows_df = failed_expectations_df[failed_expectations_df["expectation_failed"]]

    if export:
        export_failed_expectations(failed_rows_df, dq_name)

    return score

In [None]:
# was inside the test_data_quality function
    # checkpoint = context.add_or_update_checkpoint(
    #     name=f"{dq_name}_checkpoint",
    #     validator=validator,
    # )

    # checkpoint_result = checkpoint.run()

    # # Calculate success rate
    # success_count = checkpoint_result.list_validation_results()[0]['statistics']['successful_expectations']

In [71]:
hm_score = test_data_quality(df_num_cat, 'houses_madrid', id_columns=id_columns, categorical_columns=categorical_columns, numeric_columns=numerical_columns, date_columns=[], strictness=0.85, export=True)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/497 [00:00<?, ?it/s]

In [72]:
e_score = test_data_quality(e_df_num_cat, 'energy_dataset', id_columns=[], categorical_columns=e_categorical_columns, numeric_columns=e_numerical_columns, date_columns=[], strictness=0.85, export=True)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/95 [00:00<?, ?it/s]

In [73]:
mb_score = test_data_quality(mb_df_num_cat, 'melbourne', id_columns=mb_id_columns, categorical_columns=mb_categorical_columns, numeric_columns=mb_numerical_columns, date_columns=mb_date_columns, strictness=0.85, export=True)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/300 [00:00<?, ?it/s]

In [74]:
id_score = test_data_quality(im_df, 'idealista_madrid', id_columns=im_id_columns, categorical_columns=im_categorical_columns, numeric_columns=im_numerical_columns, date_columns=[], strictness=0.85, export=True)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/132 [00:00<?, ?it/s]

In [75]:
scores = {
    'houses_madrid': hm_score,
    'energy_dataset': e_score,
    'melbourne': mb_score,
    'idealista_madrid': id_score
}
print(scores)

{'houses_madrid': 0.6339869281045751, 'energy_dataset': 0.703125, 'melbourne': 0.5101214574898786, 'idealista_madrid': 0.5588235294117647}


### Tests

In [29]:
# validator.expect_column_values_to_not_be_null(column='buy_price')
# validator.expect_column_min_to_be_between(column='rent_price', min_value=0, max_value=10000000)
validator.expect_column_mean_to_be_between(column='rent_price', min_value=0, max_value=1000000)
# come up with 5 basic expectations for the dataset
validator.expect_column_values_to_be_in_set(column='house_type_id', value_set=['House', 'Flat', 'Studio'])
# validator.expect_column_values_to_be_unique(column='id')

validator.save_expectation_suite(discard_failed_expectations=False)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

In [30]:
checkpoint = context.add_or_update_checkpoint(
    name="first_checkpoint",
    validator=validator,
)

In [31]:
checkpoint_result = checkpoint.run()
# context.view_validation_result(checkpoint_result)
# context.build_data_docs()

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

In [29]:
checkpoint_result.get_statistics()

{'data_asset_count': 1,
 'validation_result_count': 1,
 'successful_validation_count': 0,
 'unsuccessful_validation_count': 1,
 'successful_validation_percent': 0.0,
 'validation_statistics': {ValidationResultIdentifier::default/__none__/20240410T143612.448261Z/houses_madrid-houses_madrid: {'evaluated_expectations': 2,
   'successful_expectations': 0,
   'unsuccessful_expectations': 2,
   'success_percent': 0.0}}}

In [49]:
checkpoint_result.list_validation_results()

[{
   "success": false,
   "results": [
     {
       "success": false,
       "expectation_config": {
         "expectation_type": "expect_column_mean_to_be_between",
         "kwargs": {
           "column": "rent_price",
           "max_value": 1000000,
           "min_value": 0,
           "batch_id": "houses_madrid-houses_madrid"
         },
         "meta": {}
       },
       "result": {
         "observed_value": -59170.30792935332
       },
       "meta": {},
       "exception_info": {
         "raised_exception": false,
         "exception_traceback": null,
         "exception_message": null
       }
     },
     {
       "success": false,
       "expectation_config": {
         "expectation_type": "expect_column_values_to_be_in_set",
         "kwargs": {
           "column": "house_type_id",
           "value_set": [
             "House",
             "Flat",
             "Studio"
           ],
           "batch_id": "houses_madrid-houses_madrid"
         },
         "meta":

In [52]:
# get the number of successful expectations from the checkpoint result
success_count = checkpoint_result.list_validation_results()[0]['statistics']['successful_expectations']
print(success_count)

0
