In [41]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import great_expectations as gx
from great_expectations.dataset import PandasDataset
import psycopg2
from sklearn.preprocessing import LabelEncoder

In [42]:
alzheimers_df = gx.read_csv("Alzheimers.csv", sep=';', encoding='ISO-8859-1')
alzheimers_df.head()

Unnamed: 0,RowId,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,...,Stratification2,Geolocation,ClassID,TopicID,QuestionID,LocationID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2
0,BRFSS~2022~2022~9004~Q03~TMC01~AGE~GENDER,2022,2022,WEST,West,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Female,,C05,TMC01,Q03,9004,AGE,5064,GENDER,FEMALE
1,BRFSS~2022~2022~9001~Q03~TMC01~AGE~GENDER,2022,2022,NRE,Northeast,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Male,,C05,TMC01,Q03,9001,AGE,5064,GENDER,MALE
2,BRFSS~2022~2022~9002~Q03~TMC01~AGE~RACE,2022,2022,MDW,Midwest,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Hispanic,,C05,TMC01,Q03,9002,AGE,AGE_OVERALL,RACE,HIS
3,BRFSS~2020~2020~9003~Q27~TMC03~AGE~GENDER,2020,2020,SOU,South,BRFSS,Mental Health,Lifetime diagnosis of depression,Percentage of older adults with a lifetime dia...,%,...,Female,,C05,TMC03,Q27,9003,AGE,65PLUS,GENDER,FEMALE
4,BRFSS~2019~2019~01~Q43~TOC11~AGE~OVERALL,2019,2019,AL,Alabama,BRFSS,Overall Health,Arthritis among older adults,Percentage of older adults ever told they have...,%,...,,POINT (-86.63186076199969 32.84057112200048),C01,TOC11,Q43,1,AGE,65PLUS,OVERALL,OVERALL


In [59]:
ProfileReport(alzheimers_df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [43]:
alzheimers_df.get_expectation_suite()

{
  "expectation_suite_name": "default",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": "Dataset",
  "meta": {
    "great_expectations_version": "0.18.12"
  }
}

### The RowId is not unique to each row
For a row ID to serve it's purppose, it needs to uniquely identify each row

In [44]:
alzheimers_df['RowId'] = [x for x in range (len(alzheimers_df['RowId']))]

In [45]:
alzheimers_df.head(3)

Unnamed: 0,RowId,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,...,Stratification2,Geolocation,ClassID,TopicID,QuestionID,LocationID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2
0,0,2022,2022,WEST,West,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Female,,C05,TMC01,Q03,9004,AGE,5064,GENDER,FEMALE
1,1,2022,2022,NRE,Northeast,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Male,,C05,TMC01,Q03,9001,AGE,5064,GENDER,MALE
2,2,2022,2022,MDW,Midwest,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Hispanic,,C05,TMC01,Q03,9002,AGE,AGE_OVERALL,RACE,HIS


In [46]:
alzheimers_df.expect_column_values_to_be_unique('RowId')

{
  "success": true,
  "result": {
    "element_count": 284142,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Repeated columns with different names
Columns such as Class and ClassId are the same but with different names, for an OLAP architecture with a single table this isn't helpful

In [47]:
repeated_cols = ['ClassID','QuestionID','TopicID','Data_Value_Alt','Data_Value_Unit','QuestionID','LocationID','StratificationCategoryID1','StratificationID1','StratificationCategory2','StratificationID2','DataValueTypeID','Data_Value_Footnote_Symbol']
alzheimers_df = alzheimers_df.drop(labels=repeated_cols, axis = 1)
alzheimers_df.head(3)

Unnamed: 0,RowId,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Type,Data_Value,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,StratificationCategory1,Stratification1,Stratification2,Geolocation,StratificationCategoryID2
0,0,2022,2022,WEST,West,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,15.4,,13.9,17.1,Age Group,50-64 years,Female,,GENDER
1,1,2022,2022,NRE,Northeast,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.3,,9.3,11.5,Age Group,50-64 years,Male,,GENDER
2,2,2022,2022,MDW,Midwest,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.9,,8.8,13.6,Age Group,Overall,Hispanic,,RACE


### StratificationCategory 1 is constant
The column that defines the first stratification is only 'Age group'. Therefore, we may remove the column 'StratificationCategory1' and change the column name where the value of the first stratification is ('Stratification1') to age group.

In [48]:
alzheimers_df = alzheimers_df.drop(labels=['StratificationCategory1'], axis = 1)
alzheimers_df = alzheimers_df.rename(columns={'Stratification1':'age_group'})
alzheimers_df.head(3)

Unnamed: 0,RowId,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Type,Data_Value,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,age_group,Stratification2,Geolocation,StratificationCategoryID2
0,0,2022,2022,WEST,West,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,15.4,,13.9,17.1,50-64 years,Female,,GENDER
1,1,2022,2022,NRE,Northeast,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.3,,9.3,11.5,50-64 years,Male,,GENDER
2,2,2022,2022,MDW,Midwest,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.9,,8.8,13.6,Overall,Hispanic,,RACE


### Now that there is a single column for age group, it's possible to assure and expect it to be category type

In [49]:
alzheimers_df['age_group'] = pd.Categorical(alzheimers_df['age_group'], categories=alzheimers_df['age_group'].unique())
alzheimers_df.head(3)

Unnamed: 0,RowId,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Type,Data_Value,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,age_group,Stratification2,Geolocation,StratificationCategoryID2
0,0,2022,2022,WEST,West,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,15.4,,13.9,17.1,50-64 years,Female,,GENDER
1,1,2022,2022,NRE,Northeast,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.3,,9.3,11.5,50-64 years,Male,,GENDER
2,2,2022,2022,MDW,Midwest,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.9,,8.8,13.6,Overall,Hispanic,,RACE


In [50]:
alzheimers_df.expect_column_values_to_be_of_type(column='age_group', type_='CategoricalDtypeType')

{
  "success": true,
  "result": {
    "observed_value": "CategoricalDtypeType"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 'Data_Value' is either in percentage or a fraction of 30 (days). We can therefore expect the values to be between 0 and 100

In [51]:
alzheimers_df.expect_column_values_to_be_between(column="Data_Value", min_value=0, max_value=100)

{
  "success": true,
  "result": {
    "element_count": 284142,
    "missing_count": 91334,
    "missing_percent": 32.143787261298925,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [52]:
alzheimers_df.expect_column_values_to_be_between(column="Low_Confidence_Limit", min_value=0, max_value=100)
alzheimers_df.expect_column_values_to_be_between(column="High_Confidence_Limit", min_value=0, max_value=100)

{
  "success": true,
  "result": {
    "element_count": 284142,
    "missing_count": 91545,
    "missing_percent": 32.218045906624155,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### For the data to be useful, 'Data_Value' must not be null, so we will drop all rows with missing data value

In [53]:
alzheimers_df = alzheimers_df.dropna(subset=['Data_Value'])
print('Number of missing values in Data Value after transformaion: ',alzheimers_df['Data_Value'].isna().sum())

Number of missing values in Data Value after transformaion:  0


In [54]:
alzheimers_df.expect_column_values_to_not_be_null(column='Data_Value')

{
  "success": true,
  "result": {
    "element_count": 192808,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Stratification 2 can be either 'Gender' or 'Race', we shall inscrease redundancy by separating a column for race and another for gender. This should make the data more usable

In [55]:
ethnicity_categories = ['White, non-Hispanic','Hispanic','Black', 'non-Hispanic','Native Am/Alaskan Native','Asian/Pacific Islander', 'overall']
gender_categories = ['Male', 'Female', 'overall']

alzheimers_df['gender'] = alzheimers_df['Stratification2'].replace(ethnicity_categories, 'overall')
alzheimers_df['ethnicity'] = alzheimers_df['Stratification2'].replace(gender_categories, 'overall')

alzheimers_df = alzheimers_df.drop(labels=['Stratification2', 'StratificationCategoryID2'], axis = 1)
alzheimers_df.head(3)

Unnamed: 0,RowId,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Type,Data_Value,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,age_group,Geolocation,gender,ethnicity
0,0,2022,2022,WEST,West,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,15.4,,13.9,17.1,50-64 years,,Female,overall
1,1,2022,2022,NRE,Northeast,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.3,,9.3,11.5,50-64 years,,Male,overall
2,2,2022,2022,MDW,Midwest,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.9,,8.8,13.6,Overall,,overall,Hispanic


### Making Ethnicity and Gender into categories and adding category expectations

In [56]:
alzheimers_df['gender'] = pd.Categorical(alzheimers_df['gender'], gender_categories)
alzheimers_df['ethnicity'] = pd.Categorical(alzheimers_df['ethnicity'], ethnicity_categories)

alzheimers_df.expect_column_values_to_be_of_type(column='gender', type_='CategoricalDtypeType')
alzheimers_df.expect_column_values_to_be_of_type(column='ethnicity', type_='CategoricalDtypeType')

{
  "success": true,
  "result": {
    "observed_value": "CategoricalDtypeType"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### The 'Datasource' column values are constant. It's wasteful to have a whole column repeating the same thing but that information is important, so we will add metadata

In [57]:
alzheimers_df.attrs = {'datasource': 'BRFSS'}
print(alzheimers_df.attrs)

{'datasource': 'BRFSS'}


In [58]:
alzheimers_df = alzheimers_df.drop(columns=['Datasource'], axis = 1)
alzheimers_df.head(3)

Unnamed: 0,RowId,YearStart,YearEnd,LocationAbbr,LocationDesc,Class,Topic,Question,Data_Value_Type,Data_Value,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,age_group,Geolocation,gender,ethnicity
0,0,2022,2022,WEST,West,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,15.4,,13.9,17.1,50-64 years,,Female,overall
1,1,2022,2022,NRE,Northeast,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.3,,9.3,11.5,50-64 years,,Male,overall
2,2,2022,2022,MDW,Midwest,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,Percentage,10.9,,8.8,13.6,Overall,,overall,Hispanic


## Silver ready
Now we may check and load the expectation suite

In [76]:
alzheimers_df.expect_column_values_to_match_json_schema(column = 'gender', json_schema= 'b')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [81]:
validator = context.sources.pandas_default.read_csv("Alzheimers.csv", encoding='ISO-8859-1', sep = ';')

alzheimers_df.expect_column_values_to_be_of_type(column='gender', type_='CategoricalDtypeType')
alzheimers_df.expect_column_values_to_be_of_type(column='ethnicity', type_='CategoricalDtypeType')

TypeError: argument of type 'method' is not iterable

In [85]:
checkpoint_result = null_checkpoin.run()

NameError: name 'null_checkpoin' is not defined

In [23]:
alzheimers_df.get_expectation_suite()

{
  "expectation_suite_name": "default",
  "ge_cloud_id": null,
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "RowId"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "age_group",
        "type_": "CategoricalDtypeType"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "Data_Value",
        "min_value": 0,
        "max_value": 100
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "High_Confidence_Limit",
        "min_value": 0,
        "max_value": 100
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "Data_Value"
      },
      "meta": {}
    },
    {
    

In [34]:
context = gx.get_context()
print(context)

{
  "anonymous_usage_statistics": {
    "data_context_id": "8d1d3b17-ad0a-449d-b9e2-56354f1dca8a",
    "explicit_id": true,
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "explicit_url": false,
    "enabled": true
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_variables_file_path": "uncommitted/config_variables.yml",
  "config_version": 3.0,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "uncommitted/data_docs/local_site/"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "fluent_datasources": {
    "ge_datasource": {
      "type": "postgres",
      

In [29]:
context.list_expectation_suites()

[]

In [30]:
context.list_expectation_suite_names()

[]

In [79]:
checkpoint_null = context.add_or_update_checkpoint(name="null_checkpoin", validator=validator)

In [64]:
alzheimers_df.save_expectation_suite(discard_failed_expectations=False, filepath='check/')

PermissionError: [Errno 13] Permission denied: 'check/'

In [92]:
validator.

Object `validator.` not found.
