In [None]:
import pandas as pd
import numpy

from pydqc import infer_schema, data_summary, data_compare, data_consist

#import sys
#sys.path.insert(0, '../pydqc')
#import infer_schema, data_summary, data_compare, data_consist

# load data
The test data comes from the Kaggle competition **Zillow Prize: Zillow’s Home Value Prediction (Zestimate)**  
link: https://www.kaggle.com/c/zillow-prize-1  

You need to download the following files, unzip them, and put the extracted CSV files into the `data` folder.  
1. properties_2016.csv.zip  
2. properties_2017.csv.zip  

In [None]:
# Read the 2016 and 2017 property tables from the local 'data' folder.
# Each read is prefixed with the %time line magic so its duration is reported.
%time data_2016 = pd.read_csv('data/properties_2016.csv')
%time data_2017 = pd.read_csv('data/properties_2017.csv')

In [None]:
# Print the shape tuple of each table, one per line (2016 first, then 2017).
print(data_2016.shape, data_2017.shape, sep='\n')

In [None]:
# Preview the first five rows of the 2016 table.
data_2016.head(n=5)

In [None]:
# Preview the first five rows of the 2017 table.
data_2017.head(n=5)

# infer schema

### infer schema based on full set of data

In [None]:
%%time
infer_schema.infer_schema(data=data_2016, fname='properties_2016', output_root='output/', 
                          sample_size=1.0, type_threshold=0.5, n_jobs=2, base_schema=None)

### infer schema based on sample data

In [None]:
%%time
infer_schema.infer_schema(data=data_2016, fname='properties_2016_sample', output_root='output/', 
                          sample_size=0.1, type_threshold=0.5, n_jobs=2, base_schema=None)

### check the generated data schema
You should check the generated data schema and modify the data types if necessary. :)  
You can do the modification based on the data dictionary **zillow_data_dictionary.xlsx**.  
It's better to save the modified data schema with a different name from the original one.  
In this example, the modified data schema is saved as `XXX_mdf.xlsx` (for example, `data_schema_properties_2016_mdf.xlsx`).

### infer schema based on base_schema
Since we have already generated a data schema for data_2016, we can use the **modified** data schema of data_2016 as the base schema when inferring the schema for data_2017.

In [None]:
# Load the manually reviewed ("_mdf") 2016 schema workbook.
# NOTE: this file is not written by the cells above as-is; it is the schema
# you saved under the "_mdf" name after checking the generated one.
data_2016_schema = pd.read_excel(io='output/data_schema_properties_2016_mdf.xlsx')

In [None]:
# Preview the first five rows of the modified 2016 schema.
data_2016_schema.head(n=5)

In [None]:
%%time
infer_schema.infer_schema(data=data_2017, fname='properties_2017_sample', output_root='output/', 
                          sample_size=0.1, type_threshold=0.5, n_jobs=2, base_schema=data_2016_schema)

# data_summary
Generate a data summary report based on the **modified** data schema.

In [None]:
# Load the modified ("_mdf") 2016 schema again; this repeats the read done in
# the infer-schema section, so this section can be run without that one.
data_2016_schema = pd.read_excel(io='output/data_schema_properties_2016_mdf.xlsx')

In [None]:
# Preview the first five rows of the schema that drives the summary report.
data_2016_schema.head(n=5)

In [None]:
%%time
data_summary.data_summary(table_schema=data_2016_schema, table=data_2016, fname='properties_2016', 
                          sample_size=1.0, output_root='output/', keep_images=False, n_jobs=2)

### generate data summary notebook
If you want to do further checks using the basic methods provided by **data_summary**, you can generate a notebook.

In [None]:
# Generate the companion data-summary notebook for the 2016 table, using the
# same schema, fname and output folder as the summary report.
data_summary.data_summary_notebook(
    table_schema=data_2016_schema,
    table=data_2016,
    fname='properties_2016',
    output_root='output/',
)

# data compare

In [None]:
# Load the modified ("_mdf") 2017 schema workbook.
# NOTE(review): presumably the reviewed copy of the schema inferred earlier
# with fname='properties_2017_sample'; it must exist before this cell runs.
data_2017_schema = pd.read_excel(io='output/data_schema_properties_2017_sample_mdf.xlsx')

In [None]:
%%time
data_compare.data_compare(table1=data_2016, table2=data_2017, schema1=data_2016_schema, schema2=data_2017_schema,
                          fname='properties_2016', sample_size=1.0, output_root='output/', keep_images=False, n_jobs=2)

### generate data compare notebook

In [None]:
# Generate the companion data-compare notebook for the same pair of tables
# and schemas used in the comparison report.
data_compare.data_compare_notebook(
    table1=data_2016,
    table2=data_2017,
    schema1=data_2016_schema,
    schema2=data_2017_schema,
    fname='properties_2016',
    output_root='output/',
)

# data consist

In [None]:
%%time
data_consist.data_consist(table1=data_2016, table2=data_2017, key1='parcelid', key2='parcelid',
                          schema1=data_2016_schema, schema2=data_2017_schema,
                          fname='properties_2016', sample_size=1.0, output_root='output/', keep_images=False, n_jobs=2)

In [None]:
# Generate the companion data-consist notebook with the same tables, key
# columns ('parcelid') and schemas as the consistency report.
data_consist.data_consist_notebook(
    table1=data_2016,
    table2=data_2017,
    key1='parcelid',
    key2='parcelid',
    schema1=data_2016_schema,
    schema2=data_2017_schema,
    fname='properties_2016',
    output_root='output/',
)