In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from pprint import pprint

from xai.formatter.report import Report
from xai.data.explorer.data_analyzer_suite import DataAnalyzerSuite


In [2]:
## initialize report object
report = Report(name = 'Titanic Report')
report.detail.add_section_title("Data Section")


In [3]:
## load training data
training_file_name = './compiler/sample_input/titanic/titanic.csv'
data = pd.read_csv(training_file_name)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Lable Distribution

In [4]:
from xai.data.explorer import CategoricalDataAnalyzer
label_analyzer = CategoricalDataAnalyzer()
label_column = 'Survived'

label_analyzer.feed_all(data[label_column].tolist())
label_stats = label_analyzer.get_statistics()

data_distributions = []
data_distributions.append((label_column,label_stats.frequency_count))
print(data_distributions)

[('Survived', defaultdict(<class 'int'>, {0: 549, 1: 342}))]


In [5]:
report.detail.add_header_level_1(text='Class distribution analysis')
report.detail.add_data_set_distribution(data_distributions)

## Data Field Attribute

In [6]:
# get the data type
from xai.data.constants import DATATYPE
def get_column_types(data, threshold, label_column):
    valid_feature_names = []
    valid_feature_types = []
    feature = {}
    feature['categorical'] = []
    feature['numerical'] = []
    meta = {}
    for column in data.columns:
        if column == label_column:
            meta[column] = {'type': 'label', 'used': True, 'structured':'attribute'}
            continue
        col_data = data[column]
        unique_values = col_data.unique()

        if col_data.dtypes == np.float64:
            feature['numerical'].append(column)
            valid_feature_names.append(column)
            valid_feature_types.append(DATATYPE.NUMBER)
            meta[column] = {'type': 'numerical', 'used': True, 'structured':'attribute'}
        elif col_data.dtypes == np.int64:   
            if len(unique_values) < threshold * len(col_data):
                feature['categorical'].append(column)
                valid_feature_names.append(column)
                valid_feature_types.append(DATATYPE.CATEGORY)
                meta[column] = {'type': 'categorical', 'used': True, 'structured':'attribute'}
                
            else:
                print('Error: %s is suspected to be identifierable features. %s distinct values given %s rows. Will be ignored in data report.'%
                      (column, len(unique_values), len(col_data)))
                if len(unique_values) == len(col_data):
                    meta[column] = {'type': 'KEY', 'used': False, 'structured':'attribute'}
        else:
            if len(unique_values) < threshold * len(col_data):
                feature['categorical'].append(column)
                valid_feature_names.append(column)
                valid_feature_types.append(DATATYPE.CATEGORY)
                meta[column] = {'type': 'categorical', 'used': True, 'structured':'attribute'}

            else:
                print('Warning: %s is suspected to be identifierable features. %s distinct values given %s rows. Set it to text feature.'%
                      (column, len(unique_values), len(col_data)))
                valid_feature_names.append(column)
                valid_feature_types.append(DATATYPE.FREETEXT)
                meta[column] = {'type': 'Text', 'used': False, 'structured':'attribute'}

    return feature, valid_feature_names, valid_feature_types, meta

feature, valid_feature_names, valid_feature_types, meta = get_column_types(data=data,threshold=0.6,label_column=label_column)

# pprint(feature)
pprint(valid_feature_names)
pprint(valid_feature_types)
pprint(meta)

Error: PassengerId is suspected to be identifierable features. 891 distinct values given 891 rows. Will be ignored in data report.
['Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']
['categorical',
 'text',
 'categorical',
 'numerical',
 'categorical',
 'categorical',
 'text',
 'numerical',
 'categorical',
 'categorical']
{'Age': {'structured': 'attribute', 'type': 'numerical', 'used': True},
 'Cabin': {'structured': 'attribute', 'type': 'categorical', 'used': True},
 'Embarked': {'structured': 'attribute', 'type': 'categorical', 'used': True},
 'Fare': {'structured': 'attribute', 'type': 'numerical', 'used': True},
 'Name': {'structured': 'attribute', 'type': 'Text', 'used': False},
 'Parch': {'structured': 'attribute', 'type': 'categorical', 'used': True},
 'PassengerId': {'structured': 'attribute', 'type': 'KEY', 'used': False},
 'Pclass': {'structured': 'attribute', 'type': 'categorical', 'used': True},
 'Sex': {'structured': 'attribu

In [7]:
report.detail.add_data_attributes(meta)

## Data distribution

In [8]:
data_analyzer_suite = DataAnalyzerSuite(data_type_list=valid_feature_types,column_names=valid_feature_names)
pprint(data_analyzer_suite.schema)

{'Age': 'numerical',
 'Cabin': 'categorical',
 'Embarked': 'categorical',
 'Fare': 'numerical',
 'Name': 'text',
 'Parch': 'categorical',
 'Pclass': 'categorical',
 'Sex': 'categorical',
 'SibSp': 'categorical',
 'Ticket': 'text'}


In [9]:
for column, column_type in zip(valid_feature_names,valid_feature_types):
    if column_type == 'categorical':
        data[column][data[column].isnull()] = 'NAN'
    data_analyzer_suite.feed_column(column_name=column, column_data=data[column].tolist(), labels=data[label_column])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/Users/i062308/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

In [None]:
stats = data_analyzer_suite.get_statistics()

In [None]:
### Add Header Level 3
report.detail.add_header_level_3(text='Categorical Field Distribution')

In [None]:
### Categorical field distribution    
for field_name in feature['categorical']:
    labelled_stats, all_stats = stats[field_name]
    report.detail.add_categorical_field_distribution(field_name=field_name, field_distribution=labelled_stats)

In [None]:
### Numeric field distribution    
for field_name in feature['numerical']:
    labelled_stats, all_stats = stats[field_name]
    report.detail.add_numeric_field_distribution(field_name=field_name, field_distribution=labelled_stats)

## Missing Value

In [None]:
from xai.data.validator import EnumValidator

def generate_missing_value_schema(valid_feature_names,valid_feature_types):
    missing_value_schema = dict()
    for name, column_type in zip(valid_feature_names,valid_feature_types):
        if column_type == 'categorical':
            missing_value_schema[name] = ['NAN']
        if column_type == 'numerical':
            missing_value_schema[name] = ['NaN']
    return missing_value_schema

schema = generate_missing_value_schema(valid_feature_names,valid_feature_types)
pprint(schema)

In [None]:
import json
json_line = json.loads(data.to_json(orient='records'))
enum_validator = EnumValidator(schema=schema)
enum_validator.validate_all(sample_list = json_line)
stats = enum_validator.get_statistics()

In [None]:
missing_count = dict(stats.column_stats)
total_count = {feature_name: stats.total_count for feature_name in schema.keys()}
pprint(missing_count)
pprint(total_count)
report.detail.add_data_missing_value(missing_count=dict(missing_count), total_count=total_count)

In [None]:
import os
from xai.formatter import PdfWriter
report.generate(writer=PdfWriter(name='data-sample-report'))
print("report generated : %s/sample-report-final-with-summary.pdf" % os.getcwd())