In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from pprint import pprint

from xai.formatter.report import Report
from xai.data.explorer.data_analyzer_suite import DataAnalyzerSuite


In [2]:
## initialize report object
report = Report(name = 'Titanic Report')
report.detail.add_section_title("Data Section")


In [3]:
## load training data
training_file_name = './compiler/sample_input/titanic/titanic.csv'
data = pd.read_csv(training_file_name)
bday = []
for i in range(len(data)):
    year = np.random.randint(low=1960,high=1979)
    month = np.random.randint(low=1,high=12)
    day = np.random.randint(low=1,high=28)
    bday.append("%s"%(10000*year+100*month+day))
data['Birthday'] = bday
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Birthday
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,19710817
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,19720114
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,19630815
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,19720815
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,19730515


## Lable Distribution

In [4]:
from xai.data.explorer import CategoricalDataAnalyzer
label_analyzer = CategoricalDataAnalyzer()
label_column = 'Survived'

label_analyzer.feed_all(data[label_column].tolist())
label_stats = label_analyzer.get_statistics()

data_distributions = []
data_distributions.append((label_column,label_stats.frequency_count))
print(data_distributions)

[('Survived', defaultdict(<class 'int'>, {0: 549, 1: 342}))]


In [5]:
report.detail.add_header_level_1(text='Class distribution analysis')
report.detail.add_data_set_distribution(data_distributions)

In [6]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Birthday        object
dtype: object

## Data Field Attribute

In [7]:
# get the data type
from xai.data.constants import DATATYPE
import dateutil
from statistics import median
from collections import Counter
from copy import deepcopy

def get_column_types(data, threshold, label_column):
    
    def check_numercial(col_data):
        if col_data.dtypes == np.float64:
            return True
        else:
            return False
        
    def check_datetime(col_data):
        if col_data.dtypes == np.int64:
            return False
        
        def parse_date(date):
            try:
                dt = dateutil.parser.parse(str(date))
                return 0
            except ValueError:
                return 1

        counter = Counter(col_data.tolist())
        if len(counter) >= threshold * len(col_data):
            invalid_count = 0
            for date in col_data.tolist():
                invalid_count += parse_date(date)
            if invalid_count<threshold*len(col_data):
                return True
            else:
                return False
        else:
            return False
        
    def check_categorical(col_data):
        counter = Counter(col_data.tolist())
        if len(counter) < threshold * len(col_data):
            _median = median(counter.values())
            if _median == 1:
                return False
            else:
                return True
        else:
            return False
        
    def check_text(col_dat):
        def _get_token_number(x):
            return len(x.split(' '))
        if col_data.dtypes == object:
            if len(col_data.unique()) > len(col_data)*threshold:
                if median(col_data.apply(_get_token_number))>3:
                    return True
                else:
                    return False
    
                
    valid_feature_names = []
    valid_feature_types = []
    feature = {}
    feature['categorical'] = []
    feature['numerical'] = []
    feature['text'] = []
    feature['datetime'] = []
    meta = {}
    for column in data.columns:
        if column == label_column:
            meta[column] = {'type': 'label', 'used': True, 'structured':'attribute'}
            continue
        col_data = data[column]
        
        if check_datetime(col_data):
            # datetime data
            feature['datetime'].append(column)
            valid_feature_names.append(column)
            valid_feature_types.append(DATATYPE.DATETIME)
            meta[column] = {'type': 'datetime', 'used': True, 'structured':'attribute'}
        
        elif check_numercial(col_data): 
            # numerical data
            feature['numerical'].append(column)
            valid_feature_names.append(column)
            valid_feature_types.append(DATATYPE.NUMBER)
            meta[column] = {'type': 'numerical', 'used': True, 'structured':'attribute'}
        
        elif check_categorical(col_data): 
            # categorical data 
            feature['categorical'].append(column)
            valid_feature_names.append(column)
            valid_feature_types.append(DATATYPE.CATEGORY)
            meta[column] = {'type': 'categorical', 'used': True, 'structured':'attribute'}
        
        elif check_text(col_data):
            # text data
            feature['text'].append(column)
            valid_feature_names.append(column)
            valid_feature_types.append(DATATYPE.FREETEXT)
            meta[column] = {'type': 'Text', 'used': True, 'structured':'attribute'}
        
        else:
            print('Warning: the feature [%s] is suspected to be identifierable feature. \n[Examples]: %s\n' % (column, col_data.tolist()[:5]))
            meta[column] = {'type': 'Key', 'used': True, 'structured':'attribute'}               

    return feature, valid_feature_names, valid_feature_types, meta

feature, valid_feature_names, valid_feature_types, meta = get_column_types(data=data,
                                                                           threshold=0.3,
                                                                           label_column=label_column)

# pprint(feature)
pprint(valid_feature_names)
pprint(valid_feature_types)
pprint(meta)

[Examples]: [1, 2, 3, 4, 5]

[Examples]: ['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450']

[Examples]: [nan, 'C85', nan, 'C123', nan]

['Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked',
 'Birthday']
['categorical',
 'text',
 'categorical',
 'numerical',
 'categorical',
 'categorical',
 'numerical',
 'categorical',
 'datetime']
{'Age': {'structured': 'attribute', 'type': 'numerical', 'used': True},
 'Birthday': {'structured': 'attribute', 'type': 'datetime', 'used': True},
 'Cabin': {'structured': 'attribute', 'type': 'Key', 'used': True},
 'Embarked': {'structured': 'attribute', 'type': 'categorical', 'used': True},
 'Fare': {'structured': 'attribute', 'type': 'numerical', 'used': True},
 'Name': {'structured': 'attribute', 'type': 'Text', 'used': True},
 'Parch': {'structured': 'attribute', 'type': 'categorical', 'used': True},
 'PassengerId': {'structured': 'attribute', 'type': 'Key', 'used': True},
 'Pclass': {'structured': 'attribute', 't

In [8]:
report.detail.add_data_attributes(meta)

## Data distribution

In [9]:
data_analyzer_suite = DataAnalyzerSuite(data_type_list=valid_feature_types,column_names=valid_feature_names)
pprint(data_analyzer_suite.schema)

{'Age': 'numerical',
 'Birthday': 'datetime',
 'Embarked': 'categorical',
 'Fare': 'numerical',
 'Name': 'text',
 'Parch': 'categorical',
 'Pclass': 'categorical',
 'Sex': 'categorical',
 'SibSp': 'categorical'}


In [10]:
for column, column_type in zip(valid_feature_names,valid_feature_types):
    print(column)
    if column_type in ['categorical','text']:
        data[column][data[column].isnull()] = 'NAN'
    data_analyzer_suite.feed_column(column_name=column, column_data=data[column].tolist(), labels=data[label_column])

Pclass
Name


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Sex
Age
SibSp
Parch
Fare
Embarked
Birthday


In [11]:
stats = data_analyzer_suite.get_statistics()
min(stats['Name'][1].tfidf.values())

0.0

In [12]:
### Add Header Level 3
report.detail.add_header_level_3(text='Categorical Field Distribution')

In [13]:
### Categorical field distribution    
for field_name in feature['categorical']:
    labelled_stats, all_stats = stats[field_name]
    report.detail.add_categorical_field_distribution(field_name=field_name, field_distribution=labelled_stats)

In [14]:
### Numeric field distribution    
for field_name in feature['numerical']:
    labelled_stats, all_stats = stats[field_name]
    report.detail.add_numeric_field_distribution(field_name=field_name, field_distribution=labelled_stats)

In [15]:
### Text field distribution    
for field_name in feature['text']:
    labelled_stats, all_stats = stats[field_name]
    report.detail.add_text_field_distribution(field_name=field_name, field_distribution=labelled_stats)

In [16]:
### Date Time field distribution    
for field_name in feature['datetime']:
    labelled_stats, all_stats = stats[field_name]
    report.detail.add_datetime_field_distribution(field_name=field_name, field_distribution=labelled_stats)

## Missing Value

In [17]:
from xai.data.validator import EnumValidator

def generate_missing_value_schema(valid_feature_names,valid_feature_types):
    missing_value_schema = dict()
    for name, column_type in zip(valid_feature_names,valid_feature_types):
        if column_type == 'categorical':
            missing_value_schema[name] = ['NAN']
        if column_type == 'numerical':
            missing_value_schema[name] = ['NAN']
    return missing_value_schema

schema = generate_missing_value_schema(valid_feature_names,valid_feature_types)
pprint(schema)

{'Age': ['NAN'],
 'Embarked': ['NAN'],
 'Fare': ['NAN'],
 'Parch': ['NAN'],
 'Pclass': ['NAN'],
 'Sex': ['NAN'],
 'SibSp': ['NAN']}


In [18]:
import json
json_line = json.loads(data.to_json(orient='records'))
enum_validator = EnumValidator(schema=schema)
enum_validator.validate_all(sample_list = json_line)
stats = enum_validator.get_statistics()

In [19]:
missing_count = dict(stats.column_stats)
total_count = {feature_name: stats.total_count for feature_name in schema.keys()}
pprint(missing_count)
pprint(total_count)
report.detail.add_data_missing_value(missing_count=dict(missing_count), total_count=total_count)

{'Embarked': 2}
{'Age': 891,
 'Embarked': 891,
 'Fare': 891,
 'Parch': 891,
 'Pclass': 891,
 'Sex': 891,
 'SibSp': 891}


In [20]:
import os
from xai.formatter import PdfWriter
report.generate(writer=PdfWriter(name='data-sample-report'))
print("report generated : %s/sample-report-final-with-summary.pdf" % os.getcwd())

report generated : /Users/i062308/Development/Explainable_AI/tests/sample-report-final-with-summary.pdf
