In [1]:
import pandas as pd

In [2]:
# Configuration
# Number of top-ranked features to keep for the explanation; it is consumed by
# the feature-selection cell through `.head(n_features)`.
n_features = 3

In [3]:
# Load data
# All four CSVs are read through relative paths, so they resolve against the
# kernel's current working directory.
# `raw` is later merged on its 'id' and 'target' columns.
raw = pd.read_csv('../../examples/example_data_viz/local_dataset_reliability.csv')
# Later cells read 'id', 'node_name' and 'node_importance' from the local nodes.
local_nodes_raw = pd.read_csv('../../examples/example_data_viz/local_graph_nodes.csv')
# Later cells read 'target', 'node_name' and 'node_importance' from the global nodes.
global_nodes_raw = pd.read_csv('../../examples/example_data_viz/global_graph_nodes.csv')
# Later cells read 'feature_name' and 'feature_importance' from this frame.
global_explain = pd.read_csv('../../examples/example_data_viz/global_explainability.csv')

In [4]:
# Fix data
# Relabel the feature 'economic_segment' as 'economic_class' (presumably so it
# matches the 'economic_class_*' node-name prefixes -- TODO confirm).
global_explain['feature_name'] = global_explain['feature_name'].replace({'economic_segment': 'economic_class'})
# Lower-case the gender values (presumably to line up with node names such as
# 'gender_female' -- TODO confirm).
raw['gender'] = raw['gender'].str.lower()

In [5]:
# Select features to work with: rank by global importance (highest first) and
# keep the names of the first `n_features` rows.
features = list(
    global_explain
    .sort_values(by='feature_importance', ascending=False)
    .iloc[:n_features]
    .loc[:, 'feature_name']
)
features

['incomes', 'economic_class', 'gender']

In [6]:
# Clean local nodes DataFrame: for every tuple id and every selected feature keep
# only the node with the highest importance (one row per id and feature).
local_nodes_parts = []
for c in features:
    # Match on "<feature>_" -- the same delimiter the later "^<feature>_" regex
    # uses -- so a feature name that is merely the beginning of a longer word
    # does not pull in unrelated nodes.
    tmp_c = local_nodes_raw[local_nodes_raw['node_name'].str.startswith(c + '_')]
    tmp = (tmp_c[['id', 'node_name', 'node_importance']]
           # Rows lacking an id or an importance can never be "the maximum of an id".
           .dropna(subset=['id', 'node_importance'])
           .sort_values(['id', 'node_importance'], ascending=[True, False])
           # After the sort, the first row of each id carries its highest importance.
           .drop_duplicates(subset='id', keep='first'))
    local_nodes_parts.append(tmp)

# Concatenate once instead of growing the frame inside the loop.
local_nodes_clean = pd.concat(local_nodes_parts)

print(local_nodes_clean.shape)
local_nodes_clean


(60, 3)


Unnamed: 0,id,node_name,node_importance
0,1,incomes_high,0.24
8,2,incomes_mid,0.24
17,3,incomes_mid,0.24
25,4,incomes_high,0.25
32,5,incomes_low,0.4
40,6,incomes_mid,0.32
48,7,incomes_high,0.37
56,8,incomes_high,0.27
65,9,incomes_high,0.24
72,10,incomes_mid,0.26


In [7]:
# Pivot local nodes: one row per id, one column per node name, importance as value
local_nodes_pivot = (
    local_nodes_clean
    .pivot(index='id', columns='node_name', values='node_importance')
    .reset_index()
    .rename_axis(None, axis=1))

# Select the global nodes of the chosen features. The match is on "<feature>_"
# (the delimiter used by the later "^<feature>_" regex) so a feature name that
# is only the beginning of a longer word does not pull in unrelated nodes.
global_nodes_parts = [
    global_nodes_raw.loc[global_nodes_raw['node_name'].str.startswith(c + '_'),
                         ['target', 'node_name', 'node_importance']]
    for c in features]
# Concatenate once, then prefix the node names with 'g_' so the global columns
# cannot collide with the local ones after the merge below.
global_nodes_clean = (
    pd.concat(global_nodes_parts)
    .assign(node_name=lambda nodes: 'g_' + nodes['node_name'].astype(str)))
global_nodes_pivot = (
    global_nodes_clean
    .pivot(index='target', columns='node_name', values='node_importance')
    .reset_index()
    .rename_axis(None, axis=1))

# Join all the information into a single dataframe: local importances by id,
# global importances by target. Left joins keep every row of `raw`.
df = (raw
      .merge(local_nodes_pivot, on='id', how='left')
      .merge(global_nodes_pivot, on='target', how='left'))
print(df.shape)
df

(20, 22)


Unnamed: 0,id,incomes,economic_class,gender,target,Reliability,economic_class_first,economic_class_second,economic_class_third,gender_female,...,incomes_low,incomes_mid,g_economic_class_first,g_economic_class_second,g_economic_class_third,g_gender_female,g_gender_male,g_incomes_high,g_incomes_low,g_incomes_mid
0,1,21378,third,male,Xiaomi,7.0,0.18,,,0.15,...,,,0.05,0.05,0.39,0.14,0.07,0.02,0.21,0.07
1,2,113816,first,female,iPhone,25.0,0.18,,,0.12,...,,0.24,0.18,0.11,0.08,0.15,0.09,0.24,0.02,0.13
2,3,81067,first,female,Samsung,8.0,,0.34,,,...,,0.24,0.1,0.16,0.01,0.01,0.11,0.23,0.09,0.29
3,4,98215,first,male,iPhone,84.0,0.27,,,0.13,...,,,0.18,0.11,0.08,0.15,0.09,0.24,0.02,0.13
4,5,30006,second,male,Xiaomi,50.0,,,0.24,0.08,...,0.4,,0.05,0.05,0.39,0.14,0.07,0.02,0.21,0.07
5,6,41421,second,female,Samsung,45.0,,0.18,,,...,,0.32,0.1,0.16,0.01,0.01,0.11,0.23,0.09,0.29
6,7,103675,first,female,iPhone,93.0,0.11,,,0.12,...,,,0.18,0.11,0.08,0.15,0.09,0.24,0.02,0.13
7,8,98094,first,male,iPhone,70.0,0.25,,,,...,,,0.18,0.11,0.08,0.15,0.09,0.24,0.02,0.13
8,9,44244,second,female,iPhone,67.0,0.31,,,0.1,...,,,0.18,0.11,0.08,0.15,0.09,0.24,0.02,0.13
9,10,88996,first,male,Samsung,16.0,0.13,,,,...,,0.26,0.1,0.16,0.01,0.01,0.11,0.23,0.09,0.29


In [8]:
# Turn the record with id 2 into a long table: one row per original column,
# with the column label in 'name' and the record's entry in 'value'.
df1 = df.loc[df['id'] == 2].T.reset_index()
df1.columns = ['name', 'value']
df1

Unnamed: 0,name,value
0,id,2
1,incomes,113816
2,economic_class,first
3,gender,female
4,target,iPhone
5,Reliability,25.0
6,economic_class_first,0.18
7,economic_class_second,
8,economic_class_third,
9,gender_female,0.12


In [9]:
# Regex alternatives "^<feature>_" identifying the local node rows of df1
col_to_keep = ['^' + c + '_' for c in features]
# cond1: the record actually has a value for this row
cond1 = df1['value'].notnull()
# cond2: the row name starts with one of the selected features followed by "_"
cond2 = df1['name'].str.contains('|'.join(col_to_keep), case=False, na=False)
df_local = df1[cond1 & cond2].rename(columns={'value': 'node_importance'})

# Split each node name "<feature>_<value>" into its two parts in a single pass.
# Anchoring on the feature names (rather than splitting a second time on the
# extracted values) keeps a value that happens to occur inside a feature name
# from cutting the name short, and avoids using data values as a regex.
node_parts = df_local['name'].str.extract('^(' + '|'.join(features) + ')_(.*)$')
df2 = (df_local
       .assign(name=node_parts[0], value=node_parts[1])
       [['name', 'value', 'node_importance']])
df2

Unnamed: 0,name,value,node_importance
6,economic_class,first,0.18
9,gender,female,0.12
13,incomes,mid,0.24


In [10]:
# Attach the parsed node value and its importance to the matching feature row.
# The record's own entry is kept as 'value_raw', and rows without a matching
# node fall back to that raw entry for 'value'.
df_nlg_full = (
    df1.loc[cond1]
    .merge(df2, how='left', on=['name'], suffixes=('_raw', ''))
    .assign(value=lambda merged: merged['value'].combine_first(merged['value_raw']))
)
df_nlg_full

Unnamed: 0,name,value_raw,value,node_importance
0,id,2,2,
1,incomes,113816,mid,0.24
2,economic_class,first,first,0.18
3,gender,female,female,0.12
4,target,iPhone,iPhone,
5,Reliability,25.0,25.0,
6,economic_class_first,0.18,0.18,
7,gender_female,0.12,0.12,
8,incomes_mid,0.24,0.24,
9,g_economic_class_first,0.18,0.18,


In [None]:
"The user $user_name (with id $id) has been recommended $syn_a_an $target"

In [11]:
# Keep only necessary rows.
# Step 1: anti-join against the local node rows -- a row survives only when its
# name is absent from df_local.
df_nlg_no_local = (
    df_nlg_full
    .merge(df_local.drop(columns='node_importance'), how='left', on='name', indicator=True)
    .loc[lambda flagged: flagged['_merge'] == 'left_only']
    .drop(columns='_merge'))

# Step 2: the same anti-join against the (g_-prefixed) global node names.
df_nlg_no_global = (
    df_nlg_no_local
    .merge(global_nodes_clean[['node_name']].drop_duplicates(),
           how='left', left_on='name', right_on='node_name', indicator=True)
    .loc[lambda flagged: flagged['_merge'] == 'left_only']
    .drop(columns=['node_name', '_merge']))
df_nlg = df_nlg_no_global
df_nlg

Unnamed: 0,name,value_raw,value,node_importance
0,id,2,2,
1,incomes,113816,mid,0.24
2,economic_class,first,first,0.18
3,gender,female,female,0.12
4,target,iPhone,iPhone,
5,Reliability,25.0,25.0,


In [12]:
###############################################################################################
####################################### NLG ###################################################
###############################################################################################
from nlg.search import templatize
from gramex.data import filter as gfilter
from nlg.utils import load_spacy_model

In [13]:
# Sort specification handed to gramex's filter ('-' prefix on the column name).
sort_args = {'_sort': ['-node_importance']}
# A shallow copy is passed so `sort_args` itself is left as defined above.
xdf = gfilter(df_nlg, dict(sort_args))
xdf.head()

Unnamed: 0,name,value_raw,value,node_importance
1,incomes,113816,mid,0.24
2,economic_class,first,first,0.18
3,gender,female,female,0.12
0,id,2,2,
4,target,iPhone,iPhone,


In [14]:
# Load the spaCy model through the nlg helper and parse the example sentence
# that is passed to `templatize` in the next cell.
nlp = load_spacy_model()
# Earlier wording of the sentence, kept for reference (it also quoted the raw income):
# text = nlp("The customer with id 2 has a iPhone because his incomes are mid (113816), she belongs to the first economic class and her gender is female.")
text = nlp("The customer with id 2 has a value iPhone because his incomes are mid, she belongs to the first economic class and her gender is female.")

In [15]:
# Build a template from the parsed sentence, the sort arguments and the
# explanation frame without its 'value_raw' column.
# NOTE(review): in the recorded run the generated template nests a `{{ ... }}`
# expression inside an `iloc[...]` index, and rendering then fails with a
# SyntaxError -- presumably because the id value "2" also matches the literal
# index 2; confirm against the nlg package.
nugget = templatize(text, sort_args, df_nlg.drop('value_raw', axis=1))
nugget



{% set fh_args = {"_sort": ["-node_importance"]}  %}
{% set df = U.gfilter(orgdf, fh_args.copy()) %}
{% set fh_args = U.sanitize_fh_args(fh_args, orgdf) %}
{# Do not edit above this line. #}
The customer with id {{ df["value"].iloc[3] }} has a {{ df.columns[1] }} {{ df["value"].iloc[-{{ df["value"].iloc[3] }}] }} because his {{ df["name"].iloc[0] }} are {{ df["value"].iloc[0] }}, she belongs to the {{ df["value"].iloc[1] }} economic class and her {{ df["name"].iloc[{{ df["value"].iloc[3] }}] }} is {{ df["value"].iloc[2] }}.

In [16]:
# Show the parsed sentence again for comparison with the generated template.
text

The customer with id 2 has a value iPhone because his incomes are mid, she belongs to the first economic class and her gender is female.

In [17]:
# Inspect the token -> template-expression mapping held by the nugget.
nugget.variables

{value: {{ df.columns[1] }},
 2: {{ df["value"].iloc[3] }},
 incomes: {{ df["name"].iloc[0] }},
 gender: {{ df["name"].iloc[2] }},
 iPhone: {{ df["value"].iloc[-2] }},
 first: {{ df["value"].iloc[1] }},
 2: {{ df["value"].iloc[3] }},
 mid: {{ df["value"].iloc[0] }},
 female: {{ df["value"].iloc[2] }}}

In [18]:
# Render against the same frame the template was built from, i.e. without
# 'value_raw'. The template addresses a column by position (df.columns[1]), so
# the extra column would make that lookup resolve to 'value_raw' instead of 'value'.
nugget.render(df_nlg.drop('value_raw', axis=1)).decode('utf8')

ERROR:tornado.application:<string> code:
 1  def _tt_execute():  # <string>:0
 2      _tt_buffer = []  # <string>:0
 3      _tt_append = _tt_buffer.append  # <string>:0
 4      fh_args = {"_sort": ["-node_importance"]}  # <string>:1
 5      _tt_append(b' ')  # <string>:2
 6      df = U.gfilter(orgdf, fh_args.copy())  # <string>:2
 7      _tt_append(b' ')  # <string>:3
 8      fh_args = U.sanitize_fh_args(fh_args, orgdf)  # <string>:3
 9      _tt_append(b' ')  # <string>:4
10      _tt_append(b' The customer with id ')  # <string>:5
11      _tt_tmp = df["value"].iloc[3]  # <string>:5
12      if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
13      else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
14      _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
15      _tt_append(_tt_tmp)  # <string>:5
16      _tt_append(b' has a ')  # <string>:5
17      _tt_tmp = df.columns[1]  # <string>:5
18      if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _

SyntaxError: invalid syntax (<string>.generated.py, line 24)

In [None]:
# Debugging copy of the Python code generated from the NLG template; the trailing
# "<string>:N" comments appear to be the template line each statement came from.
# The dump as originally pasted did not compile: the positional lookups for the
# target value and for the third feature name were emitted as
#     df["value"].iloc[-{{ df["value"].iloc[3] }}]
#     df["name"].iloc[{{ df["value"].iloc[3] }}]
# i.e. with a template expression nested inside the index. They are written here
# with the plain indices (-2 and 2) that `nugget.variables` reports for those
# tokens, and the stray "] }}" fragments are removed from the literal text that
# follows them, so the function is valid Python again.
# NOTE(review): U, orgdf, _tt_utf8, _tt_string_types and xhtml_escape are not
# defined in this notebook -- presumably they are injected by the Tornado
# template runtime; confirm before calling this function directly.
def _tt_execute():  # <string>:0
    _tt_buffer = []  # <string>:0
    _tt_append = _tt_buffer.append  # <string>:0
    fh_args = {"_sort": ["-node_importance"]}  # <string>:1
    _tt_append(b' ')  # <string>:2
    df = U.gfilter(orgdf, fh_args.copy())  # <string>:2
    _tt_append(b' ')  # <string>:3
    fh_args = U.sanitize_fh_args(fh_args, orgdf)  # <string>:3
    _tt_append(b' ')  # <string>:4
    _tt_append(b' The customer with id ')  # <string>:5
    _tt_tmp = df["value"].iloc[3]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b' has a ')  # <string>:5
    # Fixed: was `iloc[-{{ df["value"].iloc[3]` (nested template expression).
    _tt_tmp = df["value"].iloc[-2]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b' because his ')  # <string>:5
    _tt_tmp = df["name"].iloc[0]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b' are ')  # <string>:5
    _tt_tmp = df["value"].iloc[0]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b' (')  # <string>:5
    _tt_tmp = df["value_raw"].iloc[0]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b'), she belongs to the ')  # <string>:5
    _tt_tmp = df["value"].iloc[1]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b' economic class and her ')  # <string>:5
    # Fixed: was `iloc[{{ df["value"].iloc[3]` (nested template expression).
    _tt_tmp = df["name"].iloc[2]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b' is ')  # <string>:5
    _tt_tmp = df["value"].iloc[2]  # <string>:5
    if isinstance(_tt_tmp, _tt_string_types): _tt_tmp = _tt_utf8(_tt_tmp)  # <string>:5
    else: _tt_tmp = _tt_utf8(str(_tt_tmp))  # <string>:5
    _tt_tmp = _tt_utf8(xhtml_escape(_tt_tmp))  # <string>:5
    _tt_append(_tt_tmp)  # <string>:5
    _tt_append(b'.')  # <string>:5
    return _tt_utf8('').join(_tt_buffer)  # <string>:0