# Evaluation Using ELI5 and LIME

In [None]:
# import packages
import numpy as np
import pandas as pd

# word2vec pipeline
from sklearn.pipeline import make_pipeline
from word2vec import get_embed_features_list
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# eli5
import eli5
from eli5.lime import TextExplainer

In [None]:
# define custom transformer to embed words
class FeatureEmbedder(BaseEstimator, TransformerMixin):
    '''
    scikit-learn compatible transformer that hands its input to
    get_embed_features_list so it can be used as a pipeline step
    '''

    def __init__(self):
        '''
        takes no arguments and stores no state
        '''

    def fit(self, X, y=None):
        '''
        nothing is learned from the data; returns the transformer itself
        '''
        return self

    def transform(self, X, y=None):
        '''
        returns whatever get_embed_features_list produces for X
        '''
        return get_embed_features_list(X)

In [None]:
# define filter functions
def filter_entity(feature, entity_list):
    '''
    tell whether any whitespace-separated word of feature is an entity name

    the comparison is case-insensitive and matches whole words only;
    returns True on the first match, False when no word matches
    '''
    # lowercase both sides so the lookup ignores case
    known_entities = {name.lower() for name in entity_list}
    return any(token in known_entities for token in feature.lower().split())

def retrieve_entity(feature, entity_list):
    '''
    return the first word of feature that is an entity name

    the comparison is case-insensitive and the matched word is returned in
    lowercase; returns None when no word of feature is an entity
    '''
    # lowercase both sides so the lookup ignores case
    known_entities = {name.lower() for name in entity_list}
    tokens = feature.lower().split()
    return next((token for token in tokens if token in known_entities), None)

In [None]:
# load sample data
# read the train and test splits, keeping only the columns used below
all_train = pd.read_csv('data/all_train.csv', header=0).loc[:, ["date_time", "text", "label"]]
all_test = pd.read_csv('data/all_test.csv', header=0).loc[:, ["date_time", "text", "label"]]

# stack both splits row-wise into one sample to explain
all_sample = pd.concat((all_train, all_test))

# load entity list (duplicates collapse because it is stored as a set)
entity_list = set(pd.read_csv("data/entity_list.csv", header=0).loc[:, "entity"])

In [None]:
# preview the first five rows of the combined sample
all_sample.head(n=5)

## Logistic Regression

In [None]:
# chain the word-embedding step with a logistic regression classifier
vec, lr = FeatureEmbedder(), LogisticRegression()
pipe_lr = make_pipeline(vec, lr)

# train on the training split only
pipe_lr.fit(all_train["text"], all_train["label"])

In [7]:
# retrieve all features and weights from all possible texts
# Frames are gathered in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all earlier rows on every iteration.
lr_frames = []
for current_text in all_sample["text"]:
    # create text explainer (a fresh one per text, same seed each time)
    te_lr = TextExplainer(random_state=42)
    te_lr.fit(current_text, pipe_lr.predict_proba)

    # create dataframe from weights
    current_df = eli5.formatters.as_dataframe.format_as_dataframes(te_lr.explain_weights(top=None))["targets"]
    lr_frames.append(current_df)

# newest-first, matching the row order the earlier prepend-style concat produced
lr_df = pd.concat(lr_frames[::-1], axis=0)

In [None]:
# average the collected columns per feature and turn the group key back
# into an ordinary "feature" column
lr_df_averaged = lr_df.groupby("feature").mean().reset_index()

lr_df_averaged

In [None]:
# retrieve rows that have entity names
# .copy() makes the subset an independent frame, so adding a column below
# does not write into a slice of lr_df_averaged (SettingWithCopyWarning)
lr_entity_mask = lr_df_averaged["feature"].apply(lambda x: filter_entity(x, entity_list))
lr_df_entity = lr_df_averaged[lr_entity_mask].copy()

# label rows as their entity
lr_df_entity["feature_entity"] = lr_df_entity["feature"].apply(lambda x: retrieve_entity(x, entity_list))

# output entity: average the numeric columns per entity
# numeric_only=True leaves out the text "feature" column, which recent
# pandas versions refuse to average instead of silently dropping
lr_df_entity_grouped = lr_df_entity.groupby("feature_entity").mean(numeric_only=True)
lr_df_entity_grouped = lr_df_entity_grouped.reset_index()

lr_df_entity_grouped

In [None]:
# write each logistic regression result table to its own csv, without the index
for frame, path in (
    (lr_df_averaged, "data/evaluation_lime/lime_word2vec_lr.csv"),
    (lr_df_entity, "data/evaluation_lime/lime_entity_ungrouped_word2vec_lr.csv"),
    (lr_df_entity_grouped, "data/evaluation_lime/lime_entity_word2vec_lr.csv"),
):
    frame.to_csv(path, index=False)

## SVM

In [None]:
# chain the word-embedding step with an SVM classifier;
# probability=True is needed because predict_proba is used further down
vec, svm = FeatureEmbedder(), SVC(probability=True)
pipe_svm = make_pipeline(vec, svm)

# train on the training split only
pipe_svm.fit(all_train["text"], all_train["label"])

In [None]:
# retrieve all features and weights from all possible texts
# Frames are gathered in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all earlier rows on every iteration.
svm_frames = []
for current_text in all_sample["text"]:
    # create text explainer (a fresh one per text, same seed each time)
    te_svm = TextExplainer(random_state=42)
    te_svm.fit(current_text, pipe_svm.predict_proba)

    # create dataframe from weights
    current_df = eli5.formatters.as_dataframe.format_as_dataframes(te_svm.explain_weights(top=None))["targets"]
    svm_frames.append(current_df)

# newest-first, matching the row order the earlier prepend-style concat produced
svm_df = pd.concat(svm_frames[::-1], axis=0)

In [None]:
# average the collected columns per feature and turn the group key back
# into an ordinary "feature" column
svm_df_averaged = svm_df.groupby("feature").mean().reset_index()

svm_df_averaged

In [None]:
# retrieve rows that have entity names
# .copy() makes the subset an independent frame, so adding a column below
# does not write into a slice of svm_df_averaged (SettingWithCopyWarning)
svm_entity_mask = svm_df_averaged["feature"].apply(lambda x: filter_entity(x, entity_list))
svm_df_entity = svm_df_averaged[svm_entity_mask].copy()

# label rows as their entity
svm_df_entity["feature_entity"] = svm_df_entity["feature"].apply(lambda x: retrieve_entity(x, entity_list))

# output entity: average the numeric columns per entity
# numeric_only=True leaves out the text "feature" column, which recent
# pandas versions refuse to average instead of silently dropping
svm_df_entity_grouped = svm_df_entity.groupby("feature_entity").mean(numeric_only=True)
svm_df_entity_grouped = svm_df_entity_grouped.reset_index()

svm_df_entity_grouped

In [None]:
# write each SVM result table to its own csv, without the index
for frame, path in (
    (svm_df_averaged, "data/evaluation_lime/lime_word2vec_svm.csv"),
    (svm_df_entity, "data/evaluation_lime/lime_entity_ungrouped_word2vec_svm.csv"),
    (svm_df_entity_grouped, "data/evaluation_lime/lime_entity_word2vec_svm.csv"),
):
    frame.to_csv(path, index=False)

In [None]:
# individual text explainer
# te_svm = TextExplainer(random_state=42)
# te_svm.fit(all_test.text.iloc[1], pipe_svm.predict_proba)
# te_svm.explain_weights(top=None)

## Random Forest

In [None]:
# chain the word-embedding step with a random forest classifier
vec, rf = FeatureEmbedder(), RandomForestClassifier()
pipe_rf = make_pipeline(vec, rf)

# train on the training split only
pipe_rf.fit(all_train["text"], all_train["label"])

In [None]:
# retrieve all features and weights from all possible texts
# Frames are gathered in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all earlier rows on every iteration.
rf_frames = []
for current_text in all_sample["text"]:
    # create text explainer (a fresh one per text, same seed each time)
    te_rf = TextExplainer(random_state=42)
    # explain the random forest pipeline; this previously passed
    # pipe_svm.predict_proba, so the "rf" results described the SVM instead
    te_rf.fit(current_text, pipe_rf.predict_proba)

    # create dataframe from weights
    current_df = eli5.formatters.as_dataframe.format_as_dataframes(te_rf.explain_weights(top=None))["targets"]
    rf_frames.append(current_df)

# newest-first, matching the row order the earlier prepend-style concat produced
rf_df = pd.concat(rf_frames[::-1], axis=0)

In [None]:
# average the collected columns per feature and turn the group key back
# into an ordinary "feature" column
rf_df_averaged = rf_df.groupby("feature").mean().reset_index()

rf_df_averaged

In [None]:
# retrieve rows that have entity names
# .copy() makes the subset an independent frame, so adding a column below
# does not write into a slice of rf_df_averaged (SettingWithCopyWarning)
rf_entity_mask = rf_df_averaged["feature"].apply(lambda x: filter_entity(x, entity_list))
rf_df_entity = rf_df_averaged[rf_entity_mask].copy()

# label rows as their entity
rf_df_entity["feature_entity"] = rf_df_entity["feature"].apply(lambda x: retrieve_entity(x, entity_list))

# output entity: average the numeric columns per entity
# numeric_only=True leaves out the text "feature" column, which recent
# pandas versions refuse to average instead of silently dropping
rf_df_entity_grouped = rf_df_entity.groupby("feature_entity").mean(numeric_only=True)
rf_df_entity_grouped = rf_df_entity_grouped.reset_index()

rf_df_entity_grouped

In [None]:
# write each random forest result table to its own csv, without the index
for frame, path in (
    (rf_df_averaged, "data/evaluation_lime/lime_word2vec_rf.csv"),
    (rf_df_entity, "data/evaluation_lime/lime_entity_ungrouped_word2vec_rf.csv"),
    (rf_df_entity_grouped, "data/evaluation_lime/lime_entity_word2vec_rf.csv"),
):
    frame.to_csv(path, index=False)