In [None]:
# import packages
import pickle
from collections import defaultdict

import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

#lime
from lime.lime_text import LimeTextExplainer

In [None]:
# Load the train and test folds, keeping the three phrase columns and the label,
# then stack them into one frame (`all_sample`) with a fresh 0..n-1 index.
sample_cols = ["phrase_stem_emoticon_generic", "phrase_stem_emoticon_unique", "phrase_emoticon_generic", "label"]

all_train = pd.read_csv('data/stacking_folds/train_all.csv', header=0)[sample_cols]
all_train["label"] = all_train["label"].astype('int32')  # convert target to int

all_test = pd.read_csv('data/stacking_folds/test.csv', header=0)[sample_cols]
all_test["label"] = all_test["label"].astype('int32')  # convert target to int

all_sample = pd.concat([all_train, all_test], axis=0, ignore_index=True)
print(all_sample["label"].value_counts())

In [None]:
# Preview the first five rows of the combined train + test sample.
all_sample.head(n=5)

# LIME

In [None]:
def append_dict_of_scores(d, lime_exp, label):
    """Append the weights from one LIME explanation to a running per-token dict.

    Parameters
    ----------
    d : dict
        Maps token -> list of weights; mutated in place.
    lime_exp : object
        Explanation exposing ``as_list(label=...)``, which yields
        ``(token, weight)`` pairs.
    label : int
        Label index whose explanation is read.
    """
    for pair in lime_exp.as_list(label=label):
        token, weight = pair[0], pair[1]
        # create the list on first sight of the token, then append
        d.setdefault(token, []).append(weight)


def generate_lime_scores(pipe, phrase_ver, data=None, num_samples=100):
    """Explain every phrase with LIME and collect per-token weights per class.

    Parameters
    ----------
    pipe : object
        Fitted text pipeline exposing ``predict_proba``.
    phrase_ver : str
        Name of the text column to explain.
    data : pandas.DataFrame, optional
        Frame holding ``phrase_ver``. Defaults to the notebook-level
        ``all_sample`` so existing two-argument calls behave as before.
    num_samples : int, default 100
        Size of the neighbourhood LIME samples to learn its linear model.

    Returns
    -------
    tuple of dict
        ``(d_neg, d_neu, d_pos)``: token -> list of LIME weights for label
        indices 0, 1 and 2 respectively.
    """
    frame = all_sample if data is None else data
    texts = frame[phrase_ver]

    # The original used defaultdict() with no factory, which is just a plain dict.
    d_neg = defaultdict(list)
    d_neu = defaultdict(list)
    d_pos = defaultdict(list)

    class_names = [-1, 0, 1] # ordered according to the classifier
    explainer = LimeTextExplainer(class_names=class_names, random_state=42)
    for current_text in tqdm(texts, total=len(texts)):
        # labels: label indices to be explained (one per class)
        exp = explainer.explain_instance(current_text, pipe.predict_proba,
                                         labels=[0, 1, 2], num_samples=num_samples)
        append_dict_of_scores(d_neg, exp, 0) # class_names[0] = -1
        append_dict_of_scores(d_neu, exp, 1)
        append_dict_of_scores(d_pos, exp, 2)
    return d_neg, d_neu, d_pos

In [None]:
def dict_to_df(d, newcols):
    """Turn a flat ``{key: value}`` dict into a two-column DataFrame.

    Parameters
    ----------
    d : dict
        Mapping to convert; each key becomes one row.
    newcols : list of str
        Column names for the key column and the value column, in that order.

    Returns
    -------
    pandas.DataFrame
        One row per key, with columns named by ``newcols``.
    """
    # A single-row frame has the keys as columns; transposing moves them to
    # the index, and reset_index turns that index into a regular column.
    frame = pd.DataFrame([d]).transpose().reset_index()
    frame.columns = newcols
    return frame

In [None]:
def get_token_avg_score(dpos, dneu, dneg):
    """Average per-token LIME weights for each class and join them into one table.

    Parameters
    ----------
    dpos, dneu, dneg : dict
        Token -> non-empty list of weights for the positive, neutral and
        negative class respectively.

    Returns
    -------
    pandas.DataFrame
        Columns ``token``, ``average_pos_impact``, ``average_neg_impact`` and
        ``average_neu_impact``, sorted by ``average_pos_impact`` descending.
        Only tokens present in all three dicts are kept (inner joins).
    """
    def _mean_scores(scores):
        # each value is the list of impacts on the class probability for one token
        return {token: sum(vals) / float(len(vals)) for token, vals in scores.items()}

    pos = dict_to_df(_mean_scores(dpos), ['token', 'average_pos_impact'])
    neg = dict_to_df(_mean_scores(dneg), ['token', 'average_neg_impact'])
    neu = dict_to_df(_mean_scores(dneu), ['token', 'average_neu_impact'])
    fin = pos.merge(neg, on='token', how='inner').merge(neu, on='token', how='inner')
    # sort_values returns a new frame; the original discarded it and so
    # returned the table unsorted.
    fin = fin.sort_values(['average_pos_impact'], ascending=False).reset_index(drop=True)
    return fin

# LOGISTIC REGRESSION

In [None]:
# Logistic-regression pipeline: TF-IDF over word uni- and bi-grams feeding a
# class-balanced LogisticRegression, fitted on the stemmed / unique-emoticon phrases.
vec = TfidfVectorizer(analyzer="word", lowercase=True, ngram_range=(1, 2), max_df=0.25)
lr = LogisticRegression(C=5, class_weight='balanced')
pipe_lr = make_pipeline(vec, lr)
pipe_lr.fit(all_train["phrase_stem_emoticon_unique"], all_train["label"])

# ELI5

In [None]:
# load data
# NOTE(review): this rebinds `all_train` / `all_test` to the full CSV contents
# (all columns, label not cast to int32); every cell below uses these frames.
all_train = pd.read_csv('data/stacking_folds/train_all.csv', header = 0)
all_test = pd.read_csv('data/stacking_folds/test.csv', header = 0)
full_df = pd.concat([all_train, all_test], axis=0).reset_index(drop=True)
full_df_subset = full_df[["new_aspect_1", "phrase_stem_emoticon_unique", "label"]]

# load saved models
# pickle can execute arbitrary code while loading: only open trusted files.
vect_pkl_filename = "saved_models/model_logreg_vectorizer.pkl"
model_pkl_filename = "saved_models/model_logreg.pkl"
# context managers close the file handles, which the original open() calls leaked
with open(vect_pkl_filename, "rb") as vect_file:
    lr_vectorizer = pickle.load(vect_file)
with open(model_pkl_filename, "rb") as model_file:
    lr_model = pickle.load(model_file)

In [None]:
# Feature weights of the pickled logistic-regression model as a DataFrame.
# Requires `import eli5` in the imports cell; without it this cell raises
# NameError on a fresh kernel.
lr_eval = eli5.explain_weights_df(lr_model, vec=lr_vectorizer, top=20)

# one frame per target class for the per-class plots below
lr_eval_neg = lr_eval[lr_eval.target == -1.0]
lr_eval_neu = lr_eval[lr_eval.target == 0.0]
lr_eval_pos = lr_eval[lr_eval.target == 1.0]

# save results
# NOTE(review): the file is named "logreg_lime.csv" although it holds ELI5
# weights, and unlike the LIME CSVs it is written with the index column.
# Confirm downstream readers before changing either.
lr_eval.to_csv("data/explain_results/logreg_lime.csv")

In [None]:
# One horizontal-bar panel per class: red bars for negative weights, green otherwise.
fig, axes = plt.subplots(1, 3, figsize=(20, 10))
panels = [
    (lr_eval_neg, 'y=-1.0 top features'),
    (lr_eval_neu, 'y=0.0 top features'),
    (lr_eval_pos, 'y=1.0 top features'),
]
for ax, (top_df, panel_title) in zip(axes, panels):
    bar_colors = ["#E3242B" if w < 0 else "#00AB6B" for w in top_df.weight]
    ax.barh(top_df.feature, top_df.weight, height=0.8, color=bar_colors)
    ax.set_title(panel_title)
    ax.set_ylabel('Features')
    ax.set_xlabel('Weight')

plt.show()

# LIME 

In [None]:
# LIME explanations for the logistic-regression pipeline on the column it was fitted on.
dneg, dneu, dpos = generate_lime_scores(pipe=pipe_lr, phrase_ver="phrase_stem_emoticon_unique")

In [None]:
# Average the per-token LIME weights and persist them.
# Note: `lr_eval` is rebound here; it previously held the ELI5 weights.
lr_eval = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
lr_eval.to_csv('data/explain_results/lr_lime.csv', index=False)

## Visualise Results

In [None]:
# Rank tokens by the magnitude of their average impact, so that strong effects
# in either direction surface.
lr_eval["mag_neg"] = lr_eval["average_neg_impact"].abs()
lr_eval["mag_neu"] = lr_eval["average_neu_impact"].abs()
lr_eval["mag_pos"] = lr_eval["average_pos_impact"].abs()

# keep the 20 highest-magnitude tokens per class
lr_eval_neg = lr_eval.nlargest(n=20, columns="mag_neg")
lr_eval_neu = lr_eval.nlargest(n=20, columns="mag_neu")
lr_eval_pos = lr_eval.nlargest(n=20, columns="mag_pos")

In [None]:
# One horizontal-bar panel per class: red bars for negative average impact, green otherwise.
fig, axes = plt.subplots(1, 3, figsize=(20, 10))
panels = [
    (lr_eval_neg, "average_neg_impact", 'y=-1.0 top features'),
    (lr_eval_neu, "average_neu_impact", 'y=0.0 top features'),
    (lr_eval_pos, "average_pos_impact", 'y=1.0 top features'),
]
for ax, (top_df, impact_col, panel_title) in zip(axes, panels):
    impacts = top_df[impact_col]
    bar_colors = ["#E3242B" if w < 0 else "#00AB6B" for w in impacts]
    ax.barh(top_df.token, impacts, height=0.8, color=bar_colors)
    ax.set_title(panel_title)
    ax.set_ylabel('Features')
    ax.set_xlabel('Weight')

plt.show()

# NAIVE BAYES

In [None]:
# Naive Bayes pipeline: unigram counts (max_df=0.25, min_df=10) feeding MultinomialNB,
# fitted on the stemmed / generic-emoticon phrases.
vec = CountVectorizer(analyzer="word", lowercase=True, ngram_range=(1, 1), max_df=0.25, min_df=10)
nb = MultinomialNB(alpha=0.5)
pipe_nb = make_pipeline(vec, nb)
pipe_nb.fit(all_train["phrase_stem_emoticon_generic"], all_train["label"])

# LIME 

In [None]:
# LIME explanations for the Naive Bayes pipeline on the column it was fitted on.
dneg, dneu, dpos = generate_lime_scores(pipe=pipe_nb, phrase_ver="phrase_stem_emoticon_generic")

In [None]:
# Average the per-token LIME weights for Naive Bayes and persist them.
nb_eval = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
nb_eval.to_csv('data/explain_results/nb_lime.csv', index=False)

## Visualise Results

In [None]:
# Rank tokens by the magnitude of their average impact, so that strong effects
# in either direction surface.
nb_eval["mag_neg"] = nb_eval["average_neg_impact"].abs()
nb_eval["mag_neu"] = nb_eval["average_neu_impact"].abs()
nb_eval["mag_pos"] = nb_eval["average_pos_impact"].abs()

# keep the 20 highest-magnitude tokens per class
nb_eval_neg = nb_eval.nlargest(n=20, columns="mag_neg")
nb_eval_neu = nb_eval.nlargest(n=20, columns="mag_neu")
nb_eval_pos = nb_eval.nlargest(n=20, columns="mag_pos")

In [None]:
# One horizontal-bar panel per class: red bars for negative average impact, green otherwise.
fig, axes = plt.subplots(1, 3, figsize=(20, 10))
panels = [
    (nb_eval_neg, "average_neg_impact", 'y=-1.0 top features'),
    (nb_eval_neu, "average_neu_impact", 'y=0.0 top features'),
    (nb_eval_pos, "average_pos_impact", 'y=1.0 top features'),
]
for ax, (top_df, impact_col, panel_title) in zip(axes, panels):
    impacts = top_df[impact_col]
    bar_colors = ["#E3242B" if w < 0 else "#00AB6B" for w in impacts]
    ax.barh(top_df.token, impacts, height=0.8, color=bar_colors)
    ax.set_title(panel_title)
    ax.set_ylabel('Features')
    ax.set_xlabel('Weight')

plt.show()

# RF

In [None]:
# create pipeline
# Random-forest pipeline: unigram counts with no document-frequency pruning
# (max_df=1.0, min_df=1), fitted on the stemmed / generic-emoticon phrases.
vec_rf = CountVectorizer(analyzer="word",
    lowercase= True,
    ngram_range =(1,1),
    max_df = 1.0,
    min_df = 1)
# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3;
# for classifiers it meant "sqrt", so this keeps the behaviour and runs on
# current releases. random_state pins the forest, which was previously
# unseeded, so the LIME results are reproducible across runs.
rf = RandomForestClassifier(criterion = "gini", min_samples_split = 5, class_weight=None,
                            max_features="sqrt", min_samples_leaf=1, random_state=42)
pipe_rf = make_pipeline(vec_rf, rf)
pipe_rf.fit(all_train.phrase_stem_emoticon_generic, all_train.label)

# LIME

In [None]:
# LIME explanations for the random-forest pipeline on the column it was fitted on.
dneg, dneu, dpos = generate_lime_scores(pipe=pipe_rf, phrase_ver="phrase_stem_emoticon_generic")

In [None]:
# Average the per-token LIME weights for the random forest and persist them.
rf_eval = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
rf_eval.to_csv('data/explain_results/rf_lime.csv', index=False)

## Visualise Results

In [None]:
# Rank tokens by the magnitude of their average impact, so that strong effects
# in either direction surface.
rf_eval["mag_neg"] = rf_eval["average_neg_impact"].abs()
rf_eval["mag_neu"] = rf_eval["average_neu_impact"].abs()
rf_eval["mag_pos"] = rf_eval["average_pos_impact"].abs()

# keep the 20 highest-magnitude tokens per class
rf_eval_neg = rf_eval.nlargest(n=20, columns="mag_neg")
rf_eval_neu = rf_eval.nlargest(n=20, columns="mag_neu")
rf_eval_pos = rf_eval.nlargest(n=20, columns="mag_pos")

In [None]:
# One horizontal-bar panel per class: red bars for negative average impact, green otherwise.
fig, axes = plt.subplots(1, 3, figsize=(20, 10))
panels = [
    (rf_eval_neg, "average_neg_impact", 'y=-1.0 top features'),
    (rf_eval_neu, "average_neu_impact", 'y=0.0 top features'),
    (rf_eval_pos, "average_pos_impact", 'y=1.0 top features'),
]
for ax, (top_df, impact_col, panel_title) in zip(axes, panels):
    impacts = top_df[impact_col]
    bar_colors = ["#E3242B" if w < 0 else "#00AB6B" for w in impacts]
    ax.barh(top_df.token, impacts, height=0.8, color=bar_colors)
    ax.set_title(panel_title)
    ax.set_ylabel('Features')
    ax.set_xlabel('Weight')

plt.show()

# SVM

In [None]:
# create pipeline
# SVM pipeline: unigram counts (max_df=0.25, min_df=1) feeding an RBF-kernel SVC
# with probability estimates enabled, because LIME calls predict_proba.
vec_svm = CountVectorizer(analyzer="word",
    lowercase= True,
    ngram_range =(1,1),
    max_df = 0.25,
    min_df = 1)
# random_state pins the shuffling SVC uses for its probability estimates,
# so repeated runs give the same predict_proba output.
svm = SVC(C=5, kernel='rbf', probability=True,class_weight=None,gamma='scale', random_state=42)
pipe_svm = make_pipeline(vec_svm, svm)
# Fit on the same column the LIME cell explains (phrase_emoticon_generic). The
# original fitted on phrase_stem_emoticon_generic, so the model was explained
# on unstemmed text it had never been trained on.
# NOTE(review): confirm that the unstemmed column is the intended SVM input.
pipe_svm.fit(all_train.phrase_emoticon_generic, all_train.label)

# LIME

In [None]:
# LIME explanations for the SVM pipeline, computed on the phrase_emoticon_generic column.
dneg, dneu, dpos = generate_lime_scores(pipe=pipe_svm, phrase_ver="phrase_emoticon_generic")

In [None]:
# Average the per-token LIME weights for the SVM and persist them.
svm_eval = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
svm_eval.to_csv('data/explain_results/svm_lime.csv', index=False)

## Visualise Results

In [None]:
# Rank tokens by the magnitude of their average impact, so that strong effects
# in either direction surface.
svm_eval["mag_neg"] = svm_eval["average_neg_impact"].abs()
svm_eval["mag_neu"] = svm_eval["average_neu_impact"].abs()
svm_eval["mag_pos"] = svm_eval["average_pos_impact"].abs()

# keep the 20 highest-magnitude tokens per class
svm_eval_neg = svm_eval.nlargest(n=20, columns="mag_neg")
svm_eval_neu = svm_eval.nlargest(n=20, columns="mag_neu")
svm_eval_pos = svm_eval.nlargest(n=20, columns="mag_pos")

In [None]:
# One horizontal-bar panel per class: red bars for negative average impact, green otherwise.
fig, axes = plt.subplots(1, 3, figsize=(20, 10))
panels = [
    (svm_eval_neg, "average_neg_impact", 'y=-1.0 top features'),
    (svm_eval_neu, "average_neu_impact", 'y=0.0 top features'),
    (svm_eval_pos, "average_pos_impact", 'y=1.0 top features'),
]
for ax, (top_df, impact_col, panel_title) in zip(axes, panels):
    impacts = top_df[impact_col]
    bar_colors = ["#E3242B" if w < 0 else "#00AB6B" for w in impacts]
    ax.barh(top_df.token, impacts, height=0.8, color=bar_colors)
    ax.set_title(panel_title)
    ax.set_ylabel('Features')
    ax.set_xlabel('Weight')

plt.show()