In [None]:
# import packages
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#lime
from lime.lime_text import LimeTextExplainer
from collections import defaultdict
from tqdm import tqdm

In [None]:
# load sample data: read the train and test folds, keep the same four columns
# from each, then stack them into one frame with a fresh 0..n-1 index
all_train = pd.read_csv(
    'data/stacking_folds/train_all.csv', header=0
).loc[:, ["phrase_stem_emoticon_generic", "phrase_stem_emoticon_unique", "phrase", "label"]]
all_test = pd.read_csv(
    'data/stacking_folds/test.csv', header=0
).loc[:, ["phrase_stem_emoticon_generic", "phrase_stem_emoticon_unique", "phrase", "label"]]
all_sample = pd.concat([all_train, all_test], axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the combined train + test frame built above.
all_sample.head()

# LIME

In [None]:
def append_dict_of_scores(d, lime_exp, label):
    """Accumulate one explanation's feature weights into ``d`` (in place).

    Args:
        d: dict-like mapping feature -> list of weights; it is mutated.
        lime_exp: object exposing ``as_list(label=...)`` that returns a
            sequence of entries whose first element is the feature and whose
            second element is its weight.
        label: label forwarded unchanged to ``lime_exp.as_list``.

    Returns:
        None. Every entry's weight is appended to the list stored under its
        feature, creating that list the first time the feature is seen.
    """
    for entry in lime_exp.as_list(label=label):
        feature, weight = entry[0], entry[1]
        # setdefault creates the list on first sight, then we append to it
        d.setdefault(feature, []).append(weight)


def generate_lime_scores(pipe, phrase_ver, data=None):
    """Collect LIME token weights for every phrase, split by sentiment class.

    Args:
        pipe: fitted classifier pipeline exposing ``predict_proba`` (and,
            normally, ``classes_``) trained on the labels -1, 0 and 1.
        phrase_ver: name of the text column to explain.
        data: DataFrame holding the ``phrase_ver`` column. Defaults to the
            notebook-level ``all_sample`` frame.

    Returns:
        Tuple ``(d_neg, d_neu, d_pos)`` of dicts mapping token -> list of LIME
        weights, one weight per explanation in which the token was selected,
        for the classes -1, 0 and 1 respectively.

    Raises:
        ValueError: if one of the classes -1, 0, 1 is not known to ``pipe``.
    """
    if data is None:
        data = all_sample

    # LIME's `labels` (and the keys accepted by `as_list(label=...)`) are column
    # positions in the predict_proba output, NOT class values. Passing the raw
    # values [-1, 0, 1] makes -1 pick the last column, 0 the first and 1 the
    # second, which silently rotates the three classes. Translate each class
    # value to its column via the order the model reports.
    class_order = list(getattr(pipe, "classes_", [-1, 0, 1]))
    col_neg = class_order.index(-1)
    col_neu = class_order.index(0)
    col_pos = class_order.index(1)

    d_neg = defaultdict(list)
    d_neu = defaultdict(list)
    d_pos = defaultdict(list)

    # class_names are only used by LIME for display; keep them in column order
    explainer = LimeTextExplainer(class_names=[str(c) for c in class_order], random_state=42)
    for current_text in tqdm(data[phrase_ver]):
        exp = explainer.explain_instance(
            current_text, pipe.predict_proba, labels=[col_neg, col_neu, col_pos]
        )
        append_dict_of_scores(d_neg, exp, col_neg)
        append_dict_of_scores(d_neu, exp, col_neu)
        append_dict_of_scores(d_pos, exp, col_pos)
    return d_neg, d_neu, d_pos

In [None]:
def dict_to_df(d, newcols):
    """Turn a flat mapping into a two-column DataFrame, one row per key.

    The mapping is loaded as a single-row frame and transposed so that each
    key becomes a row; the keys are then moved out of the index into an
    ordinary column and the columns are renamed.

    Args:
        d: mapping of key -> value.
        newcols: the column names to assign, key column first.

    Returns:
        DataFrame whose columns are ``newcols``.
    """
    frame = pd.DataFrame([d]).transpose().reset_index()
    frame.columns = newcols
    return frame

In [None]:
def get_token_avg_score(dpos, dneu, dneg):
    """Average each token's LIME weights per class and combine them in one table.

    Args:
        dpos: mapping token -> list of weights for the positive class.
        dneu: mapping token -> list of weights for the neutral class.
        dneg: mapping token -> list of weights for the negative class.

    Returns:
        DataFrame with columns ``token``, ``average_pos_impact``,
        ``average_neg_impact`` and ``average_neu_impact``, sorted by
        ``average_pos_impact`` in descending order with a fresh 0..n-1 index.
        Only tokens present in all three mappings are kept (inner merges).
    """
    def _mean_per_token(scores):
        # each value is the list of impacts on the class probability for one token
        return {token: sum(vals) / float(len(vals)) for token, vals in scores.items()}

    pos = dict_to_df(_mean_per_token(dpos), ['token', 'average_pos_impact'])
    neg = dict_to_df(_mean_per_token(dneg), ['token', 'average_neg_impact'])
    neu = dict_to_df(_mean_per_token(dneu), ['token', 'average_neu_impact'])
    fin = pos.merge(neg, on='token', how='inner').merge(neu, on='token', how='inner')
    # sort_values returns a new frame rather than sorting in place, so the
    # result must be kept; resetting the index preserves the 0..n-1 index the
    # merged frame had before sorting.
    fin = fin.sort_values(['average_pos_impact'], ascending=False).reset_index(drop=True)
    return fin

# LOGISTIC REGRESSION

In [None]:
# create pipeline: word-level TF-IDF over unigrams and bigrams feeding a
# class-balanced logistic regression, fitted on the "unique" emoticon phrases
vec = TfidfVectorizer(
    analyzer="word",
    lowercase=True,
    ngram_range=(1, 2),
    max_df=0.25,
)
lr = LogisticRegression(C=5, class_weight='balanced')
pipe_lr = make_pipeline(vec, lr)
pipe_lr.fit(all_train["phrase_stem_emoticon_unique"], all_train["label"])

# LIME 

In [None]:
# Explain every phrase with the logistic-regression pipeline; the three dicts
# are unpacked in the order generate_lime_scores returns them.
dneg, dneu, dpos = generate_lime_scores(
    pipe=pipe_lr,
    phrase_ver="phrase_stem_emoticon_unique",
)

In [None]:
# Average the per-token weights for each class and persist the table.
lr_lime = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
lr_lime.to_csv('data/explain_results/lr_lime.csv', index=False)

# NAIVE BAYES

In [None]:
# create pipeline: unigram word counts (max_df=0.25, min_df=10) feeding a
# multinomial naive Bayes model, fitted on the "generic" emoticon phrases
vec = CountVectorizer(
    analyzer="word",
    lowercase=True,
    ngram_range=(1, 1),
    max_df=0.25,
    min_df=10,
)
nb = MultinomialNB(alpha=0.5)
pipe_nb = make_pipeline(vec, nb)
pipe_nb.fit(all_train["phrase_stem_emoticon_generic"], all_train["label"])

# LIME 

In [None]:
# Explain every phrase with the naive Bayes pipeline; the three dicts are
# unpacked in the order generate_lime_scores returns them.
dneg, dneu, dpos = generate_lime_scores(
    pipe=pipe_nb,
    phrase_ver="phrase_stem_emoticon_generic",
)

In [None]:
# Average the per-token weights for each class and persist the table.
nb_lime = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
nb_lime.to_csv('data/explain_results/nb_lime.csv', index=False)

# RF

In [None]:
# create pipeline: unigram word counts over the full vocabulary feeding a
# random forest, fitted on the "generic" emoticon phrases
vec_rf = CountVectorizer(analyzer="word",
    lowercase= True,
    ngram_range =(1,1),
    max_df = 1.0,
    min_df = 1)
# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3;
# for classifiers it was an alias of "sqrt", so "sqrt" keeps the same behaviour
# on every version. random_state makes the forest (and therefore the LIME
# scores derived from it) reproducible across runs.
rf = RandomForestClassifier(criterion = "gini", min_samples_split = 5, class_weight=None,
                            max_features="sqrt", min_samples_leaf=1, random_state=42)
pipe_rf = make_pipeline(vec_rf, rf)
pipe_rf.fit(all_train.phrase_stem_emoticon_generic, all_train.label)

# LIME

In [None]:
# Explain every phrase with the random-forest pipeline; the three dicts are
# unpacked in the order generate_lime_scores returns them.
dneg, dneu, dpos = generate_lime_scores(
    pipe=pipe_rf,
    phrase_ver="phrase_stem_emoticon_generic",
)

In [None]:
# Average the per-token weights for each class and persist the table.
rf_lime = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
rf_lime.to_csv('data/explain_results/rf_lime.csv', index=False)

# SVM

In [None]:
# create pipeline: unigram word counts (max_df=0.25) feeding an RBF-kernel SVM,
# fitted on the "generic" emoticon phrases
vec_svm = CountVectorizer(analyzer="word",
    lowercase= True,
    ngram_range =(1,1),
    max_df = 0.25,
    min_df = 1)
# probability=True calibrates predict_proba with an internal, randomly
# shuffled cross-validation; fixing random_state keeps those probabilities
# (which LIME explains) identical from run to run.
svm = SVC(C=5, kernel='rbf', probability=True,class_weight=None,gamma='scale', random_state=42)
pipe_svm = make_pipeline(vec_svm, svm)
pipe_svm.fit(all_train.phrase_stem_emoticon_generic, all_train.label)

# LIME

In [None]:
# Explain every phrase with the SVM pipeline; the three dicts are unpacked in
# the order generate_lime_scores returns them.
dneg, dneu, dpos = generate_lime_scores(
    pipe=pipe_svm,
    phrase_ver="phrase_stem_emoticon_generic",
)

In [None]:
# Average the per-token weights for each class and persist the table.
svm_lime = get_token_avg_score(dpos=dpos, dneu=dneu, dneg=dneg)
svm_lime.to_csv('data/explain_results/svm_lime.csv', index=False)