<a href="https://colab.research.google.com/github/RobertNimmo26/Toxic-Comments-XAI-Study/blob/main/notebooks/toxic_classifier_explanations_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install packages
!pip install detoxify
!pip install lime

In [None]:
# import packages
from detoxify import Detoxify

import lime
from lime.lime_text import LimeTextExplainer

import numpy as np
import pandas as pd

import json

from google.colab import files

In [None]:
# load dataframe
# NOTE(review): the path is relative to the working directory and appears to
# assume Google Drive is mounted at ./drive; no mount call is visible in this
# file, so confirm that happens before this cell runs.
csv_path = 'drive/MyDrive/University/msci_project/data_output.csv'

# keep only the three columns the rest of the notebook reads
df = pd.read_csv(csv_path)[["id","comment_text","toxic"]]

# split the rows on the value of the `toxic` column
toxic_mask = df['toxic'] == 1
nontoxic_mask = df['toxic'] == 0
df_toxic = df.loc[toxic_mask]
df_nontoxic = df.loc[nontoxic_mask]

In [None]:
# create lime explainer object
# class_names order must line up with the columns returned by predict():
# column 0 -> 'Non-toxic', column 1 -> 'Toxic'
class_names = ['Non-toxic','Toxic']
# fixed random_state so the perturbation sampling, and therefore the exported
# word weights, are reproducible across runs (same seed value the row sampling uses)
explainer = LimeTextExplainer(class_names=class_names, random_state=10)

In [None]:
# prediction function
_detoxify_model = None  # shared Detoxify instance, created on the first predict() call


def _get_detoxify_model():
  """Return the shared Detoxify('original') model, constructing it on first use."""
  global _detoxify_model
  if _detoxify_model is None:
    _detoxify_model = Detoxify('original')
  return _detoxify_model


def predict(x):
  """Score text with Detoxify and return [non-toxic, toxic] probabilities.

  The model is built once and reused; constructing Detoxify('original') on
  every call repeated the model load for every explained row.

  Args:
    x: a string or a list of strings, forwarded unchanged to Detoxify.predict.

  Returns:
    numpy array holding 1 - toxicity and toxicity, in that order. For a list
    input the result has one row per text and two columns; for a single
    string it is a length-2 vector.
  """
  toxic_scores = np.asarray(_get_detoxify_model().predict(x)["toxicity"])
  # stack as (2, n) and transpose so each row is [non-toxic, toxic]
  return np.array([1 - toxic_scores, toxic_scores]).T

In [None]:
def create_exp_dict(exp, prediction_proba, x, id, threshold=0.8):
  """Build a JSON-serialisable summary of one explained prediction.

  Args:
    exp: explanation object exposing as_list(), which yields (word, weight) pairs.
    prediction_proba: toxicity score of the comment (anything float() accepts).
    x: the comment text.
    id: identifier of the comment. The name shadows the builtin but is kept
      so existing keyword calls keep working.
    threshold: lowest toxicity score that is labelled "Toxic". Defaults to
      0.8, the value that was previously hard-coded.

  Returns:
    dict with keys "id", "comment", "prediction_proba" (percentage for the
    reported label, rounded to 2 decimals), "prediction_label" ("Toxic" or
    "Non-toxic") and "important_words" (list of {"word", "weight"} dicts with
    weights rounded to 4 decimals).
  """
  toxic_proba = float(prediction_proba)
  is_toxic = toxic_proba >= threshold
  # below the threshold the reported probability is that of the non-toxic class
  label_proba = toxic_proba if is_toxic else 1 - toxic_proba

  return {
    "id": id,
    "comment": x,
    # scale to a percentage *before* rounding: rounding first and then
    # multiplying by 100 leaves float noise such as 56.99999999999999
    "prediction_proba": round(label_proba * 100, 2),
    "prediction_label": "Toxic" if is_toxic else "Non-toxic",
    "important_words": [
      {"word": word, "weight": round(float(weight), 4)}
      for word, weight in exp.as_list()
    ],
  }

In [None]:
def generate_explanations(row, verbose=False, num_features=10, num_samples=100):
  """Explain one comment with LIME and package the result as a dict.

  Args:
    row: object with `id` and `comment_text` attributes (the notebook passes
      rows produced by DataFrame.itertuples()).
    verbose: when True, print the row id before explaining it.
    num_features: forwarded to explainer.explain_instance; defaults to the
      previously hard-coded 10.
    num_samples: forwarded to explainer.explain_instance; defaults to the
      previously hard-coded 100.

  Returns:
    The dict built by create_exp_dict for this row.
  """
  if verbose:
    print(f"Generating explanations for {row.id}")
  comment = row.comment_text
  exp = explainer.explain_instance(comment, predict, num_features=num_features, num_samples=num_samples)
  # Score the unperturbed comment through the same predict() helper LIME uses
  # instead of constructing another Detoxify model here; column 1 of its
  # output is the toxicity score.
  prediction_proba = predict([comment])[0][1]
  return create_exp_dict(exp, prediction_proba, comment, row.id)

In [None]:
# draw 60 rows for each value of `toxic`, then shuffle them together;
# the shuffle is seeded as well so a re-run yields the same rows in the same order
temp_df = (
  df.groupby("toxic")
  .sample(n=60, random_state=10)
  .sample(frac=1, random_state=10)
)

In [None]:
# generate explanation for each row
# results are appended one at a time, so rows finished before an interruption stay in `output`
output = []
for sample_row in temp_df.itertuples():
  output.append(generate_explanations(sample_row, True))

In [None]:
# serialise the collected explanations before touching the file system
json_object = json.dumps(output)

# write the JSON text to explanationData.json
output_path = "explanationData.json"
with open(output_path, "w") as outfile:
    outfile.write(json_object)

# download the written file through google.colab.files
files.download(output_path)

In [None]:
# spot check: fetch the text of one specific comment by its id
# (indexing [0] raises IndexError when no row has this id)
id_matches = df["id"] == "05fea958cf35e6ed"
temp_comment_text = df.loc[id_matches, "comment_text"].values[0]

In [None]:
# spot check: run the Detoxify model on temp_comment_text and keep only its "toxicity" entry
detoxify_scores = Detoxify('original').predict(temp_comment_text)
prediction_proba = detoxify_scores["toxicity"]

In [None]:
# display the toxicity score stored in prediction_proba as the cell output
prediction_proba