In [3]:
import sys
sys.path.append("../..")
import os

import pandas as pd

# sklearn
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from interpret_text.experimental.classical import ClassicalTextExplainer

# for testing
from scrapbook.api import glue

working_dir = os.getcwd()

In [4]:
DATA_FOLDER = r'../dataset'
TRAIN_SIZE = 0.7
TEST_SIZE = 0.3

In [5]:
df = pd.read_csv(DATA_FOLDER + "/concat_sentence_text.csv")
df.head()

Unnamed: 0,sentence_text,practice,modality
0,"IP ADDRESS, COOKIES, AND WEB BEACONS",Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
1,"IP ADDRESS, COOKIES, AND WEB BEACONS",Identifier_IP_Address_1stParty,PERFORMED
2,"IP addresses will be collected, along with inf...",Identifier_IP_Address_1stParty,PERFORMED
3,The information that our products collect incl...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
4,The information that our products collect incl...,Identifier_IP_Address_1stParty,PERFORMED


In [6]:
### Limit to top 5 practices, data cleaning
df["sentence_text"] = df["sentence_text"].astype("string")
df["practice"] = df["practice"].astype("category")
df["practice"] = df["practice"].astype("category")

top_5_cats = ["Identifier_Cookie_or_similar_Tech_1stParty", "Contact_E_Mail_Address_1stParty", "Location_1stParty", "Identifier_Cookie_or_similar_Tech_3rdParty", "Identifier_IP_Address_1stParty"]
df = df[df["practice"].isin(top_5_cats)]

In [7]:
# fetch documents and labels from data frame
X_str = df['sentence_text']  # the document we want to analyze
ylabels = df['practice'] # the labels, or answers, we want to test against

In [8]:
# Create explainer object that contains default glassbox classifier and explanation methods
explainer = ClassicalTextExplainer(n_jobs=-1, tol=0.1)
label_encoder = LabelEncoder()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_str, ylabels, train_size=0.8, test_size=0.2)
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [10]:
print("X_train shape =" + str(X_train.shape))
print("y_train shape =" + str(y_train.shape))
print("X_train data structure = " + str(type(X_train)))

X_train shape =(6385,)
y_train shape =(6385,)
X_train data structure = <class 'pandas.core.series.Series'>


In [11]:
classifier, best_params = explainer.fit(X_train, y_train)

  "The parameter 'token_pattern' will not be used"


In [12]:
# obtain best classifier and hyper params
print("best classifier: " + str(best_params))

best classifier: {'C': 10000, 'multi_class': 'multinomial', 'solver': 'saga', 'tol': 0.1}


In [13]:
mean_accuracy = classifier.score(X_test, y_test, sample_weight=None)
print("accuracy = " + str(mean_accuracy * 100) + "%")
y_pred = classifier.predict(X_test)
[precision, recall, fscore, support] = precision_recall_fscore_support(y_test, y_pred,average='macro')

accuracy = 64.30807764558547%


In [15]:
sample_text = "However, some features and services may not function properly if cookies are disabled."

In [16]:
y = classifier.predict(sample_text)
predicted_label = label_encoder.inverse_transform(y)
local_explanation = explainer.explain_local(sample_text, predicted_label)

AttributeError: 'LabelEncoder' object has no attribute 'classes_'

In [18]:
predicted_label

array(['Identifier_Cookie_or_similar_Tech_1stParty'], dtype=object)