In [None]:
!pip install datasets
!pip install transformers
!pip install fasttext
!pip install ktrain
!pip install lime

In [None]:
!pip install imbalanced-learn

In [3]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.pipeline import Pipeline
from collections import Counter
import transformers
import numpy as np
import fasttext
import ktrain
from ktrain import text
from imblearn.over_sampling import RandomOverSampler
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

1. Get acquainted with the data of the Polish Cyberbullying detection dataset. Pay special attention to the distribution of the positive and negative examples in the first task as well as distribution of the classes in the second task.

In [4]:
def get_results(y_test, y_pred):
    """Print the confusion matrix, classification report and MCC for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    # Each section is a heading followed by the printed result of one sklearn metric.
    report_sections = (
        ("Confusion matrix: ", metrics.confusion_matrix),
        ("Classification: ", metrics.classification_report),
        ("MCC: ", metrics.matthews_corrcoef),
    )
    for heading, compute_metric in report_sections:
        print(heading)
        print(compute_metric(y_test, y_pred))

In [5]:
def transformer_model(model_name, x_train, y_train, x_test,
                      maxlen=25, batch_size=32, lr=5e-5, epochs=2):
    """Fine-tune a Hugging Face transformer with ktrain and predict on the test texts.

    Follows the ktrain Hugging Face tutorial:
    https://nbviewer.org/github/amaiya/ktrain/blob/develop/tutorials/tutorial-A3-hugging_face_transformers.ipynb

    Args:
        model_name: Model identifier passed to ``ktrain.text.Transformer``.
        x_train: Training texts.
        y_train: Training labels; their unique values are used as the class names.
        x_test: Texts to predict once training has finished.
        maxlen: Maximum sequence length handed to the ktrain preprocessor.
            The default of 25 was chosen because ktrain's preprocessing log reports a
            99th-percentile training length of 24 for this dataset.
            NOTE(review): that figure is a percentile, not the maximum, so longer texts
            are presumably truncated - confirm this is acceptable.
        batch_size: Batch size used by the learner.
        lr: Maximum learning rate of the one-cycle policy.
        epochs: Number of training epochs.

    Returns:
        list: ``[y_pred, predictor, learner]`` - the predictions for ``x_test``, the
        ktrain predictor (reused for the LIME explanations) and the trained learner.
    """
    t = ktrain.text.Transformer(model_name, maxlen=maxlen, class_names=np.unique(y_train))
    train = t.preprocess_train(x_train, y_train)
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=train, batch_size=batch_size)
    learner.fit_onecycle(lr, epochs)

    predictor = ktrain.get_predictor(learner.model, preproc=t)
    y_pred = predictor.predict(x_test)

    return [y_pred, predictor, learner]

2. Train the following classifiers on the training sets (for the task 1 and the task 2):
- Bayesian classifier with TF * IDF weighting.
- Fasttext text classifier
- Transformer classifier (take into account that a number of experiments should be performed for this model).

In [6]:
# Load the "task01" configuration of the cyberbullying dataset through Hugging Face `datasets`.
dataset = load_dataset("poleval2019_cyberbullying", "task01")

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading and preparing dataset poleval2019_cyber_bullying/task01 (download: 400.39 KiB, generated: 1.16 MiB, post-processed: Unknown size, total: 1.55 MiB) to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450...


Downloading:   0%|          | 0.00/340k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset poleval2019_cyber_bullying downloaded and prepared to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Pull the text and label columns out of the train and test splits.
x_train, y_train = dataset["train"]['text'], dataset["train"]['label']
x_test, y_test = dataset["test"]['text'], dataset["test"]['label']

Naive Bayes

In [8]:
# Naive Bayes: TF-IDF features, converted to dense arrays for the classifiers below.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which scikit-learn >= 1.2
# rejects with a TypeError when passed to fit()/predict().
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).toarray()
test_vector = vectorizer.transform(x_test).toarray()

In [9]:
# Gaussian Naive Bayes: fit on the training TF-IDF features, then score the test split.
gnb = GaussianNB()
gnb.fit(train_vector, y_train)

y_pred_gnb = gnb.predict(test_vector)
get_results(y_test, y_pred_gnb)

Confusion matrix: 
[[742 124]
 [ 94  40]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       866
           1       0.24      0.30      0.27       134

    accuracy                           0.78      1000
   macro avg       0.57      0.58      0.57      1000
weighted avg       0.80      0.78      0.79      1000

MCC: 
0.1428942557422714


In [10]:
# Multinomial Naive Bayes: fit on the training TF-IDF features, then score the test split.
mnb = MultinomialNB()
mnb.fit(train_vector, y_train)

y_pred_mnb = mnb.predict(test_vector)
get_results(y_test, y_pred_mnb)

Confusion matrix: 
[[866   0]
 [133   1]]
Classification: 
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       1.00      0.01      0.01       134

    accuracy                           0.87      1000
   macro avg       0.93      0.50      0.47      1000
weighted avg       0.88      0.87      0.81      1000

MCC: 
0.08043106192813237


FastText

In [11]:
# Prepare the fastText training file: one "__label__<label> <text>" example per line.
# A line break inside a text would split the example in two (the tail would be read as a
# separate, unlabeled example), so line breaks are flattened to spaces before writing.
with open("train_data.txt", "w", encoding="utf-8", newline="\n") as f:
    f.writelines(
        f"__label__{label} " + str(sentence).replace("\r\n", " ").replace("\n", " ") + "\n"
        for sentence, label in zip(x_train, y_train)
    )

In [12]:
# Train a supervised fastText classifier on "train_data.txt" with the library defaults.
model = fasttext.train_supervised("train_data.txt")

# predict() returns one tuple of labels per text together with the matching probabilities.
raw_labels, propabilities = model.predict(x_test)

# Strip the "__label__" prefix so the predictions are ints comparable with y_test.
y_pred_fast = []
for top_labels in raw_labels:
    y_pred_fast.append(int(top_labels[0].split("__label__")[1]))

get_results(y_test, y_pred_fast)

Confusion matrix: 
[[851  15]
 [112  22]]
Classification: 
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       866
           1       0.59      0.16      0.26       134

    accuracy                           0.87      1000
   macro avg       0.74      0.57      0.59      1000
weighted avg       0.84      0.87      0.84      1000

MCC: 
0.2650301059500807


Transformers

In [13]:
# Fine-tune the Polish BERT checkpoint and report its metrics on the test split.
# The predictor and learner are kept in their own variables so later cells can reuse them.
dkleczek_bert_y_pred, dkleczek_bert_predictor, dkleczek_bert_learner = transformer_model('dkleczek/bert-base-polish-cased-v1', x_train, y_train, x_test)
get_results(y_test, dkleczek_bert_y_pred)

Downloading:   0%|          | 0.00/459 [00:00<?, ?B/s]

preprocessing train...
language: pl
train sequence lengths:
	mean : 12
	95percentile : 21
	99percentile : 24


Downloading:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

404 Client Error: Not Found for url: https://huggingface.co/dkleczek/bert-base-polish-cased-v1/resolve/main/tf_model.h5


Is Multi-Label? False




Downloading:   0%|          | 0.00/531M [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[858   8]
 [104  30]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.99      0.94       866
           1       0.79      0.22      0.35       134

    accuracy                           0.89      1000
   macro avg       0.84      0.61      0.64      1000
weighted avg       0.88      0.89      0.86      1000

MCC: 
0.38242667499167965


In [None]:
# Show (index, true label, predicted label) triples so representative examples can be picked by hand.
[(i, actual, predicted) for i, (actual, predicted) in enumerate(zip(y_test, dkleczek_bert_y_pred))] # TP 0, FN 24, TN 31, FP 148

In [24]:
# Texts of the hand-picked examples: TP 0, FN 24, TN 31, FP 148
tuple(x_test[i] for i in (0, 24, 31, 148))

('@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.',
 '@anonymized_account Tej szmaty się nie komentuje',
 '@anonymized_account Dokładnie, pisdzielstwo nie ma prawa rozpierdalać systemu,  sądownictwa nie mając większości',
 '@anonymized_account Jestem ukrainskim żydem z polskim obywatelstwem tnij może jedna ci starczy')

LIME

Gaussian NB

In [25]:
# GNB
class DenseTransformer():
    """Pipeline step that converts a sparse feature matrix into a dense ndarray.

    Placed between the TF-IDF vectorizer and GaussianNB so the classifier receives
    dense input.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the step has no state to learn."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Args:
            X: Sparse matrix (anything exposing ``toarray()``) or an array-like.
            y: Ignored.
            **fit_params: Ignored.
        """
        # .toarray() gives a plain ndarray; .todense() would give np.matrix, which
        # scikit-learn >= 1.2 rejects with a TypeError.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already dense: pass it through as an ndarray instead of failing.
        return np.asarray(X)

In [73]:
# Re-fit a TF-IDF vectorizer on the training texts for the LIME pipelines.
vectorizer = TfidfVectorizer()
# .todense() returns np.matrix, so an indexed row such as test_vector[idx] keeps shape (1, n_features).
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

# Class labels as strings, passed to LimeTextExplainer.
class_names = ["0", "1"]

In [74]:
# Raw text -> TF-IDF -> dense -> GaussianNB, so LIME can call predict_proba on strings.
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('to_dense', DenseTransformer()),
    ('classifier', gnb),
])
explainer = LimeTextExplainer(class_names=class_names)

# Test-set indices of the hand-picked examples (picked from the listed BERT predictions).
idx_tp, idx_fn, idx_tn, idx_fp = 0, 24, 31, 148

In [None]:
# Explain the four hand-picked test examples with LIME for the GaussianNB pipeline.
# NOTE(review): the case names follow the indices picked from the BERT predictions; they are
# not recomputed from this model's own output, so a heading may not match what GNB predicts.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
    print(case_name)
    print('Document id: %d' % idx)
    # Predict through the pipeline on the raw text: the printed class then comes from the same
    # path LIME explains and does not depend on test_vector being an np.matrix (2-D rows).
    predicted_label = pipeline.predict([x_test[idx]])[0]
    print('Predicted class =', class_names[predicted_label])
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

Multinomial NB

In [68]:
# Raw text -> TF-IDF -> MultinomialNB pipeline, so LIME can call predict_proba on strings.
c = make_pipeline(vectorizer, mnb)
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# Explain the four hand-picked test examples with LIME for the MultinomialNB pipeline.
# NOTE(review): the case names follow the indices picked from the BERT predictions; they are
# not recomputed from this model's own output, so a heading may not match what MNB predicts.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], c.predict_proba)

    print(case_name)
    print('Document id: %d' % idx)
    # Predict through the pipeline on the raw text: the printed class then comes from the same
    # path LIME explains and does not depend on test_vector being an np.matrix (2-D rows).
    predicted_label = c.predict([x_test[idx]])[0]
    print('Predicted class =', class_names[predicted_label])
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

FastText

In [66]:
classifier  = model # use the fastText model trained earlier in the notebook

def tokenize_string(string): # splits the text on whitespace; intended to mirror how fastText splits text
    return string.split()

# our classifier was trained on single words, so LIME is told to split texts with tokenize_string
explainer = LimeTextExplainer(split_expression=tokenize_string, class_names=class_names)

def fasttext_pred(classifier, texts):
    """Return fastText class probabilities as a 2-D array usable as a LIME ``classifier_fn``.

    Args:
        classifier: Object with a fastText-style ``predict(texts, k)`` method returning
            ``(labels, probabilities)`` with one entry per text.
        texts: Iterable of strings to classify.

    Returns:
        numpy.ndarray: One row per text; within a row the probabilities are ordered by
        sorted label name (``__label__0`` first, then ``__label__1``).
    """
    # fastText's predict() raises on input containing a newline, so flatten line breaks first.
    single_line_texts = [text.replace("\n", " ") for text in texts]
    labels, probabilities = classifier.predict(single_line_texts, 2)  # request both labels

    res = []
    for label, probs in zip(labels, probabilities):
        # predict() returns labels in its own order; reorder the probabilities by label name
        # so every row has the same column layout.
        order = np.argsort(np.array(label))
        res.append(np.asarray(probs)[order])

    return np.array(res)

In [47]:
# NOTE(review): this cell does not set `idx`; it uses whatever value an earlier-executed cell left behind.
classifier.predict(x_test[idx])

(('__label__0',), array([0.50151396]))

In [None]:
# Explain the four hand-picked test examples with LIME for the fastText classifier.
# NOTE(review): the case names follow the indices picked from the BERT predictions; they are
# not recomputed from this model's own output.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda texts: fasttext_pred(classifier, texts))
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', classifier.predict(x_test[idx])[0])
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

Transformers

Kleczek BERT – model został wytrenowany wyżej w notatniku

In [56]:
# Explain the Polish BERT model: reuse its predictor and build a LIME explainer for classes "0"/"1".
predictor = dkleczek_bert_predictor
explainer = LimeTextExplainer(class_names=['0', '1'])

In [None]:
# Explain the four hand-picked test examples with LIME for the current transformer predictor.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', predictor.predict(x_test[idx]))
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

Bert

In [None]:
# Fine-tune multilingual BERT and report its metrics on the test split.
bert_y_pred, bert_predictor, bert_learner = transformer_model('bert-base-multilingual-cased', x_train, y_train, x_test)
get_results(y_test, bert_y_pred)

# Point the shared `predictor` / `explainer` names at this model for the LIME cell that follows.
predictor = bert_predictor
explainer = LimeTextExplainer(class_names=['0', '1'])

In [None]:
# Explain the four hand-picked test examples with LIME for the current transformer predictor.
# NOTE(review): the case names follow the indices picked from the Polish BERT predictions; they
# are not recomputed from this model's own output.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', predictor.predict(x_test[idx]))
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

XLM

In [None]:
# Fine-tune XLM-RoBERTa (base) and report its metrics on the test split.
xlm_y_pred, xlm_predictor, xlm_learner = transformer_model('xlm-roberta-base', x_train, y_train, x_test)
get_results(y_test, xlm_y_pred)

# Point the shared `predictor` / `explainer` names at this model for the LIME cell that follows.
predictor = xlm_predictor
explainer = LimeTextExplainer(class_names=['0', '1'])

In [None]:
# Explain the four hand-picked test examples with LIME for the current transformer predictor.
# NOTE(review): the case names follow the indices picked from the Polish BERT predictions; they
# are not recomputed from this model's own output.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', predictor.predict(x_test[idx]))
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

Nadpróbkowanie klasy mniejszościowej (zbalansowanie niezbalansowanego zbioru)

In [78]:
# Balance the training split by randomly oversampling the minority class.
# The sampler expects a 2-D X, so the texts are reshaped into a single column and flattened back.
train_texts_column = np.array(dataset["train"]["text"]).reshape((-1, 1))
train_labels = dataset["train"]["label"]

sampler = RandomOverSampler(sampling_strategy='minority', random_state=13)
x_train_balanced, y_train_balanced = sampler.fit_resample(train_texts_column, train_labels)
x_train_balanced = x_train_balanced.reshape((-1))

# From here on the training variables refer to the balanced data; the test split is unchanged.
x_train, y_train = x_train_balanced, y_train_balanced
x_test, y_test = dataset["test"]['text'], dataset["test"]['label']

# Class counts before and after oversampling.
print(Counter(train_labels))
print(Counter(y_train_balanced))

Counter({0: 9190, 1: 851})
Counter({0: 9190, 1: 9190})


Naive Bayes

In [79]:
# Naive Bayes: TF-IDF features, converted to dense arrays for the classifiers below.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which scikit-learn >= 1.2
# rejects with a TypeError when passed to fit()/predict().
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).toarray()
test_vector = vectorizer.transform(x_test).toarray()

In [80]:
# Gaussian Naive Bayes: fit on the training TF-IDF features, then score the test split.
gnb = GaussianNB()
gnb.fit(train_vector, y_train)

y_pred_gnb = gnb.predict(test_vector)
get_results(y_test, y_pred_gnb)

Confusion matrix: 
[[750 116]
 [101  33]]
Classification: 
              precision    recall  f1-score   support

           0       0.88      0.87      0.87       866
           1       0.22      0.25      0.23       134

    accuracy                           0.78      1000
   macro avg       0.55      0.56      0.55      1000
weighted avg       0.79      0.78      0.79      1000

MCC: 
0.10745050407812812


In [81]:
# Multinomial Naive Bayes: fit on the training TF-IDF features, then score the test split.
mnb = MultinomialNB()
mnb.fit(train_vector, y_train)

y_pred_mnb = mnb.predict(test_vector)
get_results(y_test, y_pred_mnb)

Confusion matrix: 
[[747 119]
 [ 56  78]]
Classification: 
              precision    recall  f1-score   support

           0       0.93      0.86      0.90       866
           1       0.40      0.58      0.47       134

    accuracy                           0.82      1000
   macro avg       0.66      0.72      0.68      1000
weighted avg       0.86      0.82      0.84      1000

MCC: 
0.3808590708020836


FastText

In [82]:
# Prepare the fastText training file: one "__label__<label> <text>" example per line.
# A line break inside a text would split the example in two (the tail would be read as a
# separate, unlabeled example), so line breaks are flattened to spaces before writing.
with open("train_data.txt", "w", encoding="utf-8", newline="\n") as f:
    f.writelines(
        f"__label__{label} " + str(sentence).replace("\r\n", " ").replace("\n", " ") + "\n"
        for sentence, label in zip(x_train, y_train)
    )

In [84]:
# Train a supervised fastText classifier on "train_data.txt" with the library defaults.
model = fasttext.train_supervised("train_data.txt")

# predict() returns one tuple of labels per text together with the matching probabilities.
raw_labels, propabilities = model.predict(x_test)

# Strip the "__label__" prefix so the predictions are ints comparable with y_test.
y_pred_fast = []
for top_labels in raw_labels:
    y_pred_fast.append(int(top_labels[0].split("__label__")[1]))

get_results(y_test, y_pred_fast)

Confusion matrix: 
[[162 704]
 [  4 130]]
Classification: 
              precision    recall  f1-score   support

           0       0.98      0.19      0.31       866
           1       0.16      0.97      0.27       134

    accuracy                           0.29      1000
   macro avg       0.57      0.58      0.29      1000
weighted avg       0.87      0.29      0.31      1000

MCC: 
0.14393673563899323


Transformers

In [None]:
# Fine-tune the Polish BERT checkpoint again, now on the current (oversampled) x_train / y_train.
# NOTE(review): this overwrites the dkleczek_bert_* variables produced by the earlier run.
dkleczek_bert_y_pred, dkleczek_bert_predictor, dkleczek_bert_learner = transformer_model('dkleczek/bert-base-polish-cased-v1', x_train, y_train, x_test)
get_results(y_test, dkleczek_bert_y_pred)

LIME

Gaussian NB

In [None]:
# Re-fit a TF-IDF vectorizer on the current training texts for the LIME pipelines.
vectorizer = TfidfVectorizer()
# .todense() returns np.matrix, so an indexed row such as test_vector[idx] keeps shape (1, n_features).
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

# Class labels as strings, passed to LimeTextExplainer.
class_names = ["0", "1"]

In [None]:
# Raw text -> TF-IDF -> dense -> GaussianNB, so LIME can call predict_proba on strings.
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('to_dense', DenseTransformer()),
    ('classifier', gnb),
])
explainer = LimeTextExplainer(class_names=class_names)

# Test-set indices of the hand-picked examples (same indices as in the first experiment).
idx_tp, idx_fn, idx_tn, idx_fp = 0, 24, 31, 148

In [None]:
# Explain the four hand-picked test examples with LIME for the GaussianNB pipeline.
# NOTE(review): the case names follow fixed, hand-picked indices; they are not recomputed from
# this model's own output, so a heading may not match what GNB predicts.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
    print(case_name)
    print('Document id: %d' % idx)
    # Predict through the pipeline on the raw text: the printed class then comes from the same
    # path LIME explains and does not depend on test_vector being an np.matrix (2-D rows).
    predicted_label = pipeline.predict([x_test[idx]])[0]
    print('Predicted class =', class_names[predicted_label])
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

Multinomial NB

In [None]:
# Raw text -> TF-IDF -> MultinomialNB pipeline, so LIME can call predict_proba on strings.
c = make_pipeline(vectorizer, mnb)
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# Explain the four hand-picked test examples with LIME for the MultinomialNB pipeline.
# NOTE(review): the case names follow fixed, hand-picked indices; they are not recomputed from
# this model's own output, so a heading may not match what MNB predicts.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], c.predict_proba)

    print(case_name)
    print('Document id: %d' % idx)
    # Predict through the pipeline on the raw text: the printed class then comes from the same
    # path LIME explains and does not depend on test_vector being an np.matrix (2-D rows).
    predicted_label = c.predict([x_test[idx]])[0]
    print('Predicted class =', class_names[predicted_label])
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

FastText

In [None]:
classifier  = model # use the fastText model trained earlier in the notebook

def tokenize_string(string): # splits the text on whitespace; intended to mirror how fastText splits text
    return string.split()

# our classifier was trained on single words, so LIME is told to split texts with tokenize_string
explainer = LimeTextExplainer(split_expression=tokenize_string, class_names=class_names)

def fasttext_pred(classifier, texts):
    """Return fastText class probabilities as a 2-D array usable as a LIME ``classifier_fn``.

    Args:
        classifier: Object with a fastText-style ``predict(texts, k)`` method returning
            ``(labels, probabilities)`` with one entry per text.
        texts: Iterable of strings to classify.

    Returns:
        numpy.ndarray: One row per text; within a row the probabilities are ordered by
        sorted label name (``__label__0`` first, then ``__label__1``).
    """
    # fastText's predict() raises on input containing a newline, so flatten line breaks first.
    single_line_texts = [text.replace("\n", " ") for text in texts]
    labels, probabilities = classifier.predict(single_line_texts, 2)  # request both labels

    res = []
    for label, probs in zip(labels, probabilities):
        # predict() returns labels in its own order; reorder the probabilities by label name
        # so every row has the same column layout.
        order = np.argsort(np.array(label))
        res.append(np.asarray(probs)[order])

    return np.array(res)

In [None]:
# Explain the four hand-picked test examples with LIME for the fastText classifier.
# NOTE(review): the case names follow fixed, hand-picked indices; they are not recomputed from
# this model's own output.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda texts: fasttext_pred(classifier, texts))
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', classifier.predict(x_test[idx])[0])
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

Transformers

Kleczek - bert

In [None]:
# Explain the Polish BERT model: reuse its predictor and build a LIME explainer for classes "0"/"1".
predictor = dkleczek_bert_predictor
explainer = LimeTextExplainer(class_names=['0', '1'])

In [None]:
# Explain the four hand-picked test examples with LIME for the current transformer predictor.
# NOTE(review): the case names follow fixed, hand-picked indices; they are not recomputed from
# this model's own output.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', predictor.predict(x_test[idx]))
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

Bert

In [None]:
# Fine-tune multilingual BERT on the current training data and report its test metrics.
bert_y_pred, bert_predictor, bert_learner = transformer_model('bert-base-multilingual-cased', x_train, y_train, x_test)
get_results(y_test, bert_y_pred)

# Point the shared `predictor` / `explainer` names at this model for the LIME cell that follows.
predictor = bert_predictor
explainer = LimeTextExplainer(class_names=['0', '1'])

In [None]:
# Explain the four hand-picked test examples with LIME for the current transformer predictor.
# NOTE(review): the case names follow fixed, hand-picked indices; they are not recomputed from
# this model's own output.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', predictor.predict(x_test[idx]))
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)

XLM

In [None]:
# Fine-tune XLM-RoBERTa (base) on the current training data and report its test metrics.
xlm_y_pred, xlm_predictor, xlm_learner = transformer_model('xlm-roberta-base', x_train, y_train, x_test)
get_results(y_test, xlm_y_pred)

# Point the shared `predictor` / `explainer` names at this model for the LIME cell that follows.
predictor = xlm_predictor
explainer = LimeTextExplainer(class_names=['0', '1'])

In [None]:
# Explain the four hand-picked test examples with LIME for the current transformer predictor.
# NOTE(review): the case names follow fixed, hand-picked indices; they are not recomputed from
# this model's own output.
for case_name, idx in (
    ("True positive", idx_tp),
    ("False negative", idx_fn),
    ("True negative", idx_tn),
    ("False positive", idx_fp),
):
    exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
    print(case_name)
    print('Document id: %d' % idx)
    print('Predicted class =', predictor.predict(x_test[idx]))
    print('True class: %s' % y_test[idx])
    exp.show_in_notebook(show_predicted_value=True)