In [None]:
!pip install datasets
!pip install transformers
!pip install fasttext
!pip install ktrain
!pip install lime

In [None]:
!pip install imbalanced-learn

In [3]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.pipeline import Pipeline
from collections import Counter
import transformers
import numpy as np
import fasttext
import ktrain
from ktrain import text
from imblearn.over_sampling import RandomOverSampler
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

1. Get acquainted with the data of the Polish Cyberbullying detection dataset. Pay special attention to the distribution of the positive and negative examples in the first task as well as distribution of the classes in the second task.

In [4]:
def get_results(y_test, y_pred): # show results based on predicted and real test values.
    print("Confusion matrix: ")
    print(metrics.confusion_matrix(y_test, y_pred))
    # sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    # plt.show()
    print("Classification: ")
    print(metrics.classification_report(y_test, y_pred))
    print("MCC: ")
    print(metrics.matthews_corrcoef(y_test, y_pred))

In [5]:
def transformer_model(model_name, x_train, y_train, x_test): # wytrenuj model (podany) i zwróć wyniki oraz predyktor do LIMEa
    # https://nbviewer.org/github/amaiya/ktrain/blob/develop/tutorials/tutorial-A3-hugging_face_transformers.ipynb
    t = ktrain.text.Transformer(model_name, maxlen=25, class_names=np.unique(y_train)) # 25 bo pokazuje ze niby 24 jest max
    train = t.preprocess_train(x_train, y_train)
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=train, batch_size=32)
    learner.fit_onecycle(5e-5, 2)
    
    predictor = ktrain.get_predictor(learner.model, preproc=t)
    y_pred = predictor.predict(x_test)
    
    return [y_pred, predictor, learner]

2. Train the following classifiers on the training sets (for the task 1 and the task 2):
- Bayesian classifier with TF * IDF weighting.
- Fasttext text classifier
- Transformer classifier (take into account that a number of experiments should be performed for this model).

# Task01

## Base

In [None]:
dataset = load_dataset("poleval2019_cyberbullying", "task01")

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading and preparing dataset poleval2019_cyber_bullying/task01 (download: 400.39 KiB, generated: 1.16 MiB, post-processed: Unknown size, total: 1.55 MiB) to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450...


Downloading:   0%|          | 0.00/340k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset poleval2019_cyber_bullying downloaded and prepared to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task01/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
x_train = dataset["train"]['text']
x_test = dataset["test"]['text']
y_train = dataset["train"]['label']
y_test = dataset["test"]['label']

### Naive Bayes

In [None]:
# Naive Bayes
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

In [None]:
# Gaussian NB
gnb = GaussianNB()

y_pred_gnb = gnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_gnb)

Confusion matrix: 
[[742 124]
 [ 94  40]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       866
           1       0.24      0.30      0.27       134

    accuracy                           0.78      1000
   macro avg       0.57      0.58      0.57      1000
weighted avg       0.80      0.78      0.79      1000

MCC: 
0.1428942557422714


In [None]:
mnb = MultinomialNB()

y_pred_mnb = mnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_mnb)

Confusion matrix: 
[[866   0]
 [133   1]]
Classification: 
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       1.00      0.01      0.01       134

    accuracy                           0.87      1000
   macro avg       0.93      0.50      0.47      1000
weighted avg       0.88      0.87      0.81      1000

MCC: 
0.08043106192813237


### FastText

In [None]:
# Przygotowanie pliku pod fasttext
with open("train_data.txt", 'wb') as f:
    f.writelines([f"__label__{label} {sentence}\n".encode() for sentence, label in zip(x_train, y_train)])

In [None]:
model_fast = fasttext.train_supervised("train_data.txt")

y_pred_fast, propabilities = model_fast.predict(x_test)
y_pred_fast = [int(label[0].split("__label__")[1]) for label in y_pred_fast]
get_results(y_test, y_pred_fast)

Confusion matrix: 
[[851  15]
 [112  22]]
Classification: 
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       866
           1       0.59      0.16      0.26       134

    accuracy                           0.87      1000
   macro avg       0.74      0.57      0.59      1000
weighted avg       0.84      0.87      0.84      1000

MCC: 
0.2650301059500807


### Transformers

In [None]:
dkleczek_bert_y_pred, dkleczek_bert_predictor, dkleczek_bert_learner = transformer_model('dkleczek/bert-base-polish-cased-v1', x_train, y_train, x_test)
get_results(y_test, dkleczek_bert_y_pred)

Downloading:   0%|          | 0.00/459 [00:00<?, ?B/s]

preprocessing train...
language: pl
train sequence lengths:
	mean : 12
	95percentile : 21
	99percentile : 24


Downloading:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Is Multi-Label? False


404 Client Error: Not Found for url: https://huggingface.co/dkleczek/bert-base-polish-cased-v1/resolve/main/tf_model.h5


Downloading:   0%|          | 0.00/531M [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[853  13]
 [105  29]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.98      0.94       866
           1       0.69      0.22      0.33       134

    accuracy                           0.88      1000
   macro avg       0.79      0.60      0.63      1000
weighted avg       0.86      0.88      0.85      1000

MCC: 
0.34204018503853295


In [None]:
# wyświetl listę do wybrania sobie odpowiednich przykładów
list(zip(range(len(dkleczek_bert_y_pred)), y_test, dkleczek_bert_y_pred)) # TP 0, FN 24, TN 31, FP 148

[(0, 0, 0),
 (1, 0, 0),
 (2, 0, 0),
 (3, 0, 0),
 (4, 0, 0),
 (5, 0, 0),
 (6, 0, 0),
 (7, 0, 0),
 (8, 0, 0),
 (9, 0, 0),
 (10, 0, 0),
 (11, 0, 0),
 (12, 0, 0),
 (13, 0, 0),
 (14, 0, 0),
 (15, 0, 0),
 (16, 0, 0),
 (17, 0, 0),
 (18, 0, 0),
 (19, 0, 0),
 (20, 0, 0),
 (21, 0, 0),
 (22, 0, 0),
 (23, 0, 0),
 (24, 1, 1),
 (25, 0, 0),
 (26, 0, 0),
 (27, 0, 0),
 (28, 0, 0),
 (29, 0, 0),
 (30, 0, 0),
 (31, 1, 1),
 (32, 0, 0),
 (33, 0, 0),
 (34, 0, 0),
 (35, 0, 0),
 (36, 0, 0),
 (37, 1, 1),
 (38, 0, 0),
 (39, 0, 0),
 (40, 0, 0),
 (41, 0, 0),
 (42, 0, 0),
 (43, 0, 0),
 (44, 0, 0),
 (45, 0, 0),
 (46, 0, 0),
 (47, 0, 0),
 (48, 0, 0),
 (49, 0, 0),
 (50, 0, 0),
 (51, 1, 1),
 (52, 0, 0),
 (53, 0, 0),
 (54, 0, 0),
 (55, 0, 0),
 (56, 0, 0),
 (57, 1, 0),
 (58, 1, 0),
 (59, 0, 0),
 (60, 0, 0),
 (61, 1, 0),
 (62, 0, 0),
 (63, 0, 0),
 (64, 1, 0),
 (65, 1, 1),
 (66, 0, 0),
 (67, 0, 0),
 (68, 0, 0),
 (69, 0, 0),
 (70, 0, 0),
 (71, 0, 0),
 (72, 0, 0),
 (73, 0, 0),
 (74, 0, 0),
 (75, 0, 1),
 (76, 0, 0),
 (77, 0, 

In [None]:
# TP 0, FN 24, TN 31, FP 148
x_test[0], x_test[24], x_test[31], x_test[148]

('@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.',
 '@anonymized_account Tej szmaty się nie komentuje',
 '@anonymized_account Dokładnie, pisdzielstwo nie ma prawa rozpierdalać systemu,  sądownictwa nie mając większości',
 '@anonymized_account Jestem ukrainskim żydem z polskim obywatelstwem tnij może jedna ci starczy')

## LIME

In [None]:
class_names = ["0", "1"]

In [None]:
idx_tp = 0
idx_fn = 24
idx_tn = 31
idx_fp = 148

### Naive Bayes

Gaussian NB

In [None]:
# GNB
class DenseTransformer():

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [None]:
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

In [None]:
pipeline = Pipeline([
     ('vectorizer', vectorizer), 
     ('to_dense', DenseTransformer()), 
     ('classifier', gnb)
])
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

Multinomial NB

In [None]:
c = make_pipeline(vectorizer, mnb)
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

### FastText

In [None]:
classifier  = model_fast # ładujemy nasz wytrenowany model z fasttextu

def tokenize_string(string): # funkcja która dzieli tekst tak samo jak fasttext.
    return string.split()

# nasz klasyfikator był uczony pojedynczymi słowami
explainer = LimeTextExplainer(split_expression=tokenize_string, class_names=class_names)

def fasttext_pred(classifier, texts):
    res = []
    labels, probabilities = classifier.predict(texts, 2) # teksty z 2 labelkami

    for label, probs, text in zip(labels, probabilities, texts):
        order = np.argsort(np.array(label))
        res.append(probs[order])

    return np.array(res)

In [None]:
classifier.predict(x_test[idx])

(('__label__0',), array([0.50151396]))

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

#FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

### Transformers

Kleczek - bert jest na gorze nauczony

In [None]:
predictor = dkleczek_bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

Bert

In [None]:
bert_y_pred, bert_predictor, bert_learner = transformer_model('bert-base-multilingual-cased', x_train, y_train, x_test)
get_results(y_test, bert_y_pred)

predictor = bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

preprocessing train...
language: pl
train sequence lengths:
	mean : 12
	95percentile : 21
	99percentile : 24


Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Is Multi-Label? False


Downloading:   0%|          | 0.00/1.08G [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[866   0]
 [134   0]]
Classification: 
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       0.00      0.00      0.00       134

    accuracy                           0.87      1000
   macro avg       0.43      0.50      0.46      1000
weighted avg       0.75      0.87      0.80      1000

MCC: 
0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

XLM

In [None]:
xlm_y_pred, xlm_predictor, xlm_learner = transformer_model('xlm-roberta-base', x_train, y_train, x_test)
get_results(y_test, xlm_y_pred)

predictor = xlm_predictor
explainer = LimeTextExplainer(class_names=class_names)

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

preprocessing train...
language: pl
train sequence lengths:
	mean : 12
	95percentile : 21
	99percentile : 24


Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Is Multi-Label? False


Downloading:   0%|          | 0.00/1.89G [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2

KeyboardInterrupt: ignored

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

## Dodanie więcej klas (niezbalansowany zbiór)

In [None]:
sampler = RandomOverSampler(sampling_strategy='minority', random_state=13)
x_train_balanced, y_train_balanced = sampler.fit_resample(np.array(dataset["train"]["text"]).reshape((-1, 1)), dataset["train"]["label"])
x_train_balanced = x_train_balanced.reshape((-1))

x_train = x_train_balanced
x_test = dataset["test"]['text']
y_train = y_train_balanced
y_test = dataset["test"]['label']

print(Counter(dataset["train"]["label"]))
print(Counter(y_train_balanced))

### Naive Bayes

In [None]:
# Naive Bayes
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

In [None]:
# Gaussian NB
gnb = GaussianNB()

y_pred_gnb = gnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_gnb)

Confusion matrix: 
[[750 116]
 [101  33]]
Classification: 
              precision    recall  f1-score   support

           0       0.88      0.87      0.87       866
           1       0.22      0.25      0.23       134

    accuracy                           0.78      1000
   macro avg       0.55      0.56      0.55      1000
weighted avg       0.79      0.78      0.79      1000

MCC: 
0.10745050407812812


In [None]:
mnb = MultinomialNB()

y_pred_mnb = mnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_mnb)

Confusion matrix: 
[[747 119]
 [ 56  78]]
Classification: 
              precision    recall  f1-score   support

           0       0.93      0.86      0.90       866
           1       0.40      0.58      0.47       134

    accuracy                           0.82      1000
   macro avg       0.66      0.72      0.68      1000
weighted avg       0.86      0.82      0.84      1000

MCC: 
0.3808590708020836


### FastText

In [None]:
# Przygotowanie pliku pod fasttext
with open("train_data.txt", 'wb') as f:
    f.writelines([f"__label__{label} {sentence}\n".encode() for sentence, label in zip(x_train, y_train)])

In [None]:
model_fast = fasttext.train_supervised("train_data.txt")

y_pred_fast, propabilities = model_fast.predict(x_test)
y_pred_fast = [int(label[0].split("__label__")[1]) for label in y_pred_fast]
get_results(y_test, y_pred_fast)

Confusion matrix: 
[[162 704]
 [  4 130]]
Classification: 
              precision    recall  f1-score   support

           0       0.98      0.19      0.31       866
           1       0.16      0.97      0.27       134

    accuracy                           0.29      1000
   macro avg       0.57      0.58      0.29      1000
weighted avg       0.87      0.29      0.31      1000

MCC: 
0.14393673563899323


### Transformers

In [None]:
dkleczek_bert_y_pred, dkleczek_bert_predictor, dkleczek_bert_learner = transformer_model('dkleczek/bert-base-polish-cased-v1', x_train, y_train, x_test)
get_results(y_test, dkleczek_bert_y_pred)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


404 Client Error: Not Found for url: https://huggingface.co/dkleczek/bert-base-polish-cased-v1/resolve/main/tf_model.h5


Is Multi-Label? False






begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[826  40]
 [105  29]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       866
           1       0.42      0.22      0.29       134

    accuracy                           0.85      1000
   macro avg       0.65      0.59      0.60      1000
weighted avg       0.82      0.85      0.83      1000

MCC: 
0.2287938352388199


## LIME

In [None]:
class_names = ["0", "1"]

In [None]:
idx_tp = 0
idx_fn = 24
idx_tn = 31
idx_fp = 148

### Naive Bayes

Gaussian NB

In [None]:
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

In [None]:
pipeline = Pipeline([
     ('vectorizer', vectorizer), 
     ('to_dense', DenseTransformer()), 
     ('classifier', gnb)
])
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

Multinomial NB

In [None]:
c = make_pipeline(vectorizer, mnb)
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

### FastText

In [None]:
classifier  = model_fast # ładujemy nasz wytrenowany model z fasttextu

def tokenize_string(string): # funkcja która dzieli tekst tak samo jak fasttext.
    return string.split()

# nasz klasyfikator był uczony pojedynczymi słowami
explainer = LimeTextExplainer(split_expression=tokenize_string, class_names=class_names)

def fasttext_pred(classifier, texts):
    res = []
    labels, probabilities = classifier.predict(texts, 2) # teksty z 2 labelkami

    for label, probs, text in zip(labels, probabilities, texts):
        order = np.argsort(np.array(label))
        res.append(probs[order])

    return np.array(res)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

#FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

### Transformers

Kleczek - bert

In [None]:
predictor = dkleczek_bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

Bert

In [None]:
bert_y_pred, bert_predictor, bert_learner = transformer_model('bert-base-multilingual-cased', x_train, y_train, x_test)
get_results(y_test, bert_y_pred)

predictor = bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Is Multi-Label? False


begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[816  50]
 [ 96  38]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.94      0.92       866
           1       0.43      0.28      0.34       134

    accuracy                           0.85      1000
   macro avg       0.66      0.61      0.63      1000
weighted avg       0.83      0.85      0.84      1000

MCC: 
0.27157124272671096


In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

XLM

In [None]:
xlm_y_pred, xlm_predictor, xlm_learner = transformer_model('xlm-roberta-base', x_train, y_train, x_test)
get_results(y_test, xlm_y_pred)

predictor = xlm_predictor
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# Task02

## Base

In [6]:
dataset = load_dataset("poleval2019_cyberbullying", "task02")

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading and preparing dataset poleval2019_cyber_bullying/task02 (download: 400.53 KiB, generated: 1.16 MiB, post-processed: Unknown size, total: 1.55 MiB) to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task02/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450...


Downloading:   0%|          | 0.00/340k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset poleval2019_cyber_bullying downloaded and prepared to /root/.cache/huggingface/datasets/poleval2019_cyber_bullying/task02/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
x_train = dataset["train"]['text']
x_test = dataset["test"]['text']
y_train = dataset["train"]['label']
y_test = dataset["test"]['label']

### Naive Bayes

In [None]:
# Naive Bayes
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

Gaussian NB

In [None]:
# Gaussian NB
gnb = GaussianNB()

y_pred_gnb = gnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_gnb)

Confusion matrix: 
[[763  50  53]
 [ 14   4   7]
 [ 84   5  20]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       866
           1       0.07      0.16      0.10        25
           2       0.25      0.18      0.21       109

    accuracy                           0.79      1000
   macro avg       0.40      0.41      0.40      1000
weighted avg       0.80      0.79      0.79      1000

MCC: 
0.1282543759318036


Multinomial NB

In [None]:
mnb = MultinomialNB()

y_pred_mnb = mnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_mnb)

Confusion matrix: 
[[866   0   0]
 [ 25   0   0]
 [109   0   0]]
Classification: 
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00       109

    accuracy                           0.87      1000
   macro avg       0.29      0.33      0.31      1000
weighted avg       0.75      0.87      0.80      1000

MCC: 
0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### FastText

In [None]:
# Przygotowanie pliku pod fasttext
with open("train_data.txt", 'wb') as f:
    f.writelines([f"__label__{label} {sentence}\n".encode() for sentence, label in zip(x_train, y_train)])

In [None]:
model_fast = fasttext.train_supervised("train_data.txt")

y_pred_fast, propabilities = model_fast.predict(x_test)
y_pred_fast = [int(label[0].split("__label__")[1]) for label in y_pred_fast]
get_results(y_test, y_pred_fast)

Confusion matrix: 
[[863   2   1]
 [ 22   2   1]
 [102   4   3]]
Classification: 
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       0.25      0.08      0.12        25
           2       0.60      0.03      0.05       109

    accuracy                           0.87      1000
   macro avg       0.57      0.37      0.37      1000
weighted avg       0.83      0.87      0.82      1000

MCC: 
0.16001981125515372


### Transformers

In [None]:
dkleczek_bert_y_pred, dkleczek_bert_predictor, dkleczek_bert_learner = transformer_model('dkleczek/bert-base-polish-cased-v1', x_train, y_train, x_test)
get_results(y_test, dkleczek_bert_y_pred)

preprocessing train...
language: pl
train sequence lengths:
	mean : 12
	95percentile : 21
	99percentile : 24


404 Client Error: Not Found for url: https://huggingface.co/dkleczek/bert-base-polish-cased-v1/resolve/main/tf_model.h5


Is Multi-Label? False






begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[851   2  13]
 [ 18   3   4]
 [ 83   3  23]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.98      0.94       866
           1       0.38      0.12      0.18        25
           2       0.57      0.21      0.31       109

    accuracy                           0.88      1000
   macro avg       0.61      0.44      0.48      1000
weighted avg       0.85      0.88      0.85      1000

MCC: 
0.3246965214728727


## LIME

In [None]:
class_names = ["0", "1", "2"]

In [None]:
idx_tp = 0
idx_fn = 24
idx_tn = 31
idx_fp = 148

### Naive Bayes

Gaussian NB

In [None]:
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

In [None]:
pipeline = Pipeline([
     ('vectorizer', vectorizer), 
     ('to_dense', DenseTransformer()), 
     ('classifier', gnb)
])
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

Multinomial NB

In [None]:
c = make_pipeline(vectorizer, mnb)
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

### FastText

In [None]:
classifier  = model_fast # ładujemy nasz wytrenowany model z fasttextu

def tokenize_string(string): # funkcja która dzieli tekst tak samo jak fasttext.
    return string.split()

# nasz klasyfikator był uczony pojedynczymi słowami
explainer = LimeTextExplainer(split_expression=tokenize_string, class_names=class_names)

def fasttext_pred(classifier, texts):
    res = []
    labels, probabilities = classifier.predict(texts, 3) # teksty z 2 labelkami

    for label, probs, text in zip(labels, probabilities, texts):
        order = np.argsort(np.array(label))
        res.append(probs[order])

    return np.array(res)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

#FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

### Transformers

Kleczek - bert

In [None]:
predictor = dkleczek_bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

Bert

In [None]:
bert_y_pred, bert_predictor, bert_learner = transformer_model('bert-base-multilingual-cased', x_train, y_train, x_test)
get_results(y_test, bert_y_pred)

predictor = bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

preprocessing train...
language: pl
train sequence lengths:
	mean : 12
	95percentile : 21
	99percentile : 24


Is Multi-Label? False


begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[866   0   0]
 [ 25   0   0]
 [109   0   0]]
Classification: 
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00       109

    accuracy                           0.87      1000
   macro avg       0.29      0.33      0.31      1000
weighted avg       0.75      0.87      0.80      1000

MCC: 
0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

XLM

In [None]:
xlm_y_pred, xlm_predictor, xlm_learner = transformer_model('xlm-roberta-base', x_train, y_train, x_test)
get_results(y_test, xlm_y_pred)

predictor = xlm_predictor
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

## Dodanie więcej klas (niezbalansowany)

In [8]:
sampler = RandomOverSampler(sampling_strategy='not majority', random_state=13)
x_train_balanced, y_train_balanced = sampler.fit_resample(np.array(dataset["train"]["text"]).reshape((-1, 1)), dataset["train"]["label"])
x_train_balanced = x_train_balanced.reshape((-1))

x_train = x_train_balanced
x_test = dataset["test"]['text']
y_train = y_train_balanced
y_test = dataset["test"]['label']

print(Counter(dataset["train"]["label"]))
print(Counter(y_train_balanced))

Counter({0: 9190, 2: 598, 1: 253})
Counter({0: 9190, 2: 9190, 1: 9190})


### Naive Bayes

In [None]:
# Naive Bayes
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

In [None]:
# Gaussian NB
gnb = GaussianNB()

y_pred_gnb = gnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_gnb)

Confusion matrix: 
[[767  48  51]
 [ 17   4   4]
 [ 87   5  17]]
Classification: 
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       866
           1       0.07      0.16      0.10        25
           2       0.24      0.16      0.19       109

    accuracy                           0.79      1000
   macro avg       0.40      0.40      0.39      1000
weighted avg       0.79      0.79      0.79      1000

MCC: 
0.1039066729068978


In [None]:
mnb = MultinomialNB()

y_pred_mnb = mnb.fit(train_vector, y_train).predict(test_vector)
get_results(y_test, y_pred_mnb)

Confusion matrix: 
[[710  73  83]
 [  9   9   7]
 [ 47  16  46]]
Classification: 
              precision    recall  f1-score   support

           0       0.93      0.82      0.87       866
           1       0.09      0.36      0.15        25
           2       0.34      0.42      0.38       109

    accuracy                           0.77      1000
   macro avg       0.45      0.53      0.46      1000
weighted avg       0.84      0.77      0.80      1000

MCC: 
0.2789394453912142


### FastText

In [None]:
# Przygotowanie pliku pod fasttext
with open("train_data.txt", 'wb') as f:
    f.writelines([f"__label__{label} {sentence}\n".encode() for sentence, label in zip(x_train, y_train)])

In [None]:
model_fast = fasttext.train_supervised("train_data.txt")

y_pred_fast, propabilities = model_fast.predict(x_test)
y_pred_fast = [int(label[0].split("__label__")[1]) for label in y_pred_fast]
get_results(y_test, y_pred_fast)

Confusion matrix: 
[[ 41  83 742]
 [  1   1  23]
 [  1   0 108]]
Classification: 
              precision    recall  f1-score   support

           0       0.95      0.05      0.09       866
           1       0.01      0.04      0.02        25
           2       0.12      0.99      0.22       109

    accuracy                           0.15      1000
   macro avg       0.36      0.36      0.11      1000
weighted avg       0.84      0.15      0.10      1000

MCC: 
0.0664843954832379


### Transformers

In [None]:
dkleczek_bert_y_pred, dkleczek_bert_predictor, dkleczek_bert_learner = transformer_model('dkleczek/bert-base-polish-cased-v1', x_train, y_train, x_test)
get_results(y_test, dkleczek_bert_y_pred)

Downloading:   0%|          | 0.00/459 [00:00<?, ?B/s]

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Downloading:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

404 Client Error: Not Found for url: https://huggingface.co/dkleczek/bert-base-polish-cased-v1/resolve/main/tf_model.h5


Is Multi-Label? False




Downloading:   0%|          | 0.00/531M [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[821  38   7]
 [ 21   4   0]
 [ 85  13  11]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       866
           1       0.07      0.16      0.10        25
           2       0.61      0.10      0.17       109

    accuracy                           0.84      1000
   macro avg       0.52      0.40      0.40      1000
weighted avg       0.84      0.84      0.81      1000

MCC: 
0.1654468564271027


## LIME

In [9]:
class_names = ["0", "1", "2"]

In [10]:
idx_tp = 0
idx_fn = 24
idx_tn = 31
idx_fp = 148

### Naive Bayes

Gaussian NB

In [None]:
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(x_train).todense()
test_vector = vectorizer.transform(x_test).todense()

NameError: ignored

In [None]:
pipeline = Pipeline([
     ('vectorizer', vectorizer), 
     ('to_dense', DenseTransformer()), 
     ('classifier', gnb)
])
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], pipeline.predict_proba)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[gnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Multinomial NB

In [None]:
c = make_pipeline(vectorizer, mnb)
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], c.predict_proba)

print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', class_names[mnb.predict(test_vector[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

### FastText

In [None]:
classifier  = model_fast # ładujemy nasz wytrenowany model z fasttextu

def tokenize_string(string): # funkcja która dzieli tekst tak samo jak fasttext.
    return string.split()

# nasz klasyfikator był uczony pojedynczymi słowami
explainer = LimeTextExplainer(split_expression=tokenize_string, class_names=class_names)

def fasttext_pred(classifier, texts):
    res = []
    labels, probabilities = classifier.predict(texts, 3) # teksty z 2 labelkami

    for label, probs, text in zip(labels, probabilities, texts):
        order = np.argsort(np.array(label))
        res.append(probs[order])

    return np.array(res)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

#FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], classifier_fn=lambda x: fasttext_pred(classifier, x))
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', classifier.predict(x_test[idx])[0])
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

### Transformers

Kleczek - bert

In [None]:
predictor = dkleczek_bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

Bert

In [11]:
bert_y_pred, bert_predictor, bert_learner = transformer_model('bert-base-multilingual-cased', x_train, y_train, x_test)
get_results(y_test, bert_y_pred)

predictor = bert_predictor
explainer = LimeTextExplainer(class_names=class_names)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Is Multi-Label? False


Downloading:   0%|          | 0.00/1.08G [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix: 
[[814  35  17]
 [ 19   2   4]
 [ 81  10  18]]
Classification: 
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       866
           1       0.04      0.08      0.06        25
           2       0.46      0.17      0.24       109

    accuracy                           0.83      1000
   macro avg       0.46      0.40      0.40      1000
weighted avg       0.82      0.83      0.82      1000

MCC: 
0.18953033681455808


In [12]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

Output hidden; open in https://colab.research.google.com to view.

XLM

In [None]:
xlm_y_pred, xlm_predictor, xlm_learner = transformer_model('xlm-roberta-base', x_train, y_train, x_test)
get_results(y_test, xlm_y_pred)

predictor = xlm_predictor
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# TP
idx = idx_tp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FN
idx = idx_fn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# TN
idx = idx_tn

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("True negative")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# FP
idx = idx_fp

exp = explainer.explain_instance(x_test[idx], predictor.predict_proba, num_features=25)
print("False positive")
print('Document id: %d' % idx)
print('Predicted class =', predictor.predict(x_test[idx]))
print('True class: %s' % y_test[idx])
exp.show_in_notebook(show_predicted_value=True)

# Odpowiedzi

Wyniki z rożnych metryk

In [None]:
Base

							              Task 1 												                    Task 2
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| Classfier              | Accuracy | Mac precision | Mac Recall | Mac F1 ||| Accuracy | Mac precision | Mac Recall | Mac F1 |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| Naive Bayes (Gaussian) | 0.78		  | 0.57          | 0.58       | 0.57   ||| 0.79	   | 0.40          | 0.41       | 0.40   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| Naive Bayes (Multi)    | 0.87   	| 0.93          | 0.50       | 0.47   ||| 0.87	   | 0.29          | 0.33       | 0.31   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| FastText               | 0.87   	| 0.74          | 0.57       | 0.59   ||| 0.87	   | 0.57          | 0.37       | 0.37   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: dkleczek_bert       | 0.90   	| 0.88          | 0.63       | 0.68   ||| 0.88	   | 0.60          | 0.44       | 0.47   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: allegro_herbert     | 0.87   	| 0.43          | 0.50       | 0.46   ||| 0.88	   | 0.53          | 0.39       | 0.41   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: geotrend_distilbert | 0.87   	| 0.43          | 0.50       | 0.46   ||| 0.87	   | 0.29          | 0.33       | 0.31   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: geotrend_bert       | 0.87   	| 0.43          | 0.50       | 0.46   ||| 0.87	   | 0.29          | 0.33       | 0.31   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: xlm                 | 0.87   	| 0.43          | 0.50       | 0.46   ||| 0.87	   | 0.29          | 0.33       | 0.31   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: distilbert          | 0.88   	| 0.85          | 0.55       | 0.56   ||| 0.87	   | 0.29          | 0.33       | 0.31   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: bert                | 0.87   	| 0.73          | 0.55       | 0.56   ||| 0.87	   | 0.51          | 0.34       | 0.81   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *

Balanced
							              Task 1 												                    Task 2
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| Classfier              | Accuracy | Mac precision | Mac Recall | Mac F1 ||| Accuracy | Mac precision | Mac Recall | Mac F1 |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| Naive Bayes (Gaussian) | 0.78     | 0.55          | 0.56       | 0.55   ||| 0.79	   | 0.40          | 0.40       | 0.39   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| Naive Bayes (Multi)    | 0.82     | 0.66          | 0.72       | 0.68   ||| 0.77	   | 0.45          | 0.53       | 0.46   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| FastText               | 0.29     | 0.57          | 0.58       | 0.29   ||| 0.15	   | 0.36          | 0.36       | 0.11   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: dkleczek_bert       | 0.88     | 0.76          | 0.59       | 0.61   ||| 0.84	   | 0.54          | 0.41       | 0.41   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: allegro_herbert     | 0.13     | 0.07          | 0.50       | 0.12   ||| 0.03	   | 0.01          | 0.33       | 0.02   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: geotrend_distilbert | 0.80     | 0.57          | 0.58       | 0.58   ||| 0.77	   | 0.46          | 0.38       | 0.37   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: geotrend_bert       | 0.79     | 0.57          | 0.58       | 0.58   ||| 0.78	   | 0.48          | 0.39       | 0.36   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: xlm                 | 0.85     | 0.66          | 0.64       | 0.65   ||| 0.82	   | 0.49          | 0.43       | 0.43   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: distilbert          | 0.84     | 0.63          | 0.58       | 0.59   ||| 0.83	   | 0.48          | 0.38       | 0.39   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *
| T: bert                | 0.86     | 0.68          | 0.62       | 0.64   ||| 0.83	   | 0.45          | 0.38       | 0.38   |
* ---------------------- * -------- * ------------- * ---------- * ------ *** -------- * ------------- * ---------- * ------ *

Macro liczy dla każdej klasy osobno i bierze średnią.
Micro bierze wszystkie i liczy dopiero. Micro jest lepsze jak jest wiecej jednej klasy niz drugiej.

Wybieram F1 macro

Do task1 wybrany dkleczek_bert bo miał najlepsze F1 macro.
U nas 0 to "dobre", 1 to "złe"
    0  1
 0 TP FN
 1 FP TN
TP - ma być 0 = wybrał 0
TN - ma być 1 = wybrał 1
FP - ma być 0 = wybrał 1
FN - ma być 1 = wybrał 0

Wyniki z wybranych Tp, FN, TN, FP

In [None]:
Base
							            Task 1 									Task 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Classfier              | TP | FN | TN | FP ||| TP | FN | TN | FP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Gaussian) | TP | TN | FN | TP ||| TP | TNm| FN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Multi)    | TP | FN | FN | TP ||| TP | FN | FN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| FastText               | TP | FN | FN | TP ||| TP | FN | FN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: dkleczek_bert !!!   | TP!| TN!| TN!| FP!||| TP | TNm| TN | FP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: xlm                 |    |    |    |    |||    |    |    |    |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: bert                | TP | FN | FN | TP ||| TP | FN | FN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
Oznaczenia tutaj mówią o tym czy dany TP/TN/TF/FP się zgadza czyli.
jeżeli miał być 1 i wybrał 0 dla przykładu FN to jest ok.
To co ma jedynkę jest bazą czyli tam gdzie miał być TP było  TP, tam gdzie FP to FP itd.
TP - ma być 0 = wybrał 0
TN - ma być 1 = wybrał 1
FP - ma być 0 = wybrał 1
FN - ma być 1 = wybrał 0

Balanced
							            Task 1 									Task 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Classfier              | TP | FN | TN | FP ||| TP | FN | TN | FP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Gaussian) | TP | FN | FN | TP ||| TP | FN | FN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Multi)    | TP | TN | FN | TP ||| TP | TNm| FN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| FastText               | FP | TN | TN | FP ||| FP | TNm| TN | FP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: dkleczek_bert !!!   | TP | TN | TN | FP ||| TP | FN | TN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: xlm                 |    |    |    |    |||    |    |    |    |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: bert                | TP | TN | TN | FP ||| TP | TNm| TN | TP |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *

Dla taska 2 liczymy 1 i 2 razem (jako negative).
m przy task02 oznacza że trafił w TP TN ale np. dał 1 zamiast 2.

To samo tylko czy podał poprawną odpowiedź

In [None]:
Base
							            Task 1 									Task 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Classfier              | x  |    | x  |    ||| x  |    | x  |    | 2 + 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Gaussian) | x  | x  |    | x  ||| x  | x m|    | x  | 3 + 3
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Multi)    | x  |    |    | x  ||| x  |    |    | x  | 2 + 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| FastText               | x  |    |    | x  ||| x  |    |    | x  | 2 + 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: dkleczek_bert !!!   | x! | x! | x! |  ! ||| x  | x m| x  |    | 3 + 3
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: xlm                 |    |    |    |    |||    |    |    |    |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: bert                | x  |    |    | x  ||| x  |    |    | x  | 2 + 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *

Balanced
							            Task 1 								Task 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Classfier              | x  |    | x  |    ||| x  |    | x  |    | 2 + 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Gaussian) | x  |    |    | x  ||| x  |    |    | x  | 2 + 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| Naive Bayes (Multi)    | x  | x  |    | x  ||| x  | x m|    | x  | 3 + 3
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| FastText               |    | x  | x  |    |||    | x m| x  |    | 2 + 2
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: dkleczek_bert !!!   | x  | x  | x  |    ||| x  |    | x  | x  | 3 + 3
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: xlm                 |    |    |    |    |||    |    |    |    |
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *
| T: bert                | x  | x  | x  |    ||| x  | x  | x  | x  | 3 + 4
* ---------------------- * -- * -- * -- * -- *** -- * -- * -- * -- *


# Answer the following questions: 

- Which of the classifiers works the best for the task 1 and the task 2.
Zarówno dla task 01 jak i task 02(obecnie) najlepiej poradziłsobie klasyfikator bayesa.
- Did you achieve results comparable with the results of PolEval Task?
http://2019.poleval.pl/index.php/results/
Od tych wyników najlepszych to na pewno nie, ale wiele z tych niższych pobijają teoretycznie pod względem wartości metryk.
- Did you achieve results comparable with the Klej leaderboard?
https://klejbenchmark.com/leaderboard/  (CBD)
Generalnie sporo gorzej od tych najlepszych, ale po 10 miejscu zaczyna się to robić podobne. Była tam XLM oraz Multilingual bert (jeszcze nie am wyniku).
- Describe strengths and weaknesses of each of the compared algorithms.
Naive Bayess patrząc tylko na wyniki w tabelach (dla wybranych 4 TP...) dawał wyniki najlepsze. Rozszerzanie zbioru pogarszało niestety wyniki. Dodatkowo patrząc na wyniki z LIMEa to generalnie praktycznie się nie mylił (poza 1 przypadkiem). Powiedziałbym że jest doś prostolinijny i to byłoby jego zaletą.
Fasttext - co do tabelek to radził sobie średnio, generalnie pół na pół. Wyniki różnych metryk miał całkiem dobre, ale rozszerzanie zbioru wyszło mu na gorsze, bo zaczął łapać mało znaczące słowa jako ważne (chociażby @anonymous_acount). Bez rozszerzania generalnie radził sobie całkiem dobrze, ale widać też wiele różnych "wpływow" wiekszośc z wybranych przykładów jest jednoznaczna, ale w fasttext są one podzielone i nawet jest 50/50 jeden. W Fasttext te różne wagi dobrze pokazują jak on działa.
Modele transformery...
- Do you think comparison of raw performance values on a single task is enough to assess the value of a given algorithm/model?
Raczej nie. Te modele dają bardzo różne wyniki. Tym bardiej jak rozszerzyłem zbiór to okazało się że  te wyniki były zupełnie inne, a modele transformerowe się uczyły lepiej.
- Did SHAP show that the models use valuable features/words when performing their decision?
?