In [62]:
import numpy as np
import pandas as pd

# load predictions from tsv

after training, a tsv is generated with test set predictions vs ground truth

In [123]:
# load data from generated tsv
csv_file = "./experiments/wili2018/testset.tsv"
df = pd.read_csv(csv_file, sep="\t")
true_labels = df["true_label"].to_list()
true_names = df["true_name"].to_list()
true_texts = df["sample"].to_list()
pred_labels = df["pred_label"].to_list()
assert len(true_labels) == len(pred_labels) == len(true_texts)
assert len(set(true_labels)) == 235
# create mapping from labels to full names
lang_map = dict(list(zip(true_labels, true_names)))
print(f"test data has {len(true_labels)} samples, {len(set(true_labels))} labels")

test data has 117500 samples, 235 labels


In [11]:
# check label distribution
from collections import Counter
counts = Counter(true_labels)
print(f"set of counts per label: {set(counts.values())}")

set of counts per label: {500}


## statistics

In [83]:
# macro precision, recall, fl
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
prc = precision_score(y_true=true_labels, y_pred=pred_labels, average="macro")
rcl = recall_score(y_true=true_labels, y_pred=pred_labels, average="macro")
f1b = f1_score(y_true=true_labels, y_pred=pred_labels, average="macro")
print(f"classification results, test set:\n")
print(f"macro precision: {prc:.5f}")
print(f"macro recall:    {rcl:.5f}")
print(f"macro F1:        {f1b:.5f}")

classification results, test set:

macro precision: 0.91144
macro recall:    0.90686
macro F1:        0.90768


In [70]:
# quick and dirty way to get per-class metrics
results = str(classification_report(y_true=true_labels, y_pred=pred_labels, digits=5)).split("\n")
line_results = results[2:-5]
cls_scores = [line.split() for line in line_results]
assert set(([len(x) for x in cls_scores])) == {5}
# cls_scores are now 5-element tuples of (lang-code, precision, recall, f1, support)

In [79]:
# get statistics about class F1 scores
cls_f1_scores = [float(t[3]) for t in cls_scores]
print(f"class f1 score statistics\n")
print(f"minimum F1: {np.min(cls_f1_scores)}")
print(f"10 %ile F1: {np.percentile(cls_f1_scores, 10)}")
print(f"median  F1: {np.median(cls_f1_scores)}")
print(f"mean    F1: {np.mean(cls_f1_scores):.5f}, std: {np.std(cls_f1_scores):.5f}")
print(f"90 %ile F1: {np.percentile(cls_f1_scores, 90)}")
print(f"maximum F1: {np.max(cls_f1_scores)}")

class f1 score statistics

minimum F1: 0.4425
10 %ile F1: 0.79118
median  F1: 0.94405
mean    F1: 0.90768, std: 0.10524
90 %ile F1: 0.99156
maximum F1: 1.0


In [80]:
# get highestt-F1 languages
print(f"ten languages by highest F1 score:\n")
for cls_score in sorted(cls_scores, key=lambda x: x[3])[-10:]:
    try:
        print(f"lang: {lang_map.get(cls_score[0], 'nan'):<16} {cls_score[0]:<8}\tF1: {cls_score[3]}, prc: {cls_score[1]}, rcl: {cls_score[2]}")
    except:
        print(cls_score)

ten languages by highest F1 score:

lang: nan              nan     	F1: 0.99501, prc: 0.99205, rcl: 0.99800
lang: Maori            mri     	F1: 0.99599, prc: 0.99799, rcl: 0.99400
lang: Malagasy         mlg     	F1: 0.99600, prc: 0.99600, rcl: 0.99600
lang: Uighur           uig     	F1: 0.99699, prc: 1.00000, rcl: 0.99400
lang: Lojban           jbo     	F1: 0.99800, prc: 0.99602, rcl: 1.00000
lang: Burmese          mya     	F1: 0.99800, prc: 1.00000, rcl: 0.99600
lang: Tibetan          bod     	F1: 0.99900, prc: 1.00000, rcl: 0.99800
lang: Dhivehi          div     	F1: 0.99900, prc: 1.00000, rcl: 0.99800
lang: Central Kurdish  ckb     	F1: 1.00000, prc: 1.00000, rcl: 1.00000
lang: Navajo           nav     	F1: 1.00000, prc: 1.00000, rcl: 1.00000


In [81]:
# get lowest-F1 languages
print(f"ten languages by lowest F1 score:\n")
for cls_score in sorted(cls_scores, key=lambda x: x[3])[:10]:
    print(f"lang: {lang_map.get(cls_score[0], 'nan'):<16} {cls_score[0]:<8}\tF1: {cls_score[3]}, prc: {cls_score[1]}, rcl: {cls_score[2]}")

ten languages by lowest F1 score:

lang: Croatian         hrv     	F1: 0.44250, prc: 0.43156, rcl: 0.45400
lang: Bosnian          bos     	F1: 0.44815, prc: 0.55457, rcl: 0.37600
lang: Pampanga         pam     	F1: 0.46772, prc: 0.82741, rcl: 0.32600
lang: Serbo-Croatian   hbs     	F1: 0.47122, prc: 0.42810, rcl: 0.52400
lang: Indonesian       ind     	F1: 0.48993, prc: 0.55584, rcl: 0.43800
lang: English          eng     	F1: 0.54071, prc: 0.43890, rcl: 0.70400
lang: Malay            msa     	F1: 0.56330, prc: 0.55299, rcl: 0.57400
lang: German           deu     	F1: 0.60388, prc: 0.47138, rcl: 0.84000
lang: Banyumasan       map-bms 	F1: 0.61326, prc: 0.53858, rcl: 0.71200
lang: Chavacano        cbk     	F1: 0.66294, prc: 0.62021, rcl: 0.71200


## qualitative analysis

both English and German are both among the worst classified, so let's check out examples

i'm not exactly a `pandas` master yet so forgive the list comprehensions below

### German

In [104]:
# get most common false negatives
lang = "deu"

false_negs = [pred_labels[i] for i in range(len(true_labels)) if true_labels[i] == lang and pred_labels[i] != lang]
print(f"most common languages for {lang_map[lang]} false negatives\n")
for lang_code, count in Counter(false_negs).most_common(10):
    print(f"{lang_map[lang_code]:<22} {lang_code:<8}: {count:>3}")
print("\n")

# get most common false positives
false_pos = [true_labels[i] for i in range(len(true_labels)) if true_labels[i] != lang and pred_labels[i] == lang]
print(f"most common languages for {lang_map[lang]} false positives\n")
for lang_code, count in Counter(false_pos).most_common(10):
    print(f"{lang_map[lang_code]:<22} {lang_code:<8}: {count:>3}")

most common languages for German false negatives

Pennsylvania German    pdc     :  18
Low German             nds     :  15
Alemannic German       als     :   9
Palatine German        pfl     :   4
Saterfriesisch         stq     :   3
Luxembourgish          ltz     :   3
Bavarian               bar     :   3
Albanian               sqi     :   2
Dutch                  nld     :   2
Bulgarian              bul     :   2


most common languages for German false positives

Pampanga               pam     : 261
Alemannic German       als     :  40
Bavarian               bar     :  34
Low German             nds     :  20
Palatine German        pfl     :  19
Pennsylvania German    pdc     :  17
Luxembourgish          ltz     :  12
Aromanian              rup     :   6
Saterfriesisch         stq     :   5
Albanian               sqi     :   5


for the most part, this looks reasonable from a linguistic and regional perspective.

most languages with the highest confusion with 'German' label are dialects of German, are Germanic languages, or are regionally proximate.

it is odd that `Pampanga` has so many false positives though, so let's take a qualitative look.

it seems there are some mislabeled entries in the WiLI-2018 dataset.

In [124]:
true_lang = "pam"
pred_lang = "deu"
print(f"'{true_lang}' tagged samples that were predicted '{pred_lang}':\n")
mistaken_samples = [true_texts[i] for i in range(len(true_labels)) 
                    if true_labels[i] == true_lang 
                    and pred_labels[i] == pred_lang]
for idx in range(10):
    print(f"- {mistaken_samples[idx]}")

print(f"\ncompare to:\n")

print(f"'{true_lang}' tagged samples, correctly identified:\n")
mistaken_samples = [true_texts[i] for i in range(len(true_labels)) 
                    if true_labels[i] == true_lang 
                    and pred_labels[i] == true_lang]
for idx in range(10):
    print(f"- {mistaken_samples[idx]}")

'pam' tagged samples that were predicted 'deu':

- Unter MS-DOS gibt es beispielsweise Companion-Viren, die zu einer ausführbaren EXE-Datei eine versteckte Datei gleichen Namens mit der Endung „.com“ erstellen, die dann nur das Virus enthält. Wird in der Kommandozeile von MS-DOS ein Programmname ohne Endung eingegeben, sucht das Betriebssystem zuerst nach Programmen mit der Endung „.com“ und danach erst nach Programmen mit der Endung „.exe“, so dass der Schädling vor dem eigentlichen Programm in der Suchreihenfolge erscheint und aufgerufen wird. Der Schädling führt, nachdem er sich meist im Arbeitsspeicher festgesetzt hat, das ursprüngliche Programm aus, so dass der Benutzer oft nichts von der Infektion bemerkt. Überschreibende
- Im Jahr 2002 wurde das erste Virus geschrieben, das sowohl Win32-Anwendungen als auch ELF-Dateien (zum Beispiel Linux-Anwendungen) infizieren konnte. Dieses Virus kann als das Einläuten eines neuen Zeitalters der Viren gesehen werden.
- Heutzutage sind Compute

just to check, let's find the line numbers for one of the mislabeled samples, and check against the label data.

it seems there are (near?) duplicates in the test data, and in this case, all entries are mis-labeled

In [131]:
!grep -in "Unter MS-DOS gibt es beispielsweise Companion-Viren" ./datasets/WiLi_2018/x_test.txt

407:Unter MS-DOS gibt es beispielsweise Companion-Viren, die zu einer ausführbaren EXE-Datei eine versteckte Datei gleichen Namens mit der Endung „.com“ erstellen, die dann nur das Virus enthält. Wird in der Kommandozeile von MS-DOS ein Programmname ohne Endung eingegeben, sucht das Betriebssystem zuerst nach Programmen mit der Endung „.com“ und danach erst nach Programmen mit der Endung „.exe“, so dass der Schädling vor dem eigentlichen Programm in der Suchreihenfolge erscheint und aufgerufen wird. Der Schädling führt, nachdem er sich meist im Arbeitsspeicher festgesetzt hat, das ursprüngliche Programm aus, so dass der Benutzer oft nichts von der Infektion bemerkt. Überschreibende
29415:Unter MS-DOS gibt es beispielsweise Companion-Viren, die zu einer ausführbaren EXE-Datei eine versteckte Datei gleichen Namens mit der Endung „.com“ erstellen, die dann nur das Virus enthält. Wird in der Kommandozeile von MS-DOS ein Programmname ohne Endung eingegeben, sucht das Betriebssystem zuerst n

In [134]:
!sed -n 407p ./datasets/WiLi_2018/y_test.txt

pam


In [135]:
!sed -n 29415p ./datasets/WiLi_2018/y_test.txt

pam


In [136]:
!sed -n 61464p ./datasets/WiLi_2018/y_test.txt

pam


In [138]:
!wc -l ./datasets/WiLi_2018/x_test.txt

117500 ./datasets/WiLi_2018/x_test.txt


In [139]:
!wc -l ./datasets/WiLi_2018/y_test.txt

117500 ./datasets/WiLi_2018/y_test.txt


### English

In [105]:
# get most common false negatives
lang = "eng"

false_negs = [pred_labels[i] for i in range(len(true_labels)) if true_labels[i] == lang and pred_labels[i] != lang]
print(f"most common languages for {lang_map[lang]} false negatives\n")
for lang_code, count in Counter(false_negs).most_common(10):
    print(f"{lang_map[lang_code]:<22} {lang_code:<8}: {count:>3}")
print("\n")

# get most common false positives
false_pos = [true_labels[i] for i in range(len(true_labels)) if true_labels[i] != lang and pred_labels[i] == lang]
print(f"most common languages for {lang_map[lang]} false positives\n")
for lang_code, count in Counter(false_pos).most_common(10):
    print(f"{lang_map[lang_code]:<22} {lang_code:<8}: {count:>3}")

most common languages for English false negatives

Chavacano              cbk     :  25
Scots                  sco     :  23
Xhosa                  xho     :   9
Igbo                   ibo     :   8
Yoruba                 yor     :   7
Fiji Hindi             hif     :   5
Shona                  sna     :   5
Newari                 new     :   5
Kinyarwanda            kin     :   4
Pennsylvania German    pdc     :   3


most common languages for English false positives

Scots                  sco     :  58
Igbo                   ibo     :  27
Central Khmer          khm     :  25
Konkani                kok     :  19
Yoruba                 yor     :  19
Old English            ang     :  17
Picard                 pcd     :  17
Avar                   ava     :  16
Pangasinan             pag     :  14
Fiji Hindi             hif     :  12


In [128]:
true_lang = "eng"
pred_lang = "cbk"
print(f"'{true_lang}' tagged samples that were predicted '{pred_lang}':\n")
mistaken_samples = [true_texts[i] for i in range(len(true_labels)) 
                    if true_labels[i] == true_lang 
                    and pred_labels[i] == pred_lang]
for idx in range(10):
    print(f"- {mistaken_samples[idx]}")

print(f"\ncompare to:\n")

print(f"'{pred_lang}' tagged samples, correctly identified:\n")
mistaken_samples = [true_texts[i] for i in range(len(true_labels)) 
                    if true_labels[i] == pred_lang 
                    and pred_labels[i] == pred_lang]
for idx in range(10):
    print(f"- {mistaken_samples[idx]}")

'eng' tagged samples that were predicted 'cbk':

- Inclusive Democracy (ID) is a project that aims for direct democracy; economic democracy in a stateless, moneyless and marketless economy; self-management (democracy in the social realm); and ecological democracy.
- 1st place in Freedom of the Press, Best Local Non-Staff Column, Best Investigative/In-depth Story or Series, Best News Feature Story, Best Spot News Story, Best Special Section or Campaign (Advertising), Best Editorial Writing, Best-In-House Promotion, Best Illustration, Best Information Graphic, Best Non-staff story, Editorial of the Year, Best Online Writing in 2013, Nevada Press Association
- Following the Seagram Building He built several smaller projects in a more personal, expressive style, with ornament touches and features far from the sobriety of the modernist style; the Synagogue of Port Chester New York, with a plaster vaulted ceiling and narrow colored windows (1954–56); the Art Gallery of the University of Nebr

In [126]:
true_lang = "ibo"
pred_lang = "eng"
print(f"'{true_lang}' tagged samples that were predicted '{pred_lang}':\n")
mistaken_samples = [true_texts[i] for i in range(len(true_labels)) 
                    if true_labels[i] == true_lang 
                    and pred_labels[i] == pred_lang]
for idx in range(10):
    print(f"- {mistaken_samples[idx]}")

print(f"\ncompare to:\n")

print(f"'{true_lang}' tagged samples, correctly identified:\n")
mistaken_samples = [true_texts[i] for i in range(len(true_labels)) 
                    if true_labels[i] == true_lang 
                    and pred_labels[i] == true_lang]
for idx in range(10):
    print(f"- {mistaken_samples[idx]}")

'ibo' tagged samples that were predicted 'eng':

- Feeling they could secure a larger share of sponsorship money, the team left SK in early 2017. Later in the year, some members returned to SK Gaming, forcing GOB.U to replace them.
- In the episode "Wikiality" of The Colbert Report, host Stephen Colbert has instigated his viewers to vandalize articles in humorous ways, once doing so on the Wikipedia article on elephants. "Weird Al" Yankovic's character in his video 'White & Nerdy' is seen vandalising the entry for the Atlantic record label with the exclamation "You suck!," after they rescinded permission for a parody.
- Concerns have also been raised regarding the lack of accountability that results from users' anonymity, and that it is vulnerable to vandalism and Internet trolls. For example, false information was introduced into the biography of John Seigenthaler, Sr. and remained undetected for four months.
- Most wikis offer at least a title search, and sometimes a full-text search