In [1]:
import pandas as pd
from collections import Counter

### Load dataset

In [2]:
import pandas as pd
from pathlib import Path
from utils import loadNormData

# Map "<parent dir>-<file name without '.norm'>" (e.g. "en-dev") to a list of
# {"raw": ..., "norm": ...} rows, one row per pair returned by loadNormData.
datasets = {}
# Iterate in sorted order: glob order depends on the filesystem, and the insertion
# order of `datasets` decides the row order of everything built from it afterwards.
for p in sorted(Path("./datasets").rglob("*")):
    if not p.is_file():
        continue
    name = f"{p.parent.name}-{p.name.replace('.norm', '')}"
    # presumably two parallel sequences (raw sentences, normalized sentences) — verify in utils
    raw, norm = loadNormData(p)
    datasets[name] = [{"raw": r, "norm": n} for r, n in zip(raw, norm)]

pd.DataFrame(datasets['en-dev']).head(3)

Unnamed: 0,raw,norm
0,"[@cdutra5, bruh, get, out, yo, feelings, lol]","[@cdutra5, brother, get, out, your, feelings, ..."
1,"[rt, @demberel_s, :, manan, dund, xaragdax, te...","[rt, @demberel_s, :, manan, dund, xaragdax, te..."
2,"[why, dese, niggas, think, dey, doin, summn]","[why, these, niggers, think, they, doing, some..."


### Separate train/validation/test

In [3]:
from datasets import Dataset, DatasetDict, concatenate_datasets

# Bucket every "<lang>-<split>" entry of `datasets` under its split name.
splits = {"train": [], "validation": [], "test": []}

for key, rows in datasets.items():
    lang_code, split_name = key.split("-", 1)  # e.g. en-train, de-dev
    if split_name == "dev":
        split_name = "validation"

    # Entries whose split is not one of the three buckets are left out.
    if split_name not in splits:
        continue

    lang_ds = Dataset.from_list(rows)
    # Tag every row with the language code taken from the key.
    lang_ds = lang_ds.add_column("lang", [lang_code] * len(lang_ds))
    splits[split_name].append(lang_ds)

# One concatenated dataset per split; buckets that stayed empty are dropped.
out = DatasetDict({
    split_name: concatenate_datasets(parts)
    for split_name, parts in splits.items()
    if parts
})

out

DatasetDict({
    train: Dataset({
        features: ['raw', 'norm', 'lang'],
        num_rows: 39178
    })
    validation: Dataset({
        features: ['raw', 'norm', 'lang'],
        num_rows: 8408
    })
    test: Dataset({
        features: ['raw', 'norm', 'lang'],
        num_rows: 11956
    })
})

In [4]:
# Distinct language codes: the part of each dataset key before the first "-".
print({key.partition('-')[0] for key in datasets})

{'es', 'iden', 'ja', 'it', 'nl', 'th', 'tr', 'sl', 'da', 'vi', 'hr', 'ko', 'de', 'sr', 'trde', 'en', 'id'}


### Separate test data

In [5]:
from utils import sampling_dev

# The complete test split, plus a sample of it drawn by sampling_dev with ratio=0.5.
full_test = out["test"]
dev_test = sampling_dev(full_test, ratio=0.5)

# Row counts per language before and after sampling; a language absent from one
# side shows up as 0.
rows_before = Counter(full_test["lang"])
rows_after = Counter(dev_test["lang"])
df = pd.DataFrame({"before": rows_before, "after": rows_after})
df = df.fillna(0).astype(int)

# Share of each language's rows that survived the sampling.
df["kept_ratio"] = df["after"] / df["before"]
df

Unnamed: 0,before,after,kept_ratio
da,181,90,0.497238
de,583,291,0.499142
en,1967,983,0.499746
es,531,265,0.499058
hr,1586,793,0.5
id,861,430,0.499419
iden,165,82,0.49697
it,100,50,0.5
ja,609,304,0.499179
ko,214,107,0.5


In [6]:
def _mask_norm(rows):
    """Return new rows that keep ``raw`` and ``lang`` but blank out ``norm``.

    The masked ``norm`` is a list of empty strings with the same length as the
    original ``norm`` of that row.
    """
    masked = []
    for row in rows:
        blank = [""] * len(row['norm'])
        masked.append({"raw": row['raw'], "lang": row['lang'], "norm": blank})
    return masked


# Mask
input_full_test = _mask_norm(full_test)
input_dev_test = _mask_norm(dev_test)

#### Private data

In [7]:
from utils import save_data

## final phase: masked rows go to input_data, unmasked rows to reference_data
input_path = "bundle/final_phase/input_data/input.json"
label_path = "bundle/final_phase/reference_data/label.json"
final_labels = list(full_test)
save_data(input_full_test, input_path)
save_data(final_labels, label_path)

## dev phase: same layout, built from the sampled test rows
dev_input_path = "bundle/dev_phase/input_data/input.json"
dev_label_path = "bundle/dev_phase/reference_data/label.json"
dev_labels = list(dev_test)
save_data(input_dev_test, dev_input_path)
save_data(dev_labels, dev_label_path)

In [8]:
from collections import defaultdict

# Language codes reported separately from the rest in the result tables.
newlangs = ['th', 'vi', 'ja', 'ko', 'id']

# Language code -> display name used for the "languages" column.
code2lang = {
    'th': 'Thai',
    'vi': 'Vietnamese',
    'id': 'Indonesian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'hr': 'Croatian',
    'da': 'Danish',
    'nl': 'Dutch',
    'en': 'English',
    'de': 'German',
    'iden': 'Indonesian-English',
    'it': 'Italian',
    'sr': 'Serbian',
    'sl': 'Slovenian',
    'es': 'Spanish',
    'tr': 'Turkish',
    'trde': 'Turkish-German',
}


def count_tokens_by_lang(dataset, split_name):
    """Sum the number of raw tokens per language within one split.

    Args:
        dataset: Mapping from split name to an iterable of examples; each example
            provides a ``"lang"`` value and a sized ``"raw"`` sequence.
        split_name: Key of the split to scan.

    Returns:
        A ``defaultdict(int)`` mapping the string form of each language value to
        the total ``len(example["raw"])`` over that language's examples.
    """
    totals = defaultdict(int)

    for example in dataset[split_name]:
        lang_key = f'{example["lang"]}'
        totals[lang_key] = totals[lang_key] + len(example["raw"])

    return totals

# Token counts per language, one column per split. A language that is absent from
# a split gets NaN in that column.
token_counts = {
    split_name: count_tokens_by_lang(out, split_name)
    for split_name in ("train", "validation", "test")
}

stats = pd.DataFrame(token_counts)
stats['total'] = stats.sum(axis=1)
# sort_index returns a new frame, so the result has to be assigned back for the
# rows to actually end up ordered by language code.
stats = stats.sort_index()
# Move the language codes out of the index into a regular "lang" column.
stats = stats.reset_index().rename(columns={"index": "lang"})
stats

Unnamed: 0,lang,train,validation,test,total
0,da,16448,,3758,20206.0
1,de,15006,4860.0,5082,24948.0
2,en,35216,9169.0,29421,73806.0
3,es,7189,,6635,13824.0
4,hr,54416,18941.0,15695,89052.0
5,id,35502,4306.0,8908,48716.0
6,iden,13949,4809.0,4366,23124.0
7,it,12645,,1996,14641.0
8,ja,61903,10919.0,22594,95416.0
9,ko,13130,1880.0,1567,16577.0


In [9]:
def norm_ratio(data, language):
    """Percentage of a language's tokens whose ``norm`` differs from ``raw``.

    Args:
        data: Object with a ``filter(predicate)`` method returning an iterable of
            examples; each example has ``"lang"``, ``"raw"`` and ``"norm"`` entries.
        language: Value compared against each example's ``"lang"``.

    Returns:
        ``(1 - unchanged / total) * 100`` rounded to two decimals, where ``total`` is
        the number of raw tokens and ``unchanged`` the number of positions at which
        raw and norm are equal. Returns ``0.0`` when the language has no tokens.

    Side effects:
        Prints the language, the token count, the unchanged count and the ratio.
    """
    data = data.filter(lambda x: x["lang"] == language)

    count, not_norm = 0, 0
    for items in data:
        count += len(items['raw'])
        # Tokens are compared position by position; equal pairs were left unchanged.
        not_norm += sum(r == n for r, n in zip(items['raw'], items['norm']))

    # A language with no tokens would otherwise raise ZeroDivisionError.
    ratio = round((1-(not_norm/count))*100, 2) if count else 0.0
    print(f"Lang: {language}: {count}, {not_norm}, {ratio}")
    return ratio

# Pool the three splits so the ratio is computed over all of them together.
all_splits = [out[split_name] for split_name in ("train", "validation", "test")]
data = concatenate_datasets(all_splits)


def _ratio_for(code):
    """norm_ratio of the pooled data for one language code."""
    return norm_ratio(data, code)


stats['norm'] = stats['lang'].apply(_ratio_for)
# Display name for each code; codes missing from code2lang map to NaN.
stats["languages"] = stats["lang"].map(code2lang)
stats.sort_values('languages')

Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: da: 20206, 18369, 9.09


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: de: 24948, 20609, 17.39


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: en: 73806, 68183, 7.62


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: es: 13824, 12790, 7.48


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: hr: 89052, 81781, 8.16


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: id: 48716, 25591, 47.47


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: iden: 23124, 19902, 13.93


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: it: 14641, 13614, 7.01


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: ja: 95416, 88713, 7.03


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: ko: 16577, 15327, 7.54


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: nl: 21657, 15412, 28.84


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: sl: 75276, 64038, 14.93


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: sr: 91738, 84512, 7.88


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: th: 200915, 192902, 3.99


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: tr: 8082, 5105, 36.83


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: trde: 16508, 12284, 25.59


Filter:   0%|          | 0/59542 [00:00<?, ? examples/s]

Lang: vi: 128685, 108119, 15.98


Unnamed: 0,lang,train,validation,test,total,norm,languages
4,hr,54416,18941.0,15695,89052.0,8.16,Croatian
0,da,16448,,3758,20206.0,9.09,Danish
10,nl,12381,3863.0,5413,21657.0,28.84,Dutch
2,en,35216,9169.0,29421,73806.0,7.62,English
1,de,15006,4860.0,5082,24948.0,17.39,German
5,id,35502,4306.0,8908,48716.0,47.47,Indonesian
6,iden,13949,4809.0,4366,23124.0,13.93,Indonesian-English
7,it,12645,,1996,14641.0,7.01,Italian
8,ja,61903,10919.0,22594,95416.0,7.03,Japanese
9,ko,13130,1880.0,1567,16577.0,7.54,Korean


#### Public data

In [10]:
from datasets import Dataset

def _public_bundle(masked_test_rows):
    """Build the public DatasetDict: train/validation from `out`, test from masked rows."""
    return DatasetDict({
        "train": out["train"],
        "validation": out["validation"],
        "test": Dataset.from_list(masked_test_rows),
    })


# Full phase
_public_bundle(input_full_test).push_to_hub("weerayut/multilexnorm2026-full-pub", private=True)

# Dev phase
_public_bundle(input_dev_test).push_to_hub("weerayut/multilexnorm2026-dev-pub", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/weerayut/multilexnorm2026-dev-pub/commit/e6817fb2943c9abb3d7840a946a9c60760b555e4', commit_message='Upload dataset', commit_description='', oid='e6817fb2943c9abb3d7840a946a9c60760b555e4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/weerayut/multilexnorm2026-dev-pub', endpoint='https://huggingface.co', repo_type='dataset', repo_id='weerayut/multilexnorm2026-dev-pub'), pr_revision=None, pr_num=None)

## Data

In [11]:
from datasets import load_dataset

# Load both published bundles back from the Hub.
FULL_PUB_REPO = "weerayut/multilexnorm2026-full-pub"
DEV_PUB_REPO = "weerayut/multilexnorm2026-dev-pub"

full_pub_data = load_dataset(FULL_PUB_REPO)
dev_pub_data = load_dataset(DEV_PUB_REPO)

(full_pub_data, dev_pub_data)

(DatasetDict({
     train: Dataset({
         features: ['raw', 'norm', 'lang'],
         num_rows: 39178
     })
     validation: Dataset({
         features: ['raw', 'norm', 'lang'],
         num_rows: 8408
     })
     test: Dataset({
         features: ['raw', 'norm', 'lang'],
         num_rows: 11956
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['raw', 'norm', 'lang'],
         num_rows: 39178
     })
     validation: Dataset({
         features: ['raw', 'norm', 'lang'],
         num_rows: 8408
     })
     test: Dataset({
         features: ['raw', 'norm', 'lang'],
         num_rows: 5972
     })
 }))

In [12]:
lang = "en"


def _is_selected_lang(example):
    """True for rows whose "lang" equals the module-level `lang`."""
    return example["lang"] == lang


# Restrict each split of the dev-phase bundle to the selected language.
en_train = dev_pub_data["train"].filter(_is_selected_lang)
en_validation = dev_pub_data["validation"].filter(_is_selected_lang)
en_test = dev_pub_data["test"].filter(_is_selected_lang)

en_train_preview = pd.DataFrame(en_train)
en_train_preview.head(4)

Unnamed: 0,raw,norm,lang
0,"[rt, @teddyferrari1, :, "", ah, ..., @datzmenon...","[rt, @teddyferrari1, :, "", ah, ..., @datzmenon...",en
1,"[u, have, a, very, sexy, header, @jaibrooks1, ...","[you, have, a, very, sexy, header, @jaibrooks1...",en
2,"[i, miss, u, my, bie, !, where, u, wanna, out,...","[i, miss, you, my, bie, !, where, you, want to...",en
3,"["", cantik, ., rt, @historyinpics, :, julie, c...","["", cantik, ., rt, @historyinpics, :, julie, c...",en


In [13]:
# First rows of the selected language's validation split.
en_validation_preview = pd.DataFrame(en_validation)
en_validation_preview.head(4)

Unnamed: 0,raw,norm,lang
0,"[@cdutra5, bruh, get, out, yo, feelings, lol]","[@cdutra5, brother, get, out, your, feelings, ...",en
1,"[rt, @demberel_s, :, manan, dund, xaragdax, te...","[rt, @demberel_s, :, manan, dund, xaragdax, te...",en
2,"[why, dese, niggas, think, dey, doin, summn]","[why, these, niggers, think, they, doing, some...",en
3,"[@tylermajewski, it's, about, more, than, numb...","[@tylermajewski, it's, about, more, than, numb...",en


In [14]:
# First rows of the selected language's public test split (norm is masked there).
en_test_preview = pd.DataFrame(en_test)
en_test_preview.head(4)

Unnamed: 0,raw,norm,lang
0,"[@ez_doesssit, yeh, but, still, that's, wild, ...","[, , , , , , ]",en
1,"[dick, in, janice, ,, im, poppin, xanax, and, ...","[, , , , , , , , , , ]",en
2,"[ucsb, i, fear, the, next, rampage, will, b, c...","[, , , , , , , , , , , , , , , , , , , , , , , ]",en
3,"[rt, @ahadmadriidyy, :, @m_salman_0, @yousefmo...","[, , , , , , , , , , , , , , ]",en


## Baseline

In [15]:
from utils import counting, mfr, evaluate


def mfrs(train_set, test_label, test_input, lang):
    """Score the `mfr` baseline for one language and return its ERR in percent.

    Args:
        train_set: Dataset the counts are built from (via ``counting``).
        test_label: Dataset whose ``norm`` column is passed to ``evaluate`` as the
            second argument.
        test_input: Dataset whose ``raw`` column is fed to ``mfr``.
        lang: Language code; all three datasets are filtered to rows with this
            ``"lang"`` value.

    Returns:
        The third value returned by ``evaluate``, multiplied by 100 and rounded to
        two decimals.

    Notes:
        ``test_label`` and ``test_input`` are filtered independently, so they are
        assumed to list the same rows in the same order — TODO confirm for callers.
        ``counting``, ``mfr`` and ``evaluate`` come from ``utils``; their exact
        semantics are not visible here.
    """
    print(f"\nLang: {lang}")

    def _same_lang(example):
        return example["lang"] == lang

    lang_train = train_set.filter(_same_lang)
    lang_labels = test_label.filter(_same_lang)
    lang_inputs = test_input.filter(_same_lang)
    counts = counting(lang_train)

    def _predict(tokens):
        return mfr(tokens, counts)

    frame = pd.DataFrame(lang_inputs)
    frame['pred'] = frame['raw'].apply(_predict)
    # Only the third returned value is used; the first two are ignored.
    _lai, _acc, err = evaluate(
        frame['raw'],
        lang_labels['norm'],
        frame['pred'],
        info=True,
    )

    err_percent = err * 100
    return round(err_percent, 2)


# Dev phase: counts come from train + validation, scoring uses the sampled test rows.
dev_train = concatenate_datasets([dev_pub_data[name] for name in ("train", "validation")])


def _mfr_dev(code):
    return mfrs(dev_train, dev_test, dev_pub_data['test'], code)


stats['mfr-dev'] = stats['lang'].apply(_mfr_dev)

# Full phase: same procedure against the complete test split.
full_train = concatenate_datasets([full_pub_data[name] for name in ("train", "validation")])


def _mfr_full(code):
    return mfrs(full_train, full_test, full_pub_data['test'], code)


stats['mfr-full'] = stats['lang'].apply(_mfr_full)

stats


Lang: da


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 83.85
Accuracy:           92.92
ERR:                56.17

Lang: de


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 83.01
Accuracy:           88.93
ERR:                34.87

Lang: en


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 91.58
Accuracy:           97.11
ERR:                65.65

Lang: es


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 92.60
Accuracy:           94.14
ERR:                20.82

Lang: hr


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 87.02
Accuracy:           92.53
ERR:                42.41

Lang: id


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 54.59
Accuracy:           81.86
ERR:                60.05

Lang: iden


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 85.26
Accuracy:           94.20
ERR:                60.69

Lang: it


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 96.20
Accuracy:           97.44
ERR:                32.50

Lang: ja


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 93.16
Accuracy:           93.56
ERR:                5.87

Lang: ko


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 93.68
Accuracy:           94.29
ERR:                9.62

Lang: nl


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 52.67
Accuracy:           74.85
ERR:                46.87

Lang: sl


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 85.42
Accuracy:           94.07
ERR:                59.36

Lang: sr


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 90.06
Accuracy:           94.37
ERR:                43.35

Lang: th


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 95.81
Accuracy:           97.53
ERR:                41.00

Lang: tr


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 65.36
Accuracy:           70.27
ERR:                14.18

Lang: trde


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 70.39
Accuracy:           78.54
ERR:                27.52

Lang: vi


Filter:   0%|          | 0/5972 [00:00<?, ? examples/s]

Baseline acc.(LAI): 83.91
Accuracy:           95.63
ERR:                72.85

Lang: da


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 91.59
Accuracy:           95.77
ERR:                49.68

Lang: de


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 82.53
Accuracy:           88.53
ERR:                34.35

Lang: en


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 92.10
Accuracy:           97.36
ERR:                66.57

Lang: es


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 92.75
Accuracy:           94.60
ERR:                25.57

Lang: hr


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 87.14
Accuracy:           92.48
ERR:                41.53

Lang: id


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 52.73
Accuracy:           80.97
ERR:                59.75

Lang: iden


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 86.67
Accuracy:           94.87
ERR:                61.51

Lang: it


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 94.94
Accuracy:           95.79
ERR:                16.83

Lang: ja


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 93.56
Accuracy:           93.97
ERR:                6.32

Lang: ko


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 91.96
Accuracy:           92.47
ERR:                6.35

Lang: nl


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 72.66
Accuracy:           83.43
ERR:                39.39

Lang: sl


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 85.27
Accuracy:           93.92
ERR:                58.70

Lang: sr


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 90.46
Accuracy:           94.77
ERR:                45.19

Lang: th


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 95.90
Accuracy:           97.65
ERR:                42.77

Lang: tr


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 63.88
Accuracy:           69.13
ERR:                14.53

Lang: trde


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 69.45
Accuracy:           76.20
ERR:                22.09

Lang: vi


Filter:   0%|          | 0/11956 [00:00<?, ? examples/s]

Baseline acc.(LAI): 83.79
Accuracy:           96.07
ERR:                75.77


Unnamed: 0,lang,train,validation,test,total,norm,languages,mfr-dev,mfr-full
0,da,16448,,3758,20206.0,9.09,Danish,56.17,49.68
1,de,15006,4860.0,5082,24948.0,17.39,German,34.87,34.35
2,en,35216,9169.0,29421,73806.0,7.62,English,65.65,66.57
3,es,7189,,6635,13824.0,7.48,Spanish,20.82,25.57
4,hr,54416,18941.0,15695,89052.0,8.16,Croatian,42.41,41.53
5,id,35502,4306.0,8908,48716.0,47.47,Indonesian,60.05,59.75
6,iden,13949,4809.0,4366,23124.0,13.93,Indonesian-English,60.69,61.51
7,it,12645,,1996,14641.0,7.01,Italian,32.5,16.83
8,ja,61903,10919.0,22594,95416.0,7.03,Japanese,5.87,6.32
9,ko,13130,1880.0,1567,16577.0,7.54,Korean,9.62,6.35


In [16]:
# Columns shown in the result tables.
features = ['languages', 'total', 'norm', 'mfr-full', 'mfr-dev']

# Rows for the languages listed in `newlangs`, ordered by display name.
is_new_lang = stats["lang"].isin(newlangs)
subset = stats.loc[is_new_lang]
subset.sort_values("languages")[features]

Unnamed: 0,languages,total,norm,mfr-full,mfr-dev
5,Indonesian,48716.0,47.47,59.75,60.05
8,Japanese,95416.0,7.03,6.32,5.87
9,Korean,16577.0,7.54,6.35,9.62
13,Thai,200915.0,3.99,42.77,41.0
16,Vietnamese,128685.0,15.98,75.77,72.85


In [17]:
# Rows for every language not listed in `newlangs`, ordered by display name.
other_langs = stats.loc[~stats["lang"].isin(newlangs)]
other_langs.sort_values("languages")[features]

Unnamed: 0,languages,total,norm,mfr-full,mfr-dev
4,Croatian,89052.0,8.16,41.53,42.41
0,Danish,20206.0,9.09,49.68,56.17
10,Dutch,21657.0,28.84,39.39,46.87
2,English,73806.0,7.62,66.57,65.65
1,German,24948.0,17.39,34.35,34.87
6,Indonesian-English,23124.0,13.93,61.51,60.69
7,Italian,14641.0,7.01,16.83,32.5
12,Serbian,91738.0,7.88,45.19,43.35
11,Slovenian,75276.0,14.93,58.7,59.36
3,Spanish,13824.0,7.48,25.57,20.82
