In [1]:

!pip install datasets transformers torch torchvision scikit-learn --quiet

import torch, random
import numpy as np
import pandas as pd
from datasets import load_dataset

wili = load_dataset("wili_2018")
train = wili["train"]
test  = wili["test"]

print(train[0])
print("Train size:", len(train), " Test size:", len(test))

langs = list(set(train['label']))[:10]
train_small = train.filter(lambda x: x['label'] in langs)
test_small  = test.filter(lambda x: x['label'] in langs)

print("Subset -> Train:", len(train_small), " Test:", len(test_small))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/45.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/117500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/117500 [00:00<?, ? examples/s]

{'sentence': 'Klement Gottwaldi surnukeha palsameeriti ning paigutati mausoleumi. Surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke. 1962. aastal viidi ta surnukeha mausoleumist ära ja kremeeriti. Zlíni linn kandis aastatel 1949–1989 nime Gottwaldov. Ukrainas Harkivi oblastis kandis Zmiivi linn aastatel 1976–1990 nime Gotvald.', 'label': 112}
Train size: 117500  Test size: 117500


Filter:   0%|          | 0/117500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/117500 [00:00<?, ? examples/s]

Subset -> Train: 5000  Test: 5000


In [2]:
!pip install --upgrade transformers




In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, y_train = train_small['sentence'], train_small['label']
X_test, y_test   = test_small['sentence'], test_small['label']

vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3,5), max_features=50000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.96      0.98       500
           1       0.99      0.98      0.99       500
           2       1.00      0.99      0.99       500
           3       1.00      0.99      0.99       500
           4       1.00      1.00      1.00       500
           5       0.97      0.99      0.98       500
           6       0.97      0.99      0.98       500
           7       0.99      0.99      0.99       500
           8       0.98      1.00      0.99       500
           9       0.98      0.99      0.99       500

    accuracy                           0.99      5000
   macro avg       0.99      0.99      0.99      5000
weighted avg       0.99      0.99      0.99      5000



In [4]:
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Encode labels
lang2id = {lang:i for i,lang in enumerate(langs)}
id2lang = {i:lang for lang,i in lang2id.items()}

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=50):
        self.texts = texts
        self.labels = [lang2id[l] for l in labels]
        self.tok = tokenizer
        self.max_len = max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        tokens = self.tok(self.texts[i])[:self.max_len]
        ids = torch.tensor(tokens + [0]*(self.max_len-len(tokens)))
        return ids, torch.tensor(self.labels[i])

# Simple char-level tokenizer
alphabet = list("abcdefghijklmnopqrstuvwxyz ")
ch2id = {ch:i+1 for i,ch in enumerate(alphabet)}
def tokenizer(text): return [ch2id.get(ch.lower(),0) for ch in text if ch.lower() in ch2id]

train_ds = TextDataset(train_small['sentence'], train_small['label'], tokenizer)
test_ds  = TextDataset(test_small['sentence'], test_small['label'], tokenizer)

train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl  = DataLoader(test_ds, batch_size=64)

# LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden, num_classes):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden, batch_first=True)
        self.fc = nn.Linear(hidden, num_classes)
    def forward(self, x):
        x = self.emb(x)
        _,(h,_) = self.lstm(x)
        return self.fc(h[-1])

device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTMClassifier(len(ch2id)+1, 64, 128, len(langs)).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training
for epoch in range(3):
    model.train()
    for X,y in train_dl:
        X,y = X.to(device), y.to(device)
        opt.zero_grad()
        out = model(X)
        loss = loss_fn(out,y)
        loss.backward(); opt.step()
    print(f"Epoch {epoch+1} done")

# Eval
model.eval(); correct=total=0
with torch.no_grad():
    for X,y in test_dl:
        X,y = X.to(device), y.to(device)
        pred = model(X).argmax(1)
        correct += (pred==y).sum().item()
        total += y.size(0)
print("LSTM Accuracy:", correct/total)


Epoch 1 done
Epoch 2 done
Epoch 3 done
LSTM Accuracy: 0.6072


In [9]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode_batch(batch):
    return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=64)

train_enc = train_small.map(encode_batch, batched=True)
test_enc  = test_small.map(encode_batch, batched=True)

train_enc.set_format(type="torch", columns=["input_ids","attention_mask","label"])
test_enc.set_format(type="torch", columns=["input_ids","attention_mask","label"])

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(langs))

args = TrainingArguments(
    do_eval=True,
    save_strategy="no",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    learning_rate=2e-5
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=test_enc
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.07590505480766296, 'eval_runtime': 1046.8704, 'eval_samples_per_second': 4.776, 'eval_steps_per_second': 0.299, 'epoch': 1.0}


In [16]:
label_names = train.features["label"].names
id2lang = {i: label_names[i] for i in range(len(label_names))}

chosen_langs = [id2lang[l] for l in langs]
print("Chosen 10 languages:", chosen_langs)


Chosen 10 languages: ['cdo', 'glk', 'jam', 'lug', 'san', 'rue', 'wol', 'new', 'mwl', 'bre']
