In [8]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import os

In [10]:
MODEL_NAME = "distilbert-base-multilingual-cased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["WANDB_DISABLED"] = "true"

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [12]:
!unzip -o /content/wili-2018.zip
print("Loading Language Dataset from local files...")

def stream_text_file(x_path, y_path, limit=None):
    X, Y = [], []
    with open(x_path, "r", encoding="utf-8") as fx, open(y_path, "r", encoding="utf-8") as fy:
        for i, (text, label) in enumerate(zip(fx, fy)):
            X.append(text.strip())
            Y.append(label.strip())
            if limit and i >= limit:
                break
    return X, Y


print("Loading dataset in streaming mode...")

X_train, Y_train = stream_text_file("x_train.txt", "y_train.txt", limit=30000)
X_test, Y_test = stream_text_file("x_test.txt", "y_test.txt", limit=5000)

print("Samples loaded:", len(X_train), len(X_test))

Archive:  /content/wili-2018.zip
  inflating: x_train.txt             
  inflating: y_train.txt             
  inflating: x_test.txt              
  inflating: y_test.txt              
  inflating: labels.csv              
  inflating: README.txt              
  inflating: urls.txt                
Loading Language Dataset from local files...
Loading dataset in streaming mode...
Samples loaded: 30001 5001


In [13]:
print("Loading language names from labels.csv...")
try:
    labels_meta = pd.read_csv('labels.csv', sep=';')
    CODE_TO_NAME = dict(zip(labels_meta['Label'], labels_meta['English']))
except Exception as e:
    print(f"Warning: Could not load labels.csv ({e}). Using raw codes.")
    CODE_TO_NAME = {}

Loading language names from labels.csv...


In [17]:
label_encoder = LabelEncoder()

df_train = pd.DataFrame({'text': X_train, 'labels': Y_train})
df_valid = pd.DataFrame({'text': X_test, 'labels': Y_test})

if not labels_meta.empty:
    all_possible_labels = labels_meta['Label'].unique()
    label_encoder.fit(all_possible_labels)
else:
    label_encoder.fit(pd.concat([df_train['labels'], df_valid['labels']]))

df_train = df_train[df_train['labels'] != 'nan'].copy()
df_valid = df_valid[df_valid['labels'] != 'nan'].copy()

df_train.dropna(subset=['labels'], inplace=True)
df_valid.dropna(subset=['labels'], inplace=True)

df_train['label_id'] = label_encoder.transform(df_train['labels'])
df_valid['label_id'] = label_encoder.transform(df_valid['labels'])

num_labels = len(label_encoder.classes_)
print(f"Total supported languages: {num_labels}")

# Initialize the model here, after num_labels is determined
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
model.to(device)

class LanguageDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = LanguageDataset(
    df_train['text'].to_numpy(),
    df_train['label_id'].to_numpy(),
    tokenizer
)

valid_dataset = LanguageDataset(
    df_valid['text'].to_numpy(),
    df_valid['label_id'].to_numpy(),
    tokenizer
)

Total supported languages: 235


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    learning_rate=3e-5,
    logging_steps=100,
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

print("Training model (FAST MODE)...")
trainer.train()
print("Training completed!")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training model (FAST MODE)...


Step,Training Loss
100,0.2465
200,0.1891
300,0.2225
400,0.3141
500,0.1812
600,0.1996
700,0.202
800,0.2117
900,0.2083
1000,0.25


Training completed!


In [28]:
def predict_language(text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    if 'token_type_ids' in inputs:
        inputs.pop('token_type_ids')
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)

    confidence, idx = torch.max(probs, dim=1)

    lang = label_encoder.inverse_transform([idx.item()])[0]
    return f"üåç Language: {lang} | Confidence: {confidence.item():.2f}"

In [39]:
print("Example of label encoder classes:")
print(label_encoder.classes_[:20])
print("Total:", len(label_encoder.classes_))

Example of label encoder classes:
['ace' 'afr' 'als' 'amh' 'ang' 'ara' 'arg' 'arz' 'asm' 'ast' 'ava' 'aym'
 'azb' 'aze' 'bak' 'bar' 'bcl' 'be-tarask' 'bel' 'ben']
Total: 235


In [40]:
index_to_iso = {i: code for i, code in enumerate(label_encoder.classes_)}

In [46]:
import csv

iso_to_lang = {}

with open("labels.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=";")

    for row in reader:
        if len(row) < 2:
            continue

        iso = row[0].strip()
        name = row[1].strip()

        if iso:
            iso_to_lang[iso] = name


In [47]:
for k in list(iso_to_lang.keys())[:20]:
    print(k, "‚Üí", iso_to_lang[k])


Label ‚Üí English
ace ‚Üí Achinese
afr ‚Üí Afrikaans
als ‚Üí Alemannic German
amh ‚Üí Amharic
ang ‚Üí Old English
ara ‚Üí Arabic
arg ‚Üí Aragonese
arz ‚Üí Egyptian Arabic
asm ‚Üí Assamese
ast ‚Üí Asturian
ava ‚Üí Avar
aym ‚Üí Aymara
azb ‚Üí South Azerbaijani
aze ‚Üí Azerbaijani
bak ‚Üí Bashkir
bar ‚Üí Bavarian
bcl ‚Üí Central Bikol
be-tarask ‚Üí Belarusian (Taraschkewiza)
bel ‚Üí Belarusian


In [54]:
def predict_language(text):
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1).flatten()
    ava_index = list(index_to_iso.keys())[list(index_to_iso.values()).index("ava")]
    probs[ava_index] = 0.0


    top3_idx = torch.topk(probs, 3).indices.tolist()
    top3_conf = torch.topk(probs, 3).values.tolist()

    result = "üîç Top 3 Predictions:\n"

    for i, idx in enumerate(top3_idx):
        iso = index_to_iso[idx]
        full_name = iso_to_lang.get(iso, iso)
        result += f"{i+1}. {full_name} ({iso}) ‚Äî {top3_conf[i]:.2f}\n"

    return result


In [55]:
print("\n===========================================")
print("Type any text to detect its language.")
print("Type 'quit' to exit.")
print("===========================================\n")

while True:
    user_input = input("Enter text: ")

    if user_input.lower().strip() == "quit":
        print("Exiting program...")
        break

    print(predict_language(user_input))
    print("-------------------------------------------")


Type any text to detect its language.
Type 'quit' to exit.

Enter text: Am fr√ºhen Morgen habe ich beschlossen einen Spaziergang zu machen, weil die frische Luft mir hilft, klarer zu denken und den Tag gut zu beginnen.
üîç Top 3 Predictions:
1. German (deu) ‚Äî 0.72
2. Pampanga (pam) ‚Äî 0.05
3. Alemannic German (als) ‚Äî 0.04

-------------------------------------------
Enter text: ‡§Ü‡§ú ‡§∏‡•Å‡§¨‡§π ‡§ú‡§¨ ‡§Æ‡•à‡§Ç ‡§™‡§æ‡§∞‡•ç‡§ï ‡§Æ‡•á‡§Ç ‡§ü‡§π‡§≤‡§®‡•á ‡§ó‡§Ø‡§æ ‡§§‡•ã ‡§Æ‡•å‡§∏‡§Æ ‡§¨‡§π‡•Å‡§§ ‡§∏‡•Å‡§π‡§æ‡§µ‡§®‡§æ ‡§•‡§æ ‡§î‡§∞ ‡§ï‡§à ‡§≤‡•ã‡§ó ‡§Ø‡•ã‡§ó ‡§î‡§∞ ‡§µ‡•ç‡§Ø‡§æ‡§Ø‡§æ‡§Æ ‡§ï‡§∞‡§§‡•á ‡§π‡•Å‡§è ‡§¶‡§ø‡§ñ‡§æ‡§à ‡§¶‡§ø‡§è‡•§
üîç Top 3 Predictions:
1. Hindi (hin) ‚Äî 0.97
2. Bhojpuri (bho) ‚Äî 0.02
3. Bengali (ben) ‚Äî 0.00

-------------------------------------------
Enter text: ‡∞à‡∞∞‡±ã‡∞ú‡±Å ‡∞Æ‡∞æ ‡∞á‡∞Ç‡∞ü‡±ç‡∞≤‡±ã ‡∞í‡∞ï ‡∞ö‡∞ø‡∞®‡±ç‡∞® ‡∞µ‡±á‡∞°‡±Å‡∞ï ‡∞ú‡∞∞‡∞ø‡∞ó‡∞ø‡∞Ç‡∞¶‡∞ø ‡∞Æ‡∞∞‡∞ø‡∞Ø‡±Å ‡∞Ö‡∞Ç‡∞¶‡∞∞‡∞Ç ‡∞ï‡∞≤‡∞ø‡∞∏‡∞ø ‡∞é‡∞Ç‡∞§‡±ã ‡∞Ü‡∞®‡∞Ç‡∞¶‡∞Ç‡∞ó‡∞æ