In [1]:
!pip install transformers torch pandas tqdm seqeval torchinfo

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=02f6381f0247fae799562d1f4a74d59de404c78c97b7720c789f0e36698fad10
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: torchinfo, seqeval
Successfully installed seqeval-1.2.2 torchinfo-1.8.0


In [3]:
# importing Modules
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, AdamW
from seqeval.metrics import classification_report
import numpy as np
from tqdm import tqdm
from typing import List, Tuple

# Constants
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_GRAD_NORM = 1.0

# Define label mappings
label2id = {'EN': 0, 'HI': 1, 'O': 2}
id2label = {v: k for k, v in label2id.items()}

In [4]:
print(label2id)
print(id2label)

{'EN': 0, 'HI': 1, 'O': 2}
{0: 'EN', 1: 'HI', 2: 'O'}


### Create Custom Dataset

In [5]:
class CodeSwitchingDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        word_labels = self.labels[item]

        # Tokenize the text
        encoding = self.tokenizer(
            text.split(),
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )

        # Align labels with tokens
        word_ids = encoding.word_ids()
        label_ids = [-100 if i is None else label2id[word_labels[i]] for i in word_ids]

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

### Create Training Loop

In [6]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    total_loss = 0

    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

        optimizer.step()
        scheduler.step()

    return total_loss / len(data_loader)

### Evaluation Function

In [7]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=2)

            predictions.extend([
                [id2label[p.item()] for (p, l) in zip(pred, label) if l != -100]
                for pred, label in zip(preds, labels)
            ])
            true_labels.extend([
                [id2label[l.item()] for l in label if l != -100]
                for label in labels
            ])

    return classification_report(true_labels, predictions)

### Function to Predict tags for each word

In [8]:
def predict_language_tags(model: DistilBertForTokenClassification,
                         tokenizer: DistilBertTokenizerFast,
                         text: str,
                         device: torch.device) -> List[Tuple[str, str]]:
    """
    Predict language tags for each word in the input text.
    Returns a list of (word, language) tuples.
    """
    # Tokenize the text
    words = text.split()
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=MAX_LEN
    )

    # Move inputs to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Get predictions
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=2)

    # Process predictions
    word_predictions = []
    predicted_labels = predictions[0].cpu().numpy()

    # Create a mapping from tokens to words
    token_to_word = encoding.word_ids()

    # Group predictions by word
    current_word_index = -1
    current_word_predictions = []

    for token_index, word_index in enumerate(token_to_word):
        if word_index is None:
            continue

        if word_index != current_word_index:
            if current_word_predictions:
                # Use majority vote for word prediction
                word_label = max(set(current_word_predictions), key=current_word_predictions.count)
                word_predictions.append((words[current_word_index], id2label[word_label]))
            current_word_predictions = []
            current_word_index = word_index

        current_word_predictions.append(predicted_labels[token_index])

    # Handle the last word
    if current_word_predictions:
        word_label = max(set(current_word_predictions), key=current_word_predictions.count)
        word_predictions.append((words[current_word_index], id2label[word_label]))

    return word_predictions


In [9]:
def visualize_predictions(word_predictions: List[Tuple[str, str]]) -> str:
    """
    Create a colored visualization of the predictions.
    Returns a string with color-coded predictions.
    """
    color_map = {
        'EN': '\033[94m',  # Blue for English
        'HI': '\033[92m',  # Green for Hindi
        'O': '\033[0m'     # Default color for Other
    }
    end_color = '\033[0m'

    result = []
    for word, lang in word_predictions:
        result.append(f"{color_map[lang]}{word}{end_color}")

    return ' '.join(result)

## Preprocessing and Data Loader

In [10]:
from torchinfo import summary
train_texts = ["This is a sample हिंदी English mixed text", "Another एक example"]
train_labels = [['EN', 'EN', 'EN', 'EN', 'HI', 'EN', 'EN', 'EN'],
                ['EN', 'HI', 'EN']]

print("Initializing tokenizer and model...")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased', clean_up_tokenization_spaces = True)
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)
summary(model)

Initializing tokenizer and model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


Layer (type:depth-idx)                                  Param #
DistilBertForTokenClassification                        --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              91,812,096
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           2,307
Total params: 134,736,387
Trainable params: 134,736,387
Non-trainable params: 0

In [11]:
# Create datasets and train the model
print("Preparing dataset...")
train_dataset = CodeSwitchingDataset(train_texts, train_labels, tokenizer, MAX_LEN)
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

Preparing dataset...


## Train the model

In [12]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_data_loader) * EPOCHS
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=total_steps)

# Training loop
print("Starting training...")
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler)
    print(f"Training loss: {train_loss}")



Starting training...
Epoch 1/3


Training: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]


Training loss: 1.243477702140808
Epoch 2/3


Training: 100%|██████████| 1/1 [00:00<00:00, 30.15it/s]


Training loss: 1.1572251319885254
Epoch 3/3


Training: 100%|██████████| 1/1 [00:00<00:00, 12.24it/s]

Training loss: 1.0473487377166748





## Saving the model

In [13]:
print("Saving model...")
model.save_pretrained('./code_switching_model')
tokenizer.save_pretrained('./code_switching_model')

Saving model...


('./code_switching_model/tokenizer_config.json',
 './code_switching_model/special_tokens_map.json',
 './code_switching_model/vocab.txt',
 './code_switching_model/added_tokens.json',
 './code_switching_model/tokenizer.json')

### Prediction

In [14]:
def load_model_and_predict(text: str, model_path: str = './code_switching_model'):
    """
    Load a saved model and make predictions on new text.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model and tokenizer
    model = DistilBertForTokenClassification.from_pretrained(model_path)
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

    model.to(device)

    # Make predictions
    predictions = predict_language_tags(model, tokenizer, text, device)

    # Print predictions
    print(f"Input text: {text}")
    print("Predictions:")
    for word, lang in predictions:
        print(f"{word}: {lang}")

    # Visualize predictions
    print("\nVisualized predictions:")
    print(visualize_predictions(predictions))

In [15]:
print("\nTesting saved model:")
test_text = "I love eating दाल चावल for lunch"
try:
    load_model_and_predict(test_text)
except Exception as e:
    print(f"Error loading saved model: {e}")
    print("This is expected if you're running this in a notebook environment")
    print("The saved model can be loaded in a separate script")


Testing saved model:
Input text: I love eating दाल चावल for lunch
Predictions:
I: EN
love: O
eating: EN
दाल: EN
चावल: O
for: O
lunch: EN

Visualized predictions:
[94mI[0m [0mlove[0m [94meating[0m [94mदाल[0m [0mचावल[0m [0mfor[0m [94mlunch[0m
