```
# Illustrative (sentence, tags) pairs: each entry couples a sentence with a
# space-separated string of labels (B-KP / I-KP / O).
# NOTE(review): not referenced by any code visible in this notebook; the cells
# below define their own `texts` / `bio_tags` lists with the shorter B / I / O labels.
sample_data = [
    ("Deep learning models have revolutionized computer vision tasks.",
     "B-KP I-KP I-KP O O B-KP I-KP O"),
    ("Neural networks process data efficiently.",
     "B-KP I-KP O B-KP O"),
    ("Machine learning algorithms improve performance metrics.",
     "B-KP I-KP I-KP O B-KP I-KP"),
    ("The transformer architecture enables efficient processing.",
     "O B-KP I-KP O O O"),
    ("Data science techniques analyze large datasets.",
     "B-KP I-KP I-KP O O O")
]
```

# BiLSTM

In [None]:
import torch
import torch.nn as nn
import spacy
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Load the spaCy pipeline once at module level; `nlp` is the shared tokenizer used by
# build_vocab, KeyPhraseDataset.__getitem__ and predict_keyphrases in this cell.
nlp = spacy.load('en_core_web_sm')

# Example data with BIO tags: one sentence per entry; `bio_tags[i]` labels the
# whitespace-separated words of `texts[i]` in order.
texts = [
    "Deep learning is revolutionizing artificial intelligence research",
    "Climate change poses significant environmental challenges",
    "Quantum computing could transform cryptography",
    "Renewable energy sources reduce carbon emissions",
    "Machine learning algorithms improve decision making",
    "Space exploration advances technological innovation",
    "Biotechnology developments impact medical research",
    "Digital transformation reshapes business operations",
    "Neural networks enable pattern recognition",
    "Sustainable development promotes environmental conservation"
]

# BIO tags for each token in the texts
# B: Beginning of keyphrase (B-KP)
# I: Inside of keyphrase (I-KP)
# O: Outside of keyphrase (O)
bio_tags = [
    ["B", "I", "O", "O", "B", "I", "O"],  # Deep learning, artificial intelligence
    ["B", "I", "O", "O", "B", "I"],  # Climate change, environmental challenges
    ["B", "I", "O", "O", "B"],  # Quantum computing, cryptography
    ["B", "I", "O", "O", "B", "I"],  # Renewable energy, carbon emissions
    # "decision" must open the second keyphrase with B; an I directly after O is
    # not a valid BIO sequence and would be dropped when phrases are decoded.
    ["B", "I", "O", "O", "B", "I"],  # Machine learning, decision making
    ["B", "I", "O", "B", "I"],  # Space exploration, technological innovation
    ["B", "O", "O", "B", "I"],  # Biotechnology, medical research
    ["B", "I", "O", "B", "I"],  # Digital transformation, business operations
    ["B", "I", "O", "B", "I"],  # Neural networks, pattern recognition
    ["B", "I", "O", "B", "I"]   # Sustainable development, environmental conservation
]

# Sanity check: every sentence has exactly one tag per whitespace-separated word.
assert len(texts) == len(bio_tags), "texts and bio_tags must have the same length"
assert all(len(sentence.split()) == len(sentence_tags)
           for sentence, sentence_tags in zip(texts, bio_tags)), "tag/word count mismatch"

# Create vocabularies
def build_vocab(texts):
    """Build a word -> id mapping from the lower-cased spaCy tokens of ``texts``.

    Ids 0 and 1 are reserved for '<PAD>' and '<UNK>'; every other token gets
    the next free id in first-seen order.
    """
    word_vocab = {'<PAD>': 0, '<UNK>': 1}
    lowered_tokens = (token.text for text in texts for token in nlp(text.lower()))
    for word in lowered_tokens:
        # setdefault leaves known words untouched; a new word gets the current size as its id
        word_vocab.setdefault(word, len(word_vocab))
    return word_vocab

def build_tag_vocab():
    """Return a fresh tag -> id mapping: B=0, I=1, O=2 and '<PAD>'=3."""
    tag_names = ('B', 'I', 'O', '<PAD>')
    return {name: position for position, name in enumerate(tag_names)}

# Lookup tables shared by the dataset, the collate function and inference
word_vocab = build_vocab(texts)
tag_vocab = build_tag_vocab()
# id -> tag, used to decode predicted indices back into BIO labels
tag_vocab_reverse = dict(zip(tag_vocab.values(), tag_vocab.keys()))

# Dataset class
class KeyPhraseDataset(Dataset):
    """Token-level keyphrase tagging dataset backed by in-memory lists.

    Args:
        texts: list of raw sentences.
        bio_tags: list of tag lists; ``bio_tags[i]`` must hold one tag per
            spaCy token of ``texts[i].lower()``.
        word_vocab: token -> id mapping containing an '<UNK>' entry.
        tag_vocab: tag -> id mapping.
    """

    def __init__(self, texts, bio_tags, word_vocab, tag_vocab):
        self.texts = texts
        self.bio_tags = bio_tags
        self.word_vocab = word_vocab
        self.tag_vocab = tag_vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, tag_ids, length)`` for example ``idx``.

        Both tensors are int64 and have the same length.

        Raises:
            ValueError: if the number of tags differs from the number of tokens.
            KeyError: if a tag is missing from ``tag_vocab``.
        """
        text = self.texts[idx]
        doc = nlp(text.lower())

        # Convert text to indices (unknown tokens fall back to <UNK>)
        unk_id = self.word_vocab['<UNK>']
        indices = [self.word_vocab.get(token.text, unk_id) for token in doc]

        # Convert BIO tags to indices
        tag_indices = [self.tag_vocab[tag] for tag in self.bio_tags[idx]]

        # A mismatch can be hidden by batch padding and then trains the model on
        # shifted labels, so fail loudly here instead.
        if len(indices) != len(tag_indices):
            raise ValueError(
                f"Example {idx}: {len(indices)} tokens but {len(tag_indices)} tags"
            )

        # Explicit dtype: torch.tensor([]) would otherwise default to float32,
        # which nn.Embedding rejects.
        return (torch.tensor(indices, dtype=torch.long),
                torch.tensor(tag_indices, dtype=torch.long),
                len(indices))

# BiLSTM model
class KeyPhraseBiLSTM(nn.Module):
    """Embedding -> bidirectional LSTM -> linear layer producing per-token tag logits.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_tags: number of output classes per token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super(KeyPhraseBiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                           bidirectional=True, batch_first=True)
        # Forward and backward hidden states are concatenated, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, num_tags)

    def forward(self, x, lengths):
        """Compute tag logits for a padded batch.

        Args:
            x: (batch, seq_len) tensor of token ids, right-padded.
            lengths: true length of each sequence (tensor or list of ints).

        Returns:
            (batch, seq_len, num_tags) logits. The time dimension always equals
            ``x.size(1)`` so the output lines up with labels padded to that length.
        """
        embedded = self.embedding(x)

        # pack_padded_sequence requires the lengths to live on the CPU, even
        # when the inputs are on an accelerator.
        cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()

        # Pack padded sequence so the LSTM skips padding positions
        packed = pack_padded_sequence(embedded, cpu_lengths,
                                      batch_first=True, enforce_sorted=False)

        # LSTM forward pass
        lstm_out, _ = self.lstm(packed)

        # Unpack back to the input's padded length; without total_length the
        # result is only as long as the longest sequence in `lengths`.
        unpacked, _ = pad_packed_sequence(lstm_out, batch_first=True,
                                          total_length=x.size(1))

        # Linear layer
        return self.fc(unpacked)

# Collate function for DataLoader
def collate_fn(batch):
    """Merge ``(token_ids, tag_ids, length)`` samples into padded batch tensors.

    Token ids are right-padded with 0 and tag ids with the '<PAD>' tag id.
    Returns ``(padded_tokens, padded_tags, lengths_tensor)``.
    """
    token_seqs, tag_seqs, seq_lengths = zip(*batch)

    return (
        pad_sequence(token_seqs, batch_first=True),
        pad_sequence(tag_seqs, batch_first=True, padding_value=tag_vocab['<PAD>']),
        torch.tensor(seq_lengths),
    )

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# (and therefore the printed loss curve) are reproducible on a fresh kernel.
torch.manual_seed(42)

# Create dataset and dataloader
dataset = KeyPhraseDataset(texts, bio_tags, word_vocab, tag_vocab)
dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)

# Initialize model and optimizer
model = KeyPhraseBiLSTM(len(word_vocab), embedding_dim=100,
                        hidden_dim=64, num_tags=len(tag_vocab))
optimizer = torch.optim.Adam(model.parameters())
# Padded label positions carry the <PAD> id and are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=tag_vocab['<PAD>'])

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0
    model.train()

    for batch_texts, batch_tags, batch_lengths in dataloader:
        optimizer.zero_grad()

        # Forward pass
        logits = model(batch_texts, batch_lengths)

        # Reshape for loss calculation: flatten (batch, seq) into one axis so
        # each token is one classification example
        logits = logits.view(-1, len(tag_vocab))
        batch_tags = batch_tags.view(-1)

        # Calculate loss
        loss = criterion(logits, batch_tags)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

# Example inference
def predict_keyphrases(text, model, word_vocab, tag_vocab_reverse):
    """Tag ``text`` with ``model`` and return the decoded keyphrases.

    Args:
        text: raw input sentence; it is lower-cased before tokenization.
        model: callable taking ``(token_ids, lengths)`` and returning
            ``(1, seq_len, num_tags)`` logits. It is switched to eval mode.
        word_vocab: token -> id mapping containing an '<UNK>' entry.
        tag_vocab_reverse: id -> tag mapping.

    Returns:
        List of keyphrases (space-joined lower-cased tokens) in order of
        appearance. Empty input yields an empty list.
    """
    model.eval()
    doc = nlp(text.lower())
    tokens = [token.text for token in doc]
    if not tokens:
        # Nothing to tag; a zero-length sequence cannot be packed for the LSTM
        return []

    indices = [word_vocab.get(token, word_vocab['<UNK>']) for token in tokens]

    with torch.no_grad():
        input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0)
        lengths = torch.tensor([len(indices)])
        logits = model(input_tensor, lengths)
        predictions = torch.argmax(logits, dim=-1)[0]

    # Convert predictions to BIO tags
    predicted_tags = [tag_vocab_reverse[pred] for pred in predictions.tolist()]

    # Extract keyphrases based on BIO tags
    keyphrases = []
    current_phrase = []

    for token, tag in zip(tokens, predicted_tags):
        if tag == 'B':
            # A new phrase starts; flush the one in progress, if any
            if current_phrase:
                keyphrases.append(" ".join(current_phrase))
            current_phrase = [token]
        elif tag == 'I' and current_phrase:
            current_phrase.append(token)
        else:
            # 'O', a stray 'I' with no open phrase, or any other tag (the model
            # can also emit '<PAD>') ends the current phrase, so a later 'I'
            # cannot glue non-adjacent tokens together.
            if current_phrase:
                keyphrases.append(" ".join(current_phrase))
            current_phrase = []

    if current_phrase:
        keyphrases.append(" ".join(current_phrase))

    return keyphrases

# Test prediction: run the freshly trained BiLSTM on a sentence that is not in `texts`.
# Words that never occurred in the training texts are looked up as <UNK> by predict_keyphrases.
test_text = "what is the purpose of CNN in using deep learning"
predicted_keyphrases = predict_keyphrases(test_text, model, word_vocab, tag_vocab_reverse)
print(f"\nTest text: {test_text}")
print(f"Predicted keyphrases: {predicted_keyphrases}")

Epoch 1/10, Loss: 1.3739
Epoch 2/10, Loss: 1.2788
Epoch 3/10, Loss: 1.1981
Epoch 4/10, Loss: 1.1170
Epoch 5/10, Loss: 1.0408
Epoch 6/10, Loss: 0.9510
Epoch 7/10, Loss: 0.8633
Epoch 8/10, Loss: 0.7726
Epoch 9/10, Loss: 0.6741
Epoch 10/10, Loss: 0.5776

Test text: what is the purpose of CNN in using deep learning
Predicted keyphrases: ['what', 'the', 'purpose', 'of', 'cnn', 'in', 'using', 'deep learning']


# bert-base-uncased

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import numpy as np
from typing import List, Tuple

# Initialize the BERT tokenizer for the same 'bert-base-uncased' checkpoint the model loads.
# This module-level instance is shared by the dataset, collate_fn (pad_token_id) and predict_keyphrases.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example data with BIO tags: one sentence per entry; `bio_tags[i]` labels the
# whitespace-separated words of `texts[i]` in order.
texts = [
    "Deep learning is revolutionizing artificial intelligence research",
    "Climate change poses significant environmental challenges",
    "Quantum computing could transform cryptography",
    "Renewable energy sources reduce carbon emissions",
    "Machine learning algorithms improve decision making",
    "Space exploration advances technological innovation",
    "Biotechnology developments impact medical research",
    "Digital transformation reshapes business operations",
    "Neural networks enable pattern recognition",
    "Sustainable development promotes environmental conservation"
]

# BIO tags for each token in the texts
bio_tags = [
    ["B", "I", "O", "O", "B", "I", "O"],  # Deep learning, artificial intelligence
    ["B", "I", "O", "O", "B", "I"],  # Climate change, environmental challenges
    ["B", "I", "O", "O", "B"],  # Quantum computing, cryptography
    ["B", "I", "O", "O", "B", "I"],  # Renewable energy, carbon emissions
    # "decision" must open the second keyphrase with B; an I directly after O is
    # not a valid BIO sequence and would be dropped when phrases are decoded.
    ["B", "I", "O", "O", "B", "I"],  # Machine learning, decision making
    ["B", "I", "O", "B", "I"],  # Space exploration, technological innovation
    ["B", "O", "O", "B", "I"],  # Biotechnology, medical research
    ["B", "I", "O", "B", "I"],  # Digital transformation, business operations
    ["B", "I", "O", "B", "I"],  # Neural networks, pattern recognition
    ["B", "I", "O", "B", "I"]   # Sustainable development, environmental conservation
]

# Sanity check: every sentence has exactly one tag per whitespace-separated word.
assert len(texts) == len(bio_tags), "texts and bio_tags must have the same length"
assert all(len(sentence.split()) == len(sentence_tags)
           for sentence, sentence_tags in zip(texts, bio_tags)), "tag/word count mismatch"

# Create tag vocabulary ('PAD' marks padded label positions and is ignored by the loss)
tag2idx = {'B': 0, 'I': 1, 'O': 2, 'PAD': 3}
idx2tag = {v: k for k, v in tag2idx.items()}

class KeyPhraseDataset(Dataset):
    """Keyphrase tagging dataset that aligns word-level BIO tags with BERT tokens.

    Args:
        texts: raw sentences.
        bio_tags: one list of 'B' / 'I' / 'O' tags per sentence, one tag per word.
        tokenizer: BERT tokenizer used to encode each sentence.
        tag2idx: tag -> id mapping; must contain 'O'.
    """

    def __init__(self, texts: List[str], bio_tags: List[List[str]], tokenizer, tag2idx: dict):
        self.texts = texts
        self.bio_tags = bio_tags
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns a dict of 1-D tensors of equal length: 'input_ids',
        'attention_mask', 'token_type_ids' (all zeros) and 'labels'.
        """
        text = self.texts[idx]
        tags = self.bio_tags[idx]

        # Tokenize text
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )

        # Get input_ids and attention_mask (drop the batch dimension of size 1)
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Create token type IDs (single-segment input, so all zeros)
        token_type_ids = torch.zeros_like(input_ids)

        # Align BIO tags with BERT tokens
        bert_tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        aligned_labels = self.align_labels_with_tokens(tags, bert_tokens)

        # Convert tags to tensor (unknown labels fall back to 'O')
        label_ids = torch.tensor([self.tag2idx.get(label, self.tag2idx['O'])
                                for label in aligned_labels])

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': label_ids
        }

    def align_labels_with_tokens(self, tags: List[str], bert_tokens: List[str]) -> List[str]:
        """Expand word-level ``tags`` to one label per entry of ``bert_tokens``.

        The first and last tokens (expected to be [CLS] and [SEP]) get 'O'.
        Each token that does not start with '##' consumes the next word tag
        ('O' once the tags run out). A '##' continuation piece inherits the tag
        of its word, except that 'B' becomes 'I', so a keyphrase start is marked
        only on the first piece of a word. The result has one label per token
        whenever at least [CLS] and [SEP] are present.

        NOTE(review): this assumes a word is only ever split into '##' pieces.
        If the tokenizer splits a word into several non-'##' tokens (e.g. around
        punctuation), the following labels shift - confirm for real data.
        """
        aligned_labels = ['O']  # for [CLS]
        next_word_idx = 0
        current_word_label = 'O'  # label of the word the latest pieces belong to

        for bert_token in bert_tokens[1:-1]:  # Skip [CLS] and [SEP]
            if bert_token.startswith('##'):
                # Continuation piece: stay inside the word's span without opening a new one
                label = 'I' if current_word_label == 'B' else current_word_label
            else:
                if next_word_idx < len(tags):
                    current_word_label = tags[next_word_idx]
                    next_word_idx += 1
                else:
                    current_word_label = 'O'
                label = current_word_label
            aligned_labels.append(label)

        aligned_labels.append('O')  # for [SEP]
        return aligned_labels

class BERTForKeyPhraseExtraction(nn.Module):
    """Pretrained 'bert-base-uncased' encoder with dropout and a per-token linear classifier."""

    def __init__(self, num_labels: int):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        encoder_width = self.bert.config.hidden_size
        self.classifier = nn.Linear(encoder_width, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier logits computed from the encoder's first output."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The first element of the encoder output feeds the tagging head
        token_states = self.dropout(encoder_outputs[0])
        return self.classifier(token_states)

def collate_fn(batch):
    """Right-pad every field of the samples to the longest sequence and stack them.

    'input_ids' are padded with the tokenizer's pad token id, 'attention_mask'
    and 'token_type_ids' with 0, and 'labels' with the 'PAD' tag id.
    Returns a dict of (batch, max_len) tensors under the same four keys.
    """
    max_len = max(len(item['input_ids']) for item in batch)

    def pad_and_stack(field, fill):
        # Pad one field of every sample on the right, then stack into a batch tensor
        rows = []
        for item in batch:
            seq = item[field]
            rows.append(torch.nn.functional.pad(seq, (0, max_len - len(seq)), value=fill))
        return torch.stack(rows)

    return {
        'input_ids': pad_and_stack('input_ids', tokenizer.pad_token_id),
        'attention_mask': pad_and_stack('attention_mask', 0),
        'token_type_ids': pad_and_stack('token_type_ids', 0),
        'labels': pad_and_stack('labels', tag2idx['PAD']),
    }

# Seed PyTorch's global RNG so the classifier initialisation, dropout masks and
# DataLoader shuffling (and therefore the printed loss curve) are reproducible
# on a fresh kernel.
torch.manual_seed(42)

# Create dataset and dataloader
dataset = KeyPhraseDataset(texts, bio_tags, tokenizer, tag2idx)
dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)

# Initialize model, optimizer, and loss function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTForKeyPhraseExtraction(num_labels=len(tag2idx))
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Padded label positions carry the 'PAD' id and are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx['PAD'])

# Training loop
num_epochs = 25

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, token_type_ids)

        # Calculate loss over flattened (batch * seq_len) token positions
        loss = criterion(logits.view(-1, len(tag2idx)), labels.view(-1))

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

Epoch 1/25, Loss: 1.2703
Epoch 2/25, Loss: 0.8663
Epoch 3/25, Loss: 0.6358
Epoch 4/25, Loss: 0.4443
Epoch 5/25, Loss: 0.3155
Epoch 6/25, Loss: 0.1900
Epoch 7/25, Loss: 0.1217
Epoch 8/25, Loss: 0.0868
Epoch 9/25, Loss: 0.0534
Epoch 10/25, Loss: 0.0414
Epoch 11/25, Loss: 0.0302
Epoch 12/25, Loss: 0.0239
Epoch 13/25, Loss: 0.0172
Epoch 14/25, Loss: 0.0132
Epoch 15/25, Loss: 0.0128
Epoch 16/25, Loss: 0.0093
Epoch 17/25, Loss: 0.0085
Epoch 18/25, Loss: 0.0074
Epoch 19/25, Loss: 0.0077
Epoch 20/25, Loss: 0.0067
Epoch 21/25, Loss: 0.0057
Epoch 22/25, Loss: 0.0053
Epoch 23/25, Loss: 0.0054
Epoch 24/25, Loss: 0.0052
Epoch 25/25, Loss: 0.0044


In [None]:
def predict_keyphrases(text: str, model, tokenizer, tag2idx, idx2tag):
    """Tag ``text`` with ``model`` and return the decoded keyphrases.

    Args:
        text: raw input sentence.
        model: module taking ``(input_ids, attention_mask, token_type_ids)`` and
            returning ``(1, seq_len, num_tags)`` logits. It is switched to eval mode.
        tokenizer: BERT tokenizer used to encode ``text``.
        tag2idx: tag -> id mapping (unused here; kept for call compatibility).
        idx2tag: id -> tag mapping used to decode predictions.

    Returns:
        List of keyphrases in order of appearance. WordPiece fragments are
        merged back into whole words, and words are joined with single spaces.
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a global
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Tokenize input text
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)
    token_type_ids = torch.zeros_like(input_ids)

    with torch.no_grad():
        logits = model(input_ids, attention_mask, token_type_ids)
        predictions = torch.argmax(logits, dim=-1)[0]

    # Convert predictions to tags
    predicted_tags = [idx2tag[pred] for pred in predictions.tolist()]

    # Get original tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Skip [CLS] and [SEP], rebuild whole words, then decode BIO spans over words
    words, word_tags = _merge_wordpieces(tokens[1:-1], predicted_tags[1:-1])
    return _collect_bio_phrases(words, word_tags)


def _merge_wordpieces(tokens, tags):
    """Glue '##' continuation pieces onto their word; each word keeps its first piece's tag."""
    words = []
    word_tags = []
    for token, tag in zip(tokens, tags):
        is_continuation = token.startswith('##')
        piece = token[2:] if is_continuation else token
        if is_continuation and words:
            # The tag predicted for a continuation piece is ignored, so a word is never split
            words[-1] += piece
        else:
            words.append(piece)
            word_tags.append(tag)
    return words, word_tags


def _collect_bio_phrases(words, tags):
    """Group words into phrases: 'B' opens, 'I' extends an open phrase, anything else closes."""
    phrases = []
    current = []
    for word, tag in zip(words, tags):
        if tag == 'B':
            if current:
                phrases.append(" ".join(current))
            current = [word]
        elif tag == 'I' and current:
            current.append(word)
        else:
            # 'O', a stray 'I', or a predicted 'PAD' all end the phrase in progress
            if current:
                phrases.append(" ".join(current))
            current = []
    if current:
        phrases.append(" ".join(current))
    return phrases

# Test prediction: run the fine-tuned BERT tagger on a sentence that is not in `texts`.
test_text = "Artificial intelligence applications transform modern technology"
predicted_keyphrases = predict_keyphrases(test_text, model, tokenizer, tag2idx, idx2tag)
print(f"\nTest text: {test_text}")
print(f"Predicted keyphrases: {predicted_keyphrases}")


Test text: Artificial intelligence applications transform modern technology
Predicted keyphrases: ['artificial intelligence', 'modern technology']


# Test

In [None]:
# Same question-style sentence that was fed to the BiLSTM above, so the two models'
# recorded outputs can be compared.
test_text = "what is the purpose of CNN in using deep learning"
predicted_keyphrases = predict_keyphrases(test_text, model, tokenizer, tag2idx, idx2tag)
print(f"\nTest text: {test_text}")
print(f"Predicted keyphrases: {predicted_keyphrases}")


Test text: what is the purpose of CNN in using deep learning
Predicted keyphrases: ['cnn', 'deep learning']


In [None]:
# Longer sentence with a three-word candidate phrase ("large language model").
# NOTE(review): in the recorded run below, 'large' is split from 'language model'.
test_text = "I have used large language model for the question answering task"
predicted_keyphrases = predict_keyphrases(test_text, model, tokenizer, tag2idx, idx2tag)
print(f"\nTest text: {test_text}")
print(f"Predicted keyphrases: {predicted_keyphrases}")


Test text: I have used large language model for the question answering task
Predicted keyphrases: ['large', 'language model', 'question answering']
