In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import numpy as np
import stanza

In [15]:
def extract_text(path):
    with open(path, "r") as f:
        train = f.read().split("\n\n")
        train = [x.split("\n") for x in train]

    sentences = []
    labels = []

    for i in range(len(train)):
        sentence = train[i][0].split(" ")
        label = train[i][1].split(" ")
        if len(sentence) == len(label):
            sentences.append(sentence)
            labels.append(label)
    
    return sentences, labels

In [16]:
# Initialize Stanza NLP pipeline
nlp = stanza.Pipeline('id', processors='tokenize,pos')

# Data Sample
sentences, labels = extract_text("TAGGED REVISI DIKIT.txt")


# Create Vocabulary for words, POS tags, and labels
word2idx = defaultdict(lambda: len(word2idx))
pos2idx = defaultdict(lambda: len(pos2idx))
label2idx = defaultdict(lambda: len(label2idx))

# Add special tokens
word2idx["<PAD>"] = 0
pos2idx["<PAD>"] = 0
label2idx["O"] = 0

2024-11-27 23:12:53 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 12.5MB/s]                    
2024-11-27 23:12:53 INFO: Downloaded file to C:\Users\Anand\stanza_resources\resources.json
2024-11-27 23:12:54 INFO: Loading these models for language: id (Indonesian):
| Processor | Package    |
--------------------------
| tokenize  | gsd        |
| mwt       | gsd        |
| pos       | gsd_charlm |

2024-11-27 23:12:54 INFO: Using device: cpu
2024-11-27 23:12:54 INFO: Loading: tokenize
2024-11-27 23:12:54 INFO: Loading: mwt
2024-11-27 23:12:54 INFO: Loading: pos
2024-11-27 23:12:54 INFO: Done loading processors!


In [17]:

# Preprocess training data for initial vocabulary
def preprocess_training_data(sentences, labels):
    processed_data = []
    for sentence, sent_labels in zip(sentences, labels):
        # Use Stanza for POS tagging
        doc = nlp(" ".join(sentence))
        
        processed_words = []
        processed_pos = []
        processed_labels = []
        
        for sent in doc.sentences:
            for word in sent.words:
                processed_words.append(word.text)
                processed_pos.append(word.upos)
        
        # Match labels to processed words (assuming same order)
        processed_labels = sent_labels[:len(processed_words)]
        
        processed_data.append((processed_words, processed_pos, processed_labels))
        
        # Update vocabularies
        for word in processed_words:
            word2idx[word]
        for pos in processed_pos:
            pos2idx[pos]
        for label in processed_labels:
            label2idx[label]
    
    return processed_data

# Preprocess the initial training data
processed_training_data = preprocess_training_data(sentences, labels)

In [18]:
# Convert words, POS tags, and labels to indices
input_words = [[word2idx[word] for word in sentence[0]] for sentence in processed_training_data]
input_pos = [[pos2idx[pos] for pos in sentence[1]] for sentence in processed_training_data]
label_data = [[label2idx[label] for label in sentence[2]] for sentence in processed_training_data]

# Dynamic Padding: Use the length of the longest sentence in the data
MAX_LEN = max([len(sentence) for sentence in input_words])

# Padding the sequences
input_words = [sentence + [0]*(MAX_LEN - len(sentence)) for sentence in input_words]
input_pos = [pos + [0]*(MAX_LEN - len(pos)) for pos in input_pos]
label_data = [label + [0]*(MAX_LEN - len(label)) for label in label_data]

In [19]:
# Dataset Class
class NERDataset(Dataset):
    def __init__(self, words, pos, labels):
        self.words = words
        self.pos = pos
        self.labels = labels

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        return self.words[idx], self.pos[idx], self.labels[idx]
    
# Sorting by length (this is necessary for pack_padded_sequence)
def collate_fn(batch):
    words, pos, labels = zip(*batch)
    
    # Sorting by length of the words (descending order)
    lengths = torch.tensor([len(w) for w in words])
    sorted_idx = torch.argsort(lengths, descending=True)
    
    words = [words[i] for i in sorted_idx]
    pos = [pos[i] for i in sorted_idx]
    labels = [labels[i] for i in sorted_idx]
    lengths = lengths[sorted_idx]

    # Padding the sequences
    words_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(w) for w in words], batch_first=True, padding_value=0)
    pos_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(p) for p in pos], batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(l) for l in labels], batch_first=True, padding_value=0)

    return words_padded, pos_padded, labels_padded, lengths

# Create dataset and dataloader
dataset = NERDataset(input_words, input_pos, label_data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

In [20]:
# Enhanced NER Model with POS tag input
class NERModel(nn.Module):
    def __init__(self, vocab_size, pos_size, tagset_size, 
                 embedding_dim=50, pos_embedding_dim=20, hidden_dim=100, dropout=0.3):
        super(NERModel, self).__init__()
        # Embedding layers for words and POS tags
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(pos_size, pos_embedding_dim)
        
        # Combine word and POS embeddings
        combined_dim = embedding_dim + pos_embedding_dim
        
        # LSTM layer with dropout
        self.lstm = nn.LSTM(combined_dim, hidden_dim, batch_first=True, dropout=dropout)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, words, pos):
        # Embed words and POS tags
        word_emb = self.word_embedding(words)
        pos_emb = self.pos_embedding(pos)
        
        # Concatenate word and POS embeddings
        combined_emb = torch.cat((word_emb, pos_emb), dim=2)
        
        # LSTM and classification
        lstm_out, _ = self.lstm(combined_emb)
        output = self.fc(lstm_out)
        return output

In [21]:
# Set model parameters
vocab_size = len(word2idx)
pos_size = len(pos2idx)
tagset_size = len(label2idx)

# Create model
model = NERModel(vocab_size, pos_size, tagset_size)

# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding in loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [22]:
# Training loop
for epoch in range(100):  # for simplicity, we use 10 epochs
    model.train()  # Set model to training mode
    total_loss = 0
    for words, pos, labels, lengths in dataloader:
        # Forward pass
        outputs = model(words, pos)

        # Flatten the outputs and labels for the loss function
        outputs = outputs.view(-1, tagset_size)
        labels = labels.view(-1)

        # Compute loss, for all tokens (including padding)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')


Epoch 1, Loss: 1.8726191520690918
Epoch 2, Loss: 1.8213127851486206
Epoch 3, Loss: 1.7707313299179077
Epoch 4, Loss: 1.7205090522766113
Epoch 5, Loss: 1.6703182458877563
Epoch 6, Loss: 1.6198782920837402
Epoch 7, Loss: 1.5689777135849
Epoch 8, Loss: 1.5174819231033325
Epoch 9, Loss: 1.4653443098068237
Epoch 10, Loss: 1.4126147031784058
Epoch 11, Loss: 1.3594303131103516
Epoch 12, Loss: 1.3059930801391602
Epoch 13, Loss: 1.2525259256362915
Epoch 14, Loss: 1.1992220878601074
Epoch 15, Loss: 1.1461960077285767
Epoch 16, Loss: 1.0934590101242065
Epoch 17, Loss: 1.0409386157989502
Epoch 18, Loss: 0.9885411858558655
Epoch 19, Loss: 0.9362373948097229
Epoch 20, Loss: 0.8841332793235779
Epoch 21, Loss: 0.8324933648109436
Epoch 22, Loss: 0.781702995300293
Epoch 23, Loss: 0.7321908473968506
Epoch 24, Loss: 0.6843464374542236
Epoch 25, Loss: 0.6384745240211487
Epoch 26, Loss: 0.5947960019111633
Epoch 27, Loss: 0.5534727573394775
Epoch 28, Loss: 0.5146258473396301
Epoch 29, Loss: 0.478323161602020

In [23]:
# Predict function
def predict(sentence):
    # Use Stanza for POS tagging
    doc = nlp(" ".join(sentence))
    
    # Extract words and POS tags
    processed_words = []
    processed_pos = []
    
    for sent in doc.sentences:
        for word in sent.words:
            processed_words.append(word.text)
            processed_pos.append(word.upos)
    
    # Convert to indices, handling out-of-vocabulary words
    input_words = [word2idx.get(word, word2idx["<PAD>"]) for word in processed_words]
    input_pos = [pos2idx.get(pos, pos2idx["<PAD>"]) for pos in processed_pos]

    print(input_pos)
    
    # Pad sequences
    input_words = input_words + [0] * (MAX_LEN - len(input_words))
    input_pos = input_pos + [0] * (MAX_LEN - len(input_pos))
    
    # Convert to tensors
    input_words = torch.tensor([input_words]).long()
    input_pos = torch.tensor([input_pos]).long()

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_words, input_pos)
        _, predicted = torch.max(outputs, dim=2)

    print(outputs)
    
    # Convert indices back to labels
    predicted_labels = [list(label2idx.keys())[i] for i in predicted[0]]

    # Truncate to original sentence length
    predicted_labels = predicted_labels[:len(processed_words)]

    return predicted_labels

In [24]:
# Test the model
test_sentence = ["Maka", "Les", "Parisiens", "dituntut", "bangkit", "meski", "punya", "catatan", "negatif", "saat", "bertemu", "Bayern", "Munich"]
predictions = predict(test_sentence)
print("Predicted Labels:", predictions)

# Print out vocabularies for reference
print("\nWord to Index:", dict(word2idx))
print("\nPOS to Index:", dict(pos2idx))
print("\nLabel to Index:", dict(label2idx))

[7, 2, 2, 3, 3, 7, 3, 8, 6, 7, 3, 2, 2]
tensor([[[-0.9895, -0.3388, -0.2981,  1.4286,  0.9781, -0.5685, -0.5134],
         [-2.1508, -1.4610, -0.6643,  5.4295,  0.6401, -1.6415, -1.2331],
         [-2.0208, -2.5483, -1.4885,  1.2121,  6.4200, -0.6240, -1.0807],
         [-1.9078, -2.5131, -2.0502,  0.9658,  5.2353,  0.8700, -0.9629],
         [-1.8682, -1.8244, -2.0295,  1.3754,  3.4175,  1.8726, -0.4297],
         [-2.2606, -1.3152, -1.6946,  2.3939,  1.8620,  1.6803,  0.0916],
         [-2.1008, -1.2035, -1.7506,  3.2557,  1.0498,  1.0543,  0.5209],
         [-2.0609, -0.3882, -1.9209,  3.5733,  0.4202,  0.6645,  0.3359],
         [-1.9322, -0.5706, -1.6758,  3.9669, -0.0149,  0.0972,  0.2398],
         [-2.2716, -0.7673, -1.9419,  4.7202,  0.3696, -0.4383, -0.1416],
         [-2.2754, -0.6196, -1.6878,  5.0830,  0.8944, -1.1121, -0.7300],
         [-2.5820, -1.6361, -1.2808,  6.5665,  0.9111, -2.2195, -1.2715],
         [-2.0463, -2.5599, -1.9928,  1.2388,  6.8753, -0.5562, -0.9090]

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import stanza

def extract_text(path):
    with open(path, "r") as f:
        train = f.read().split("\n\n")
        train = [x.split("\n") for x in train]

    sentences = []
    labels = []

    for i in range(len(train)):
        print(train[i])
        sentence = train[i][0].split(" ")
        label = train[i][1].split(" ")
        if len(sentence) == len(label):
            sentences.append(sentence)
            labels.append(label)
    
    return sentences, labels

# Initialize Stanza NLP pipeline
nlp = stanza.Pipeline('id', processors='tokenize,pos')

# Data Sample
sentences, labels = extract_text("TAGGED REVISI DIKIT.txt")
print(sentences)

# Create Vocabulary for POS tags and labels
pos2idx = defaultdict(lambda: len(pos2idx))
label2idx = defaultdict(lambda: len(label2idx))

# Add special tokens
pos2idx["<PAD>"] = 0
label2idx["O"] = 0

# Preprocess training data for initial vocabulary
def preprocess_training_data(sentences, labels):
    processed_data = []
    for sentence, sent_labels in zip(sentences, labels):
        # Use Stanza for POS tagging
        doc = nlp(" ".join(sentence))
        
        processed_pos = []
        processed_labels = []
        
        for sent in doc.sentences:
            for word in sent.words:
                processed_pos.append(word.upos)
        
        # Match labels to processed POS tags (assuming same order)
        processed_labels = sent_labels[:len(processed_pos)]
        
        processed_data.append((processed_pos, processed_labels))
        
        # Update vocabularies
        for pos in processed_pos:
            pos2idx[pos]
        for label in processed_labels:
            label2idx[label]
    
    return processed_data

# Preprocess the initial training data
processed_training_data = preprocess_training_data(sentences, labels)

# Convert POS tags and labels to indices
input_pos = [[pos2idx[pos] for pos in sentence[0]] for sentence in processed_training_data]
label_data = [[label2idx[label] for label in sentence[1]] for sentence in processed_training_data]

# Dynamic Padding: Use the length of the longest sentence in the data
MAX_LEN = max([len(sentence) for sentence in input_pos])

# Padding the sequences
input_pos = [pos + [0]*(MAX_LEN - len(pos)) for pos in input_pos]
label_data = [label + [0]*(MAX_LEN - len(label)) for label in label_data]

# Dataset Class
class NERDataset(Dataset):
    def __init__(self, pos, labels):
        self.pos = pos
        self.labels = labels

    def __len__(self):
        return len(self.pos)

    def __getitem__(self, idx):
        return self.pos[idx], self.labels[idx]
    
# Sorting by length (this is necessary for pack_padded_sequence)
def collate_fn(batch):
    pos, labels = zip(*batch)
    
    # Sorting by length of the POS tags (descending order)
    lengths = torch.tensor([len(p) for p in pos])
    sorted_idx = torch.argsort(lengths, descending=True)
    
    pos = [pos[i] for i in sorted_idx]
    labels = [labels[i] for i in sorted_idx]
    lengths = lengths[sorted_idx]

    # Padding the sequences
    pos_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(p) for p in pos], batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(l) for l in labels], batch_first=True, padding_value=0)

    return pos_padded, labels_padded, lengths

# Create dataset and dataloader
dataset = NERDataset(input_pos, label_data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Enhanced NER Model with POS tag input
class NERModel(nn.Module):
    def __init__(self, pos_size, tagset_size, pos_embedding_dim=100, hidden_dim=100, dropout=0):
        super(NERModel, self).__init__()
        # Embedding layer for POS tags
        self.pos_embedding = nn.Embedding(pos_size, pos_embedding_dim)
        
        # LSTM layer with dropout
        self.lstm = nn.LSTM(pos_embedding_dim, hidden_dim, batch_first=True, dropout=dropout)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, pos):
        # Embed POS tags
        pos_emb = self.pos_embedding(pos)
        
        # LSTM and classification
        lstm_out, _ = self.lstm(pos_emb)
        output = self.fc(lstm_out)
        return output

# Set model parameters
pos_size = len(pos2idx)
tagset_size = len(label2idx)

# Create model
model = NERModel(pos_size, tagset_size)

# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding in loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(100):  # for simplicity, we use 100 epochs
    model.train()  # Set model to training mode
    total_loss = 0
    for pos, labels, lengths in dataloader:
        # Forward pass
        outputs = model(pos)

        # Flatten the outputs and labels for the loss function
        outputs = outputs.view(-1, tagset_size)
        labels = labels.view(-1)

        # Create mask to ignore padding tokens in the loss calculation
        mask = labels != 0  # Only valid tokens, ignore padding (0)

        # Check if there are any valid tokens to compute loss
        if mask.sum() == 0:  # No valid tokens to calculate loss
            continue
        
        # Apply the mask to the outputs and labels (flattened)
        outputs = outputs[mask]
        labels = labels[mask]

        # Compute loss, only for valid tokens (non-padding)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')

# Predict function (using only POS tags as input)
def predict(sentence):
    # Use Stanza for POS tagging
    doc = nlp(" ".join(sentence))
    
    # Extract POS tags
    processed_pos = [word.upos for sent in doc.sentences for word in sent.words]
    
    # Convert to indices, handling out-of-vocabulary POS tags
    input_pos = [pos2idx.get(pos, pos2idx["<PAD>"]) for pos in processed_pos]
    
    # Pad sequences
    input_pos = input_pos + [0] * (MAX_LEN - len(input_pos))
    
    # Convert to tensor
    input_pos = torch.tensor([input_pos]).long()

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_pos)
        _, predicted = torch.max(outputs, dim=2)

    # Convert indices back to labels
    predicted_labels = [list(label2idx.keys())[i] for i in predicted[0]]

    # Truncate to original sentence length
    predicted_labels = predicted_labels[:len(processed_pos)]

    return predicted_labels

# Test the model
test_sentence = ["Maka", "Les", "Parisiens", "dituntut", "bangkit", "meski", "punya", "catatan", "negatif", "saat", "bertemu", "Bayern", "Munich"]
predictions = predict(test_sentence)
print("Predicted Labels:", predictions)

# Print out vocabularies for reference
print("\nPOS to Index:", dict(pos2idx))
print("\nLabel to Index:", dict(label2idx))


2024-11-27 23:12:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 12.7MB/s]                    
2024-11-27 23:12:56 INFO: Downloaded file to C:\Users\Anand\stanza_resources\resources.json
2024-11-27 23:12:56 INFO: Loading these models for language: id (Indonesian):
| Processor | Package    |
--------------------------
| tokenize  | gsd        |
| mwt       | gsd        |
| pos       | gsd_charlm |

2024-11-27 23:12:56 INFO: Using device: cpu
2024-11-27 23:12:56 INFO: Loading: tokenize
2024-11-27 23:12:56 INFO: Loading: mwt
2024-11-27 23:12:56 INFO: Loading: pos
2024-11-27 23:12:57 INFO: Done loading processors!


['ï»¿Luis Enrique tahu start Paris Saint Germain di Liga Champions tidak bagus', 'B-PER I-PER O O B-ORG I-ORG I-ORG O B-LEAGUE I-LEAGUE O O']
['Maka Les Parisiens dituntut bangkit meski punya catatan negatif saat bertemu Bayern Munich', 'O B-ORG I-ORG O O O O O O O O B-ORG I-ORG']
[['ï»¿Luis', 'Enrique', 'tahu', 'start', 'Paris', 'Saint', 'Germain', 'di', 'Liga', 'Champions', 'tidak', 'bagus'], ['Maka', 'Les', 'Parisiens', 'dituntut', 'bangkit', 'meski', 'punya', 'catatan', 'negatif', 'saat', 'bertemu', 'Bayern', 'Munich']]
Epoch 1, Loss: 1.9105525016784668
Epoch 2, Loss: 1.831746220588684
Epoch 3, Loss: 1.755952000617981
Epoch 4, Loss: 1.6827203035354614
Epoch 5, Loss: 1.6122287511825562
Epoch 6, Loss: 1.545040488243103
Epoch 7, Loss: 1.4817801713943481
Epoch 8, Loss: 1.4228688478469849
Epoch 9, Loss: 1.3683631420135498
Epoch 10, Loss: 1.317850112915039
Epoch 11, Loss: 1.2704427242279053
Epoch 12, Loss: 1.2249923944473267
Epoch 13, Loss: 1.1804107427597046
Epoch 14, Loss: 1.1359336376