<a href="https://colab.research.google.com/github/SunnyZhao2004/Data_Project/blob/main/BiLSTM_CRF00.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive

drive.mount('/content/drive')
import pandas as pd
import os
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import ast

# ---------------------------
# 1. Define the mapping and helper functions
# ---------------------------

# Tag -> integer ID. The IDs enumerate every "B-" tag, then every "I-" tag
# (both over the same eight entity suffixes, in alphabetical order), and
# finally the "O" tag: B-* -> 0..7, I-* -> 8..15, "O" -> 16.
label_to_id = {
    tag: tag_id
    for tag_id, tag in enumerate(
        [
            f"{prefix}-{entity}"
            for prefix in ("B", "I")
            for entity in ("art", "eve", "geo", "gpe", "nat", "org", "per", "tim")
        ]
        + ["O"]
    )
}

def process_label(label):
    """
    Turn a stringified Python literal such as "['O', 'O', 'B-geo']" into the
    object it denotes (a list, for the label column this is applied to).

    label: str, or bytes holding UTF-8 text; bytes are decoded first.
    Returns whatever ast.literal_eval produces for the text.
    Raises ValueError / SyntaxError (from ast.literal_eval) on malformed text.
    """
    # literal_eval only accepts Python literals, so no arbitrary code is run.
    text = label.decode('utf-8') if isinstance(label, bytes) else label
    return ast.literal_eval(text)

def map_labels_to_ids(label_list, mapping):
    """
    Translate every label in label_list to its numeric ID.

    label_list: iterable of labels.
    mapping: dict-like from label to ID.
    Returns a new list of IDs in the same order.
    Raises KeyError if a label is not present in mapping.
    """
    ids = []
    for name in label_list:
        ids.append(mapping[name])
    return ids

def pad_labels(label_list, max_length, pad_value=0):
    """
    Return a copy of label_list that is exactly max_length items long.

    label_list: list of label IDs.
    max_length: target length (expected to be >= 0).
    pad_value: ID appended to sequences shorter than max_length. The default
        of 0 is kept for backward compatibility, but note that 0 is also the
        ID of "B-art" in label_to_id; pass a dedicated padding ID (or mask the
        padded positions downstream) to keep padding distinguishable.

    Sequences longer than max_length are truncated to it, so that every
    result has the same length and can be stacked into a rectangular array.
    """
    clipped = list(label_list[:max_length])
    return clipped + [pad_value] * (max_length - len(clipped))

# ---------------------------
# 2. Load and split the dataset
# ---------------------------

# Location of the project data on the mounted Google Drive.
folder_path = "/content/drive/MyDrive/comp-4211-spring-25-project/"
csv_path = folder_path + "train.csv"

# Read the CSV file.
df = pd.read_csv(csv_path)

# Manually split the dataset by row position (no shuffling): the first
# 32000 rows are used for training, the remaining rows are held out as the test set.
train_dataset_df = df[:32000].reset_index(drop=True)
test_dataset_df = df[32000:].reset_index(drop=True)

print("Train dataset shape:", train_dataset_df.shape)
print("Test dataset shape:", test_dataset_df.shape)

# ---------------------------
# 3. Process the NER tag column for both train and test sets
# ---------------------------

# Process train set:
# 'NER Tag' holds a stringified list of tags; parse it, then map tags to IDs.
train_dataset_df['processed_NER_tag'] = train_dataset_df['NER Tag'].apply(process_label)
train_dataset_df['NER_tag_ids'] = train_dataset_df['processed_NER_tag'].apply(lambda x: map_labels_to_ids(x, label_to_id))
# Determine the maximum sequence length from the training set.
max_length = train_dataset_df['NER_tag_ids'].apply(len).max()
print("Maximum label sequence length (train):", max_length)
# Pad the sequences:
# NOTE(review): padding uses ID 0, which label_to_id also assigns to "B-art";
# padded positions need to be masked downstream to be told apart from real
# B-art tags.
train_dataset_df['NER_tag_ids_padded'] = train_dataset_df['NER_tag_ids'].apply(lambda x: pad_labels(x, max_length))

# Process test set:
# NOTE(review): max_length is derived from the training split only. TODO confirm
# that no test sequence is longer; otherwise the fixed-width array conversion
# below depends on how pad_labels treats over-long input.
test_dataset_df['processed_NER_tag'] = test_dataset_df['NER Tag'].apply(process_label)
test_dataset_df['NER_tag_ids'] = test_dataset_df['processed_NER_tag'].apply(lambda x: map_labels_to_ids(x, label_to_id))
test_dataset_df['NER_tag_ids_padded'] = test_dataset_df['NER_tag_ids'].apply(lambda x: pad_labels(x, max_length))

# ---------------------------
# 4. Convert to NumPy arrays and create tf.data.Datasets
# ---------------------------

# The sentence text is read from the "Sentence" column. The values are passed on
# as stored in the CSV (no parsing or tokenisation is applied here).
train_sentences = train_dataset_df['Sentence'].values  # shape: (num_train_examples,)
test_sentences = test_dataset_df['Sentence'].values    # shape: (num_test_examples,)

# Convert padded label lists to a NumPy array of shape (num_examples, max_length)
train_labels = np.array(train_dataset_df['NER_tag_ids_padded'].tolist(), dtype=np.int32)
test_labels = np.array(test_dataset_df['NER_tag_ids_padded'].tolist(), dtype=np.int32)

print("Train sentences array shape:", train_sentences.shape)
print("Train labels array shape:", train_labels.shape)

# Create TensorFlow datasets from the numpy arrays.
# Each element is a (sentence, padded label IDs) pair.
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels))

# Batch and prefetch the datasets. Neither dataset is shuffled here.
batch_size = 32
train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# ---------------------------
# 5. (Optional) Preview a few examples from the tf.data.Dataset
# ---------------------------
# take(1) yields one *batch* (up to batch_size examples), so this prints the
# whole first batch rather than a single example.
for sentence, labels in train_dataset.take(1):
    print("Sentence:", sentence.numpy())
    print("Padded NER tag IDs:", labels.numpy())


Train dataset shape: (32000, 3)
Test dataset shape: (8000, 3)
Maximum label sequence length (train): 104
Train sentences array shape: (32000,)
Train labels array shape: (32000, 104)
Sentence: [b"['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']"
 b"['Iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'Wednesday', ',', 'after', 'an', 'IAEA', 'surveillance', 'system', 'begins', 'functioning', '.']"
 b"['Helicopter', 'gunships', 'Saturday', 'pounded', 'militant', 'hideouts', 'in', 'the', 'Orakzai', 'tribal', 'region', ',', 'where', 'many', 'Taliban', 'militants', 'are', 'believed', 'to', 'have', 'fled', 'to', 'avoid', 'an', 'earlier', 'military', 'offensive', 'in', 'nearby', 'South', 'Waziristan', '.']"
 b"['They', 'left', 'after', 'a', 'tense'

In [None]:
!pip install torchcrf



In [None]:
pip show torchcrf

Name: TorchCRF
Version: 1.1.0
Summary: An Implementation of Conditional Random Fields in pytorch
Home-page: https://github.com/s14t284/TorchCRF
Author: Ryuya Ikeda
Author-email: rikeda71@gmail.com
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: numpy, torch
Required-by: 


In [None]:
import torchcrf


ModuleNotFoundError: No module named 'torchcrf'

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np

# ---------------
# Model definitions
# ---------------

class BiLSTM_CRF_WithChar(nn.Module):
    """
    BiLSTM-CRF sequence tagger whose per-token input is a word embedding
    concatenated with character-level features produced by CharCNN.

    Word index 0 is treated as padding: the mask `sentences != 0` is applied
    both when computing the CRF loss and when decoding, so padded positions
    never contribute to the loss and never appear in the predictions.
    """

    def __init__(self, vocab_size, tagset_size, char_vocab_size,
                 word_embedding_dim=100, char_embedding_dim=30, char_out_channels=50, hidden_dim=256):
        """
        vocab_size: number of rows of the word embedding table.
        tagset_size: number of tags scored by the linear layer and the CRF.
        char_vocab_size: number of rows of the character embedding table.
        word_embedding_dim: size of each word embedding.
        char_embedding_dim: size of each character embedding (passed to CharCNN).
        char_out_channels: number of character features per word (CharCNN output size).
        hidden_dim: total BiLSTM output size; each direction gets hidden_dim // 2
            units, so the value must be even.

        Raises ValueError if hidden_dim is odd.
        """
        super(BiLSTM_CRF_WithChar, self).__init__()
        if hidden_dim % 2 != 0:
            # The two LSTM directions produce 2 * (hidden_dim // 2) features, which
            # only equals the hidden_dim expected by hidden2tag when it is even.
            raise ValueError(f"hidden_dim must be even, got {hidden_dim}")
        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim)
        self.char_cnn = CharCNN(char_vocab_size, char_embedding_dim, char_out_channels, kernel_size=3)
        # Concatenate word and character features.
        combined_dim = word_embedding_dim + char_out_channels
        self.lstm = nn.LSTM(combined_dim, hidden_dim // 2, num_layers=1,
                            bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, sentences, words_chars, tags=None):
        """
        sentences: Tensor of shape (batch_size, seq_len) containing word indices (0 = padding).
        words_chars: Tensor of shape (batch_size, seq_len, word_length) containing character indices.
        tags: (Optional) Tensor of shape (batch_size, seq_len) containing tag indices.

        Returns the negated CRF output (the training loss) when tags is given;
        otherwise the decoded tag sequences from the CRF, one per sentence,
        restricted to the non-padding positions.
        """
        word_embeds = self.word_embeddings(sentences)  # (batch_size, seq_len, word_embedding_dim)
        char_embeds = self.char_cnn(words_chars)         # (batch_size, seq_len, char_out_channels)
        # Concatenate along the feature dimension.
        embeds = torch.cat([word_embeds, char_embeds], dim=2)  # (batch_size, seq_len, combined_dim)
        lstm_out, _ = self.lstm(embeds)                 # (batch_size, seq_len, hidden_dim)
        emissions = self.hidden2tag(lstm_out)           # (batch_size, seq_len, tagset_size)

        # Same padding mask for training and inference. Without it at decode time
        # the CRF would also emit tags for the padded tail of every sentence.
        # NOTE(review): the CRF layer is expected to require a non-padded first
        # position in every row - confirm against the pytorch-crf documentation.
        mask = sentences != 0

        if tags is not None:
            return -self.crf(emissions, tags, mask=mask)
        return self.crf.decode(emissions, mask=mask)

class CharCNN(nn.Module):
    """
    Character-level feature extractor: embeds the characters of each word,
    runs a 1-D convolution over them and max-pools over the character axis,
    giving one fixed-size feature vector per word.
    """

    def __init__(self, char_vocab_size, char_embedding_dim=30, out_channels=50, kernel_size=3):
        super().__init__()
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.conv1d = nn.Conv1d(char_embedding_dim, out_channels, kernel_size)

    def forward(self, words_chars):
        """
        words_chars: Tensor of shape (batch_size, seq_len, word_length)
        Returns a Tensor of shape (batch_size, seq_len, out_channels).
        """
        n_batch, n_words, n_chars = words_chars.size()
        # Treat every word as an independent sample: (batch_size*seq_len, word_length)
        flat_chars = words_chars.view(-1, n_chars)
        # (batch_size*seq_len, word_length, char_embedding_dim)
        char_vectors = self.char_embedding(flat_chars)
        # Conv1d wants channels before length: (batch_size*seq_len, char_embedding_dim, word_length)
        conv_input = char_vectors.transpose(1, 2)
        # (batch_size*seq_len, out_channels, L_out)
        conv_maps = self.conv1d(conv_input)
        # Max over the L_out axis leaves one value per channel for each word.
        pooled = conv_maps.max(dim=2).values
        # Restore the (batch_size, seq_len, out_channels) layout.
        return pooled.view(n_batch, n_words, -1)

# ---------------
# Data Processing Utilities
# ---------------

def tokenize_text(text):
    """Split text into whitespace-separated tokens, ignoring leading/trailing whitespace."""
    trimmed = text.strip()
    return trimmed.split()

def build_vocab(sequences, min_freq=1, special_tokens=['<PAD>', '<UNK>']):
    """
    Build a token -> index mapping.

    sequences: iterable of token sequences to count.
    min_freq: minimum number of occurrences a token needs to get an index.
    special_tokens: tokens that receive the first indices, in the given order.
    Returns a dict in which the special tokens come first, followed by the
    qualifying tokens in order of first appearance.
    """
    token_counts = Counter()
    for sequence in sequences:
        token_counts.update(sequence)

    vocab = dict((token, position) for position, token in enumerate(special_tokens))
    for token, freq in token_counts.items():
        # Too rare, or already present (e.g. a special token occurring in the data).
        if freq < min_freq or token in vocab:
            continue
        vocab[token] = len(vocab)
    return vocab

def encode_sequence(seq, vocab):
    """
    Map every token of seq to its index in vocab.

    seq: iterable of tokens.
    vocab: dict from token to index. If it contains '<UNK>', tokens missing
        from the vocabulary are mapped to that index.
    Returns a list of indices in the same order as seq.
    Raises KeyError for a token missing from vocab when vocab has no '<UNK>' entry.

    The '<UNK>' entry is looked up lazily, so vocabularies without it (e.g. a
    tag vocabulary) can still encode sequences made only of known tokens.
    """
    has_unk = '<UNK>' in vocab
    encoded = []
    for token in seq:
        if token in vocab:
            encoded.append(vocab[token])
        elif has_unk:
            encoded.append(vocab['<UNK>'])
        else:
            raise KeyError(f"token {token!r} is not in the vocabulary and no '<UNK>' entry exists")
    return encoded

def pad_sequences(sequences, pad_value, max_len=None):
    """
    Bring every list in sequences to a common length.

    sequences: lists to pad.
    pad_value: value appended to lists that are too short.
    max_len: target length; when None, the length of the longest sequence.
    Longer lists are cut to the target length.
    Returns (padded_sequences, target_length).
    """
    target = max(map(len, sequences)) if max_len is None else max_len
    clipped = [seq[:target] for seq in sequences]
    padded = [seq + [pad_value] * (target - len(seq)) for seq in clipped]
    return padded, target

# Character sequences: bring each word to exactly max_word_len entries.
def pad_word(word_seq, max_word_len):
    """Cut word_seq to max_word_len items, or right-pad it with 0s up to that length."""
    deficit = max_word_len - len(word_seq)
    if deficit <= 0:
        return word_seq[:max_word_len]
    return word_seq + [0] * deficit

# ---------------
# PyTorch Dataset Definition
# ---------------

class NERDataset(Dataset):
    """
    Torch dataset that turns DataFrame rows into index tensors for the words,
    the tags and the per-word characters of each sentence.
    """

    def __init__(self, df, word_vocab, tag_vocab, char_vocab, max_word_len=10, sentence_max_len=None):
        """
        df: DataFrame with 'sentence' and 'tags' columns (whitespace separated strings)
        word_vocab: mapping from words to indices (encoded with encode_sequence).
        tag_vocab: mapping from tags to indices. A tag missing from it is mapped to
            tag_vocab['<UNK>'] when that entry exists; otherwise KeyError is raised.
        char_vocab: mapping from characters to indices. Unknown characters map to
            char_vocab['<UNK>'] when that entry exists, else to 0.
        max_word_len: maximum number of characters per word.
        sentence_max_len: value stored as self.sentence_max_len; if None, the length of
            the longest kept sentence (0 when no row is kept). It is only recorded
            here - sentences are not truncated or padded to it.

        Rows whose word and tag counts differ, and rows without any token, are
        skipped; self.skipped_rows holds how many were dropped.
        """
        self.sentences = []
        self.tags = []
        self.chars = []
        self.max_word_len = max_word_len
        self.skipped_rows = 0

        # Unknown characters must not collapse onto the padding index when the
        # vocabulary offers a dedicated '<UNK>' entry.
        unk_char = char_vocab.get('<UNK>', 0)

        token_rows = []
        tag_rows = []
        for sentence_text, tag_text in zip(df['sentence'], df['tags']):
            word_tokens = tokenize_text(sentence_text)
            tag_tokens = tokenize_text(tag_text)
            # Skip misaligned rows, and empty rows (they would yield a
            # zero-length sample that cannot be batched).
            if len(word_tokens) != len(tag_tokens) or not word_tokens:
                self.skipped_rows += 1
                continue
            token_rows.append(word_tokens)
            tag_rows.append(tag_tokens)

            # Build character sequences per word, each padded/cut to max_word_len.
            self.chars.append([
                pad_word([char_vocab.get(c, unk_char) for c in word], max_word_len)
                for word in word_tokens
            ])

        # Record the sentence length (see docstring); default=0 covers an empty dataset.
        if sentence_max_len is None:
            self.sentence_max_len = max((len(s) for s in token_rows), default=0)
        else:
            self.sentence_max_len = sentence_max_len

        # Encode sequences
        self.sentences = [encode_sequence(s, word_vocab) for s in token_rows]
        self.tags = [self._encode_tags(s, tag_vocab) for s in tag_rows]

    @staticmethod
    def _encode_tags(tag_tokens, tag_vocab):
        """Map tags to indices; fall back to '<UNK>' only if tag_vocab defines it."""
        encoded = []
        for tag in tag_tokens:
            if tag in tag_vocab:
                encoded.append(tag_vocab[tag])
            elif '<UNK>' in tag_vocab:
                encoded.append(tag_vocab['<UNK>'])
            else:
                raise KeyError(f"tag {tag!r} is not in tag_vocab")
        return encoded

    def __len__(self):
        """Number of kept sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Return the idx-th sample as a dict of LongTensors:
        'words' (seq_len,), 'tags' (seq_len,), 'chars' (seq_len, max_word_len).
        """
        sent = self.sentences[idx]
        tags = self.tags[idx]
        chars = self.chars[idx]
        return {'words': torch.tensor(sent, dtype=torch.long),
                'tags': torch.tensor(tags, dtype=torch.long),
                'chars': torch.tensor(chars, dtype=torch.long)}

def collate_fn(batch):
    """
    Merge a list of samples into batch tensors, right-padding every sequence
    with 0 up to the longest one in the batch.

    batch: list of dicts with 'words', 'tags' and 'chars' tensors.
    Returns (words_tensor, chars_tensor, tags_tensor), all of dtype long.
    """
    word_lists, tag_lists, char_tensors = [], [], []
    for sample in batch:
        word_lists.append(sample['words'].tolist())
        tag_lists.append(sample['tags'].tolist())
        char_tensors.append(sample['chars'])

    # Word and tag index sequences are padded on the sentence axis.
    words_padded, batch_len = pad_sequences(word_lists, pad_value=0)
    tags_padded, _ = pad_sequences(tag_lists, pad_value=0)

    # Character grids get extra all-zero "words" until they reach batch_len rows.
    chars_padded = []
    for char_tensor in char_tensors:
        word_width = char_tensor.size(1)
        missing_rows = batch_len - char_tensor.size(0)
        filler = [[0] * word_width for _ in range(missing_rows)]
        chars_padded.append(char_tensor.tolist() + filler)

    words_tensor = torch.tensor(words_padded, dtype=torch.long)
    tags_tensor = torch.tensor(tags_padded, dtype=torch.long)
    chars_tensor = torch.tensor(chars_padded, dtype=torch.long)

    return words_tensor, chars_tensor, tags_tensor

# ---------------
# Main Training and Evaluation Code
# ---------------

def _flatten_token_cell(cell):
    """Turn a CSV cell holding a stringified token list (e.g. "['a', 'b']") into "a b"."""
    import ast  # local import: this cell's import block does not bring in ast

    if not isinstance(cell, str) or not cell.lstrip().startswith('['):
        return cell
    try:
        tokens = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        # Not a Python literal after all; leave the text untouched.
        return cell
    # Whitespace inside a token would break the one-token-per-word alignment once the
    # text is split again, so inner whitespace is replaced by '_' (and an empty or
    # whitespace-only token becomes '_').
    return ' '.join('_'.join(str(token).split()) or '_' for token in tokens)


def _with_training_columns(df):
    """Return df with 'sentence'/'tags' columns, deriving them from 'Sentence'/'NER Tag' if absent."""
    if 'sentence' in df.columns and 'tags' in df.columns:
        return df
    missing = [col for col in ('Sentence', 'NER Tag') if col not in df.columns]
    if missing:
        raise KeyError(f"train.csv lacks the expected column(s): {missing}")
    return df.assign(sentence=df['Sentence'].map(_flatten_token_cell),
                     tags=df['NER Tag'].map(_flatten_token_cell))


def main():
    """
    Train the BiLSTM-CRF tagger on the first 32000 rows of train.csv and run it
    on the remaining rows.

    Prints the vocabulary sizes, the average loss per epoch, the token-level
    accuracy on the held-out rows and the predicted tags of up to five
    held-out sentences.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Import dataset from CSV and split manually
    folder_path = "/content/drive/MyDrive/comp-4211-spring-25-project/"
    # The CSV stores 'Sentence' / 'NER Tag' as stringified lists, whereas the rest of
    # this pipeline works on whitespace-separated 'sentence' / 'tags' text.
    df = _with_training_columns(pd.read_csv(os.path.join(folder_path, "train.csv")))
    train_df = df[:32000]
    test_df = df[32000:]

    # Prepare token lists from training data (you could also include test for a larger vocab)
    word_sequences = [tokenize_text(s) for s in train_df['sentence']]
    tag_sequences = [tokenize_text(t) for t in train_df['tags']]

    # Build vocabularies for words, tags, and characters.
    word_vocab = build_vocab(word_sequences, special_tokens=['<PAD>', '<UNK>'])
    tag_set = sorted(set(token for tags in tag_sequences for token in tags))
    # For tag vocab, we include padding as tag index 0.
    tag_vocab = {tag: idx + 1 for idx, tag in enumerate(tag_set)}
    tag_vocab['<PAD>'] = 0
    # For characters, include padding token (index 0).
    char_vocab = build_vocab([list(word) for sent in word_sequences for word in sent], special_tokens=['<PAD>', '<UNK>'])

    print("Word vocab size:", len(word_vocab))
    print("Tag vocab size:", len(tag_vocab))
    print("Char vocab size:", len(char_vocab))

    # Create Dataset and DataLoader for training and test.
    max_word_len = 10  # you can adjust this value
    train_dataset = NERDataset(train_df, word_vocab, tag_vocab, char_vocab, max_word_len=max_word_len)
    test_dataset = NERDataset(test_df, word_vocab, tag_vocab, char_vocab, max_word_len=max_word_len,
                              sentence_max_len=train_dataset.sentence_max_len)

    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Model initialization.
    vocab_size = len(word_vocab)
    tagset_size = len(tag_vocab)
    char_vocab_size = len(char_vocab)

    model = BiLSTM_CRF_WithChar(vocab_size, tagset_size, char_vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 10  # adjust the number of epochs as necessary

    # Training loop.
    model.train()
    for epoch in range(1, num_epochs + 1):
        total_loss = 0
        for batch in train_loader:
            words, chars, tags = [b.to(device) for b in batch]
            optimizer.zero_grad()
            loss = model(words, chars, tags)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch}/{num_epochs} - Average Loss: {avg_loss:.4f}")

    # Evaluation on test set
    model.eval()
    predictions_all = []
    tags_all = []
    with torch.no_grad():
        for batch in test_loader:
            words, chars, tags = [b.to(device) for b in batch]
            predictions = model(words, chars)
            # Keep only the real (non-padding) positions of every sentence so that
            # predictions and gold tags line up token by token.
            lengths = (words != 0).sum(dim=1).tolist()
            for pred, gold, length in zip(predictions, tags.cpu().tolist(), lengths):
                predictions_all.append(list(pred[:length]))
                tags_all.append(gold[:length])

    # Token-level accuracy against the gold labels of the held-out rows.
    total_tokens = sum(len(gold) for gold in tags_all)
    correct_tokens = sum(p == g
                         for pred, gold in zip(predictions_all, tags_all)
                         for p, g in zip(pred, gold))
    if total_tokens:
        print(f"Token-level accuracy on held-out rows: {correct_tokens / total_tokens:.4f}")

    # Convert tag indices back to tag strings.
    idx2tag = {idx: tag for tag, idx in tag_vocab.items()}
    predictions_str = [[idx2tag.get(idx, "<UNK>") for idx in sent_pred]
                       for sent_pred in predictions_all]

    # Example: print first few predictions from test set (fewer if the set is smaller).
    for i in range(min(5, len(predictions_str))):
        print(f"Sentence {i+1} prediction:", predictions_str[i])

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'torchcrf'