## Q1. (40 marks)

In Q1, we will be focusing on the IMDb dataset. This is a dataset for binary sentiment classification, and is provided with a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. To load the dataset, you can easily download the dataset by adding this line in your colab notebook:

```
! wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
```

In [1]:
# download the Large IMDB Movie Review Dataset
# the task is binary classification: positive or negative review

!wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

--2024-11-29 19:12:15--  http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-11-29 19:12:34 (4.24 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



# Import Required Libraries
Import necessary libraries including torch, numpy, and os.

In [7]:
# Import Required Libraries
import torch
import numpy as np
import os
import torch.nn as nn
from torch.utils.data import DataLoader
from torch import optim
from collections import namedtuple

# Set random seed for reproducibility
seed = 4011
torch.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic = True


# Set Random Seed and Device
Set the random seed for reproducibility and define the device for computation (CPU/GPU).

In [8]:
# Set random seed for reproducibility
seed = 4011
torch.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic = True

# Set device to MPS (Metal Performance Shaders) if available, otherwise use CUDA or CPU
device = torch.device("mps" if torch.backends.mps.is_built() else "cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(f"Using device: {device}")

Using device: mps


# Load IMDb Dataset
Load the IMDb dataset and read the movie reviews from the dataset directory.

In [4]:
# Load IMDb Dataset
train_path = "aclImdb/train/"
test_path = "aclImdb/test/"

# Define a namedtuple to store sentences
Sentence = namedtuple('Sentence', ['index', 'tokens', 'label'])

def read_imdb_movie_dataset(dataset_path):
    indices = []
    text = []
    rating = []
    i = 0

    # Read positive reviews
    for filename in os.listdir(os.path.join(dataset_path, "pos")):
        file_path = os.path.join(dataset_path, "pos", filename)
        data = open(file_path, 'r', encoding="ISO-8859-1").read()
        indices.append(i)
        text.append(data)
        rating.append(1)
        i += 1

    # Read negative reviews
    for filename in os.listdir(os.path.join(dataset_path, "neg")):
        file_path = os.path.join(dataset_path, "neg", filename)
        data = open(file_path, 'r', encoding="ISO-8859-1").read()
        indices.append(i)
        text.append(data)
        rating.append(0)
        i += 1

    sentences = [Sentence(index, text.split(), rating) for index, text, rating in zip(indices, text, rating)]
    return sentences

# Load train and test datasets
train_examples = read_imdb_movie_dataset(train_path)
test_examples = read_imdb_movie_dataset(test_path)

# Print the number of examples in train and test datasets
print(f"Number of training examples: {len(train_examples)}")
print(f"Number of testing examples: {len(test_examples)}")

Number of training examples: 25000
Number of testing examples: 25000


# Preprocess Data
Preprocess the text data including tokenization and building vocabulary with special tokens.

In [5]:
# Define special tokens
UNK = '<UNK>'
PAD = '<PAD>'
BOS = '<BOS>'
EOS = '<EOS>'

# Define VocabItem class
class VocabItem:
    def __init__(self, string, hash=None):
        self.string = string
        self.count = 0
        self.hash = hash

    def __str__(self):
        return 'VocabItem({})'.format(self.string)

    def __repr__(self):
        return self.__str__()

# Define Vocab class
class Vocab:
    def __init__(self, min_count=0, no_unk=False, add_padding=False, add_bos=False, add_eos=False, unk=None):
        self.no_unk = no_unk
        self.vocab_items = []
        self.vocab_hash = {}
        self.word_count = 0
        self.special_tokens = []
        self.min_count = min_count
        self.add_padding = add_padding
        self.add_bos = add_bos
        self.add_eos = add_eos
        self.unk = unk

        self.UNK = None
        self.PAD = None
        self.BOS = None
        self.EOS = None

        self.index2token = []
        self.token2index = {}

        self.finished = False

    def add_tokens(self, tokens):
        if self.finished:
            raise RuntimeError('Vocabulary is finished')

        for token in tokens:
            if token not in self.vocab_hash:
                self.vocab_hash[token] = len(self.vocab_items)
                self.vocab_items.append(VocabItem(token))

            self.vocab_items[self.vocab_hash[token]].count += 1
            self.word_count += 1

    def finish(self):
        token2index = self.token2index
        index2token = self.index2token

        tmp = []

        if not self.no_unk:
            if self.unk:
                self.UNK = VocabItem(self.unk, hash=0)
                self.UNK.count = self.vocab_items[self.vocab_hash[self.unk]].count
                index2token.append(self.UNK)
                self.special_tokens.append(self.UNK)

                for token in self.vocab_items:
                    if token.string != self.unk:
                        tmp.append(token)
            else:
                self.UNK = VocabItem(UNK, hash=0)
                index2token.append(self.UNK)
                self.special_tokens.append(self.UNK)

                for token in self.vocab_items:
                    if token.count <= self.min_count:
                        self.UNK.count += token.count
                    else:
                        tmp.append(token)
        else:
            for token in self.vocab_items:
                tmp.append(token)

        tmp.sort(key=lambda token: token.count, reverse=True)

        if self.add_bos:
            self.BOS = VocabItem(BOS)
            tmp.append(self.BOS)
            self.special_tokens.append(self.BOS)

        if self.add_eos:
            self.EOS = VocabItem(EOS)
            tmp.append(self.EOS)
            self.special_tokens.append(self.EOS)

        if self.add_padding:
            self.PAD = VocabItem(PAD)
            tmp.append(self.PAD)
            self.special_tokens.append(self.PAD)

        index2token += tmp

        for i, token in enumerate(self.index2token):
            token2index[token.string] = i
            token.hash = i

        self.index2token = index2token
        self.token2index = token2index

        if not self.no_unk:
            print('Unknown vocab size:', self.UNK.count)

        print('Vocab size: %d' % len(self))

        self.finished = True

    def __getitem__(self, i):
        return self.index2token[i]

    def __len__(self):
        return len(self.index2token)

    def __iter__(self):
        return iter(self.index2token)

    def __contains__(self, key):
        return key in self.token2index

    def tokens2indices(self, tokens, add_bos=False, add_eos=False):
        string_seq = []
        if add_bos:
            string_seq.append(self.BOS.hash)
        for token in tokens:
            if self.no_unk:
                string_seq.append(self.token2index[token])
            else:
                string_seq.append(self.token2index.get(token, self.UNK.hash))
        if add_eos:
            string_seq.append(self.EOS.hash)
        return string_seq

    def indices2tokens(self, indices, ignore_ids=()):
        tokens = []
        for idx in indices:
            if idx in ignore_ids:
                continue
            tokens.append(self.index2token[idx].string)
        return tokens

# Initialize vocabularies
src_vocab = Vocab(min_count=10, add_padding=True)
tgt_vocab = Vocab(no_unk=True, add_padding=False)

# Add tokens to vocabularies
for sentence in train_examples:
    src_vocab.add_tokens(sentence.tokens[:300])
    tgt_vocab.add_tokens([sentence.label])

# Finish building vocabularies
src_vocab.finish()
tgt_vocab.finish()

# Print vocab sizes
print(f"Source vocab size: {len(src_vocab)}")
print(f"Target vocab size: {len(tgt_vocab)}")

Unknown vocab size: 424424
Vocab size: 22521
Vocab size: 2
Source vocab size: 22521
Target vocab size: 2


# Build Vocabulary
Build the vocabulary from the preprocessed text data and print the size of the vocabulary.

In [6]:
# Build Vocabulary
# Initialize vocabularies
src_vocab = Vocab(min_count=10, add_padding=True)
tgt_vocab = Vocab(no_unk=True, add_padding=False)

# Add tokens to vocabularies
for sentence in train_examples:
    src_vocab.add_tokens(sentence.tokens[:300])
    tgt_vocab.add_tokens([sentence.label])

# Finish building vocabularies
src_vocab.finish()
tgt_vocab.finish()

# Print vocab sizes
print(f"Source vocab size: {len(src_vocab)}")
print(f"Target vocab size: {len(tgt_vocab)}")

Vocabs = namedtuple('Vocabs', ['src', 'tgt'])
vocabs = Vocabs(src_vocab, tgt_vocab)

Unknown vocab size: 424424
Vocab size: 22521
Vocab size: 2
Source vocab size: 22521
Target vocab size: 2


# Create Embedding Matrix
Create an embedding matrix based on the vocabulary and print its size.

In [7]:
# Create Embedding Matrix
embedding_size = 300

# Create an embedding matrix based on the vocabulary
embeddings = nn.Embedding(
    len(src_vocab),
    embedding_size,
    padding_idx=src_vocab.PAD.hash
)

# Print the size of the embedding matrix
print(embeddings.weight.size())

torch.Size([22521, 300])



### Q1-1. To get your data prepared, build up Pytorch dataloaders for model training and print out one batch of training data. (15 marks)

- To check whether your dataloader can work successfully, you can choose to use `next(iter(train_dataloader))`. You can refer to https://pytorch.org/tutorials/beginner/basics/data_tutorial.html.

In [8]:
# Build PyTorch Dataloaders

# Define Batch and BatchTuple classes
class Batch(dict):
    def __init__(self, *args, **kwargs):
        super(Batch, self).__init__(*args, **kwargs)
        self.__dict__ = self
        self._is_torch = False

    def to_torch_(self, device):
        self._is_torch = False
        for key in self.keys():
            value = self[key]
            if isinstance(value, BatchTuple):
                value.to_torch_(device)
            if isinstance(value, np.ndarray):
                self[key] = torch.from_numpy(value).to(device)

class BatchTuple(object):
    def __init__(self, sequences, lengths, sublengths, masks):
        self.sequences = sequences
        self.lengths = lengths
        self.sublengths = sublengths
        self.masks = masks
        self._is_torch = False

    def to_torch_(self, device):
        if not self._is_torch:
            self.sequences = torch.tensor(
                self.sequences, device=device, dtype=torch.long
            )
            if self.lengths is not None:
                self.lengths = torch.tensor(
                    self.lengths, device=device, dtype=torch.long
                )
            if self.sublengths is not None:
                self.sublengths = torch.tensor(
                    self.sublengths, device=device, dtype=torch.long
                )
            if self.masks is not None:
                self.masks = torch.tensor(
                    self.masks, device=device, dtype=torch.float
                )

# Define padding function
def pad_list(sequences, dim0_pad=None, dim1_pad=None, align_right=False, pad_value=0):
    sequences = [np.asarray(sublist) for sublist in sequences]
    if not dim0_pad:
        dim0_pad = len(sequences)
    if not dim1_pad:
        dim1_pad = max(len(seq) for seq in sequences)
    out = np.full(shape=(dim0_pad, dim1_pad), fill_value=pad_value)
    lengths = []
    for i in range(len(sequences)):
        data_length = len(sequences[i])
        lengths.append(data_length)
        offset = dim1_pad - data_length if align_right else 0
        np.put(out[i], range(offset, offset + data_length), sequences[i])
    lengths = np.array(lengths)
    return out, lengths

# Define SequenceClassificationBatchBuilder class
class SequenceClassificationBatchBuilder(object):
    def __init__(self, vocabs, max_len=None):
        self.vocabs = vocabs
        self.max_len = max_len

    def __call__(self, examples):
        sequences = [example.tokens[:self.max_len] for example in examples]
        labels = [example.label for example in examples]
        sequences = [self.vocabs.src.tokens2indices(seq) for seq in sequences]
        sequences, lengths = pad_list(sequences, pad_value=self.vocabs.src.PAD.hash)
        labels = [self.vocabs.tgt.token2index[label] for label in labels]
        batch = Batch(
            src=BatchTuple(sequences, lengths, None, None),
            tgt=np.array(labels)
        )
        return batch

# Create DataLoader objects
train_dataloader = DataLoader(train_examples, batch_size=100, shuffle=True, collate_fn=SequenceClassificationBatchBuilder(vocabs, 300))
test_dataloader = DataLoader(test_examples, batch_size=100, shuffle=False, collate_fn=SequenceClassificationBatchBuilder(vocabs, 300))

# Print one batch of training data
one_batch = next(iter(train_dataloader))
print(one_batch)

{'src': <__main__.BatchTuple object at 0x7c0527cc17e0>, 'tgt': array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0]), '_is_torch': False}


### Q1-2. We choose bidirectional LSTM (BiLSTM) as the model. Train the model for 5 epoches with embedding matrix you obtained earlier, and for each epoch, print out the training loss, training accuracy, testing loss and testing accuracy. You could choose any appropriate loss function and values for hyperparameters. (25 marks)

- If you found difficulty understanding the structure of BiLSTM, you may refer to the supplementary note named *notes_on_lstm* inside tutorial 9 for detailed information.

- You definitely want to use GPU for this colab notebook. Go to Edit > Notebook settings as the following: Click on “Notebook settings” and select “GPU”.

In [9]:
# Define BiLSTM Model
class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size, n_layers, dropout):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=src_vocab.PAD.hash)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        embedded = self.dropout(self.embedding(x))
        lengths = lengths.cpu().to(torch.int64)  # Move lengths to CPU and ensure it is int64
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        return self.fc(hidden)

# Hyperparameters
embedding_dim = embedding_size
hidden_dim = 256
output_size = 1
n_layers = 2
dropout = 0.5

# Instantiate the model
model = BiLSTM(embedding_dim, hidden_dim, len(src_vocab), output_size, n_layers, dropout).to(device)

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

In [10]:
from tqdm import tqdm
# Train and Evaluate BiLSTM Model

# Training function
def train_model(model, train_dataloader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        batch.to_torch_(device)
        predictions = model(batch.src.sequences, batch.src.lengths).squeeze(1)
        loss = criterion(predictions, batch.tgt.float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        predicted = torch.round(torch.sigmoid(predictions))
        correct += (predicted == batch.tgt).sum().item()
        total += batch.tgt.size(0)

    accuracy = correct / total
    return epoch_loss / len(train_dataloader), accuracy

# Evaluation function
def evaluate_model(model, test_dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_dataloader:
            batch.to_torch_(device)
            predictions = model(batch.src.sequences, batch.src.lengths).squeeze(1)
            loss = criterion(predictions, batch.tgt.float())
            epoch_loss += loss.item()
            predicted = torch.round(torch.sigmoid(predictions))
            correct += (predicted == batch.tgt).sum().item()
            total += batch.tgt.size(0)

    accuracy = correct / total
    return epoch_loss / len(test_dataloader), accuracy

# Training loop
n_epochs = 5

for epoch in range(n_epochs):
    train_loss, train_acc = train_model(model, train_dataloader, criterion, optimizer, device)
    test_loss, test_acc = evaluate_model(model, test_dataloader, criterion, device)
    print(f'Epoch {epoch+1}/{n_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

100%|██████████| 250/250 [00:37<00:00,  6.70it/s]


Epoch 1/5
Train Loss: 0.6674, Train Accuracy: 0.5836
Test Loss: 0.5548, Test Accuracy: 0.7204


100%|██████████| 250/250 [00:37<00:00,  6.65it/s]


Epoch 2/5
Train Loss: 0.5602, Train Accuracy: 0.7118
Test Loss: 0.4748, Test Accuracy: 0.7746


100%|██████████| 250/250 [00:36<00:00,  6.77it/s]


Epoch 3/5
Train Loss: 0.4639, Train Accuracy: 0.7844
Test Loss: 0.5096, Test Accuracy: 0.7592


100%|██████████| 250/250 [00:37<00:00,  6.69it/s]


Epoch 4/5
Train Loss: 0.3849, Train Accuracy: 0.8295
Test Loss: 0.3838, Test Accuracy: 0.8417


100%|██████████| 250/250 [00:37<00:00,  6.75it/s]


Epoch 5/5
Train Loss: 0.3348, Train Accuracy: 0.8540
Test Loss: 0.4802, Test Accuracy: 0.8177


## Q2. (50 marks)
### Implement the idea in paper ***A Neural Probabilistic Language Model*** (https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf) to train a trigram model. We will use the brown corpus in nltk package as the dataset. Train the model for 5 epoches and print out the training loss, training accuracy, testing loss, and testing accuracy. You can use these codes to download the corpus:

```
import nltk
nltk.download("brown")
from nltk.corpus import brown
```

In [21]:
# Download the Brown corpus using NLTK
import nltk
nltk.download("brown")
from nltk.corpus import brown

# Prepare the corpus with all words
brown_words = brown.words()
print(f"Total number of words in Brown corpus: {len(brown_words)}")
print(f"First 10 words in Brown corpus: {brown_words[:10]}")

[nltk_data] Downloading package brown to
[nltk_data]     /Users/michaelzhu/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Total number of words in Brown corpus: 1161192
First 10 words in Brown corpus: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']


In [22]:
from collections import Counter

# Create term frequency of the words
term_freq = Counter(brown_words)
print(f"Total unique words in Brown corpus: {len(term_freq)}")
print(f"Most common words: {term_freq.most_common(10)}")

# Build the vocabulary
vocabulary = {word: idx for idx, (word, _) in enumerate(term_freq.items())}
print(f"Vocabulary size: {len(vocabulary)}")

Total unique words in Brown corpus: 56057
Most common words: [('the', 62713), (',', 58334), ('.', 49346), ('of', 36080), ('and', 27915), ('to', 25732), ('a', 21881), ('in', 19536), ('that', 10237), ('is', 10011)]
Vocabulary size: 56057


In [23]:
# Create Training and Development Sets

from sklearn.model_selection import train_test_split

# Convert words to indices based on the vocabulary
word_indices = [vocabulary[word] for word in brown_words]

# Create trigrams
trigrams = [(word_indices[i], word_indices[i+1], word_indices[i+2]) for i in range(len(word_indices) - 2)]

# Split the data into training and development sets (80% training, 20% development)
train_data, dev_data = train_test_split(trigrams, test_size=0.2, random_state=42)

print(f"Number of training samples: {len(train_data)}")
print(f"Number of development samples: {len(dev_data)}")

Number of training samples: 928952
Number of development samples: 232238


In [24]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Trigram Neural Network Model
class TrigramNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(TrigramNNModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = torch.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = torch.log_softmax(out, dim=1)
        return log_probs

# Set parameters
vocab_size = len(vocabulary)
embedding_dim = 100
context_size = 2  # Trigram context size is 2

# Initialize the model, loss function, and optimizer
model = TrigramNNModel(vocab_size, embedding_dim, context_size)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

print(model)

TrigramNNModel(
  (embeddings): Embedding(56057, 100)
  (linear1): Linear(in_features=200, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=56057, bias=True)
)


In [None]:
# Define Negative Log-Likelihood Loss

# Implement the negative log-likelihood loss function
def negative_log_likelihood_loss(output, target):
    return loss_function(output, target)

# test
output = torch.tensor([[0.1, 0.2, 0.7]], requires_grad=True)  # Example output
target = torch.tensor([2])  # Example target

loss = negative_log_likelihood_loss(output, target)1
print(f"Loss: {loss.item()}")

Loss: -0.699999988079071


# Train and Save Model
Train the model for 5 epochs and save the trained model.

In [27]:
# Train and Save Model
from tqdm import tqdm
# Train the model for 5 epochs
num_epochs = 1

print(device)

for epoch in range(num_epochs):
    total_loss = 0
    correct_predictions = 0
    for context1, context2, target in tqdm(train_data):
        context_tensor = torch.tensor([context1, context2], dtype=torch.long)
        target_tensor = torch.tensor([target], dtype=torch.long)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        log_probs = model(context_tensor)

        # Calculate the loss
        loss = negative_log_likelihood_loss(log_probs, target_tensor)

        # Backward pass and update weights
        loss.backward()
        optimizer.step()

        # Accumulate the loss
        total_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(log_probs, 1)
        correct_predictions += (predicted == target_tensor).sum().item()

    # Calculate average loss and accuracy
    avg_loss = total_loss / len(train_data)
    accuracy = correct_predictions / len(train_data)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

# Save the trained model
torch.save(model.state_dict(), "trigram_nn_model.pth")

# Evaluate on the development set
model.eval()
dev_loss = 0
correct_predictions = 0

with torch.no_grad():
    for context1, context2, target in dev_data:
        context_tensor = torch.tensor([context1, context2], dtype=torch.long)
        target_tensor = torch.tensor([target], dtype=torch.long)

        # Forward pass
        log_probs = model(context_tensor)

        # Calculate the loss
        loss = negative_log_likelihood_loss(log_probs, target_tensor)
        dev_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(log_probs, 1)
        correct_predictions += (predicted == target_tensor).sum().item()

# Calculate average loss and accuracy for the development set
avg_dev_loss = dev_loss / len(dev_data)
dev_accuracy = correct_predictions / len(dev_data)

print(f"Development Loss: {avg_dev_loss:.4f}, Development Accuracy: {dev_accuracy:.4f}")

cpu


100%|██████████| 928952/928952 [2:19:16<00:00, 111.17it/s]  


Epoch 1/1, Loss: 6.7277, Accuracy: 0.1097
Development Loss: 6.5272, Development Accuracy: 0.1188


The training time is too long for this task, so choose to train the model for fewer epochs (e.g., 1 epoch) to save time.

In [30]:
# Evaluate Model

# Evaluate on the training set
model.eval()
train_loss = 0
train_correct_predictions = 0

with torch.no_grad():
    for context1, context2, target in tqdm(train_data):
        context_tensor = torch.tensor([context1, context2], dtype=torch.long)
        target_tensor = torch.tensor([target], dtype=torch.long)

        # Forward pass
        log_probs = model(context_tensor)

        # Calculate the loss
        loss = negative_log_likelihood_loss(log_probs, target_tensor)
        train_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(log_probs, 1)
        train_correct_predictions += (predicted == target_tensor).sum().item()

# Calculate average loss and accuracy for the training set
avg_train_loss = train_loss / len(train_data)
train_accuracy = train_correct_predictions / len(train_data)

print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

# Evaluate on the development set
dev_loss = 0
dev_correct_predictions = 0

with torch.no_grad():
    for context1, context2, target in dev_data:
        context_tensor = torch.tensor([context1, context2], dtype=torch.long)
        target_tensor = torch.tensor([target], dtype=torch.long)

        # Forward pass
        log_probs = model(context_tensor)

        # Calculate the loss
        loss = negative_log_likelihood_loss(log_probs, target_tensor)
        dev_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(log_probs, 1)
        dev_correct_predictions += (predicted == target_tensor).sum().item()

# Calculate average loss and accuracy for the development set
avg_dev_loss = dev_loss / len(dev_data)
dev_accuracy = dev_correct_predictions / len(dev_data)

print(f"Development Loss: {avg_dev_loss:.4f}, Development Accuracy: {dev_accuracy:.4f}")

100%|██████████| 928952/928952 [14:32<00:00, 1064.70it/s]


Training Loss: 6.3482, Training Accuracy: 0.1207
Development Loss: 6.5272, Development Accuracy: 0.1188


## Q3. (10 marks)

### Call the chatglm-4 API, and write a proper prompt using prompt engineering knowledge to let chatglm perform the task correctly:

``Take the last letters of the words and concatenate them.``


In [None]:
import os  
from openai import AzureOpenAI  

client = AzureOpenAI(  
    api_key="XXX",
    api_version="2024-02-01",  
    azure_endpoint="https://XXX.openai.azure.com/"  
)  

deployment_name = "gpt-35-turbo"  

# Define the words list
words_list = ['Linius Victor', 'strawberry cake', 'Nice headshot', 'Cristiano Ronaldo', 'Brawl Star', 'Natural Language Processing']

sentence = "Take the last letters of the words and concatenate them: " + ", ".join(words_list)

chat_prompt = [
{
    "role": "system",
    "content": "you are a helpful assistant"
},
{
    "role": "user",
    "content": sentence
}
]  

# Include speech result if speech is enabled  
speech_result = chat_prompt  

# Generate the completion  
completion = client.chat.completions.create(  
    model=deployment_name,  
    messages=speech_result,  
    max_tokens=800,  
    temperature=0.2,  
    top_p=0.95,  
    frequency_penalty=0,  
    presence_penalty=0,  
    stop=None,  
    stream=False  
)  
    
print(completion.to_json())  

{
  "id": "chatcmpl-AZBU9o2PPyP8POkghDwEUkZIQETVD",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "sr, yek, ot, odo, ar, gnp",
        "role": "assistant"
      },
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "protected_material_code": {
          "filtered": false,
          "detected": false
        },
        "protected_material_text": {
          "filtered": false,
          "detected": false
        },
        "self_harm": {
          "filtered": false,
          "severity": "safe"
        },
        "sexual": {
          "filtered": false,
          "severity": "safe"
        },
        "violence": {
          "filtered": false,
          "severity": "safe"
        }
      }
    }
  ],
  "created": 1732948993,
  "model": "gpt-35-turbo",
  "object": "chat.completion",
  "system_fingerprint": null,
  "usage": {
    "completion_tokens": 14