<a href="https://colab.research.google.com/github/SirvavialTAG/NLP/blob/main/NLP_lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [2]:
# Toy grammaticality dataset: a list of (sentence, label) pairs.
# Judging by the examples, label 1 marks a grammatical sentence and label 0
# an ungrammatical one (agreement errors, wrong verb forms, scrambled word
# order, missing articles, ...).
# NOTE(review): "You should wear a helmet when riding a bike." appears twice
# with label 1, so that sample is effectively double-weighted.
grammar_dataset = [
    ("The cat sleeps on the sofa.", 1),
    ("She went to the library yesterday.", 1),
    ("Me enjoy learning about natural language processing.", 0),
    ("They are playing football in the garden.", 1),
    ("He speaks English fluently.", 1),
    ("We will visit the museum next week.", 1),
    ("My computer needs an update.", 1),
    ("Cat the sleeps sofa the on.", 0),
    ("Birds fly high in the sky.", 1),
    ("She goed to the library yesterday.", 0),
    ("They is playing football in the garden.", 0),
    ("He speak English fluently.", 0),
    ("Apples are good for your health.", 1),
    ("This are a very interesting book.", 0),
    ("We visits the museum next week.", 0),
    ("This is a very interesting book.", 1),
    ("Birds flys high in the sky.", 0),
    ("I enjoy learning about natural language processing.", 1),
    ("My computer need an update.", 0),
    ("Apples is good for your health.", 0),
    ("The children are playing happily in the park.", 1),
    ("We visited the museum and saw many artifacts.", 1),
    ("Thinks careful before you make a decision.", 0),
    ("Can you please help me with this heavy box?", 1),
    ("He diligently completed all his assignments on time.", 1),
    ("The old bridge look sturdy.", 0),
    ("They live in a beautiful apartment overlooking the city.", 1),
    ("Learn effectively require consistent practice.", 0),
    ("A quick brown fox jumps over the lazy dog.", 1),
    ("Did she remember to lock the door?", 1),
    ("You should wear a helmet when riding a bike.", 1),
    ("Water boils at 100 degrees Celsius.", 1),
    ("I would appreciate your feedback on my presentation.", 1),
    ("Swimming is an excellent form of exercise.", 1),
    ("The old bridge looks sturdy.", 1),
    ("We visit the museum and seen many artifacts.", 0),
    ("This soup smells wonderful.", 1),
    ("Think carefully before you make a decision.", 1),
    ("The childrens is playing happy in the park.", 0),
    ("Read improve your understanding of world.", 0),
    ("Can you helping me with this heavy box?", 0),
    ("They have known each other for many years.", 1),
    ("The weather report say it might rain later today.", 0),
    ("He diligent completed all his assignments on time.", 0),
    ("My favorite season autumn, but my brother love summer.", 0),
    ("They live beautiful apartment overlooking city.", 0),
    ("You should wear a helmet when riding a bike.", 1),
    ("Lazy dog the over jumps fox brown quick a.", 0),
    ("The weather report says it might rain later today.", 1),
    ("She did remember to locking the door?", 0),
    ("You should to wear a helmet when riding a bike.", 0),
    ("To learn effectively requires consistent practice.", 1),
    ("Water boil at 100 degrees Celsius.", 0),
    ("I would appreciates your feedback on my presentation.", 0),
    ("Will they arrive before dinner?", 1),
    ("Swimming are an excellent form of exercise.", 0),
    ("They has know each other for many years.", 0),
    ("Reading improves your understanding of the world.", 1),
    ("My favorite season is autumn, but my brother loves summer.", 1),
    ("This soup smell wonderful.", 0),
    ("Will they arrives before dinner?", 0),
]

In [3]:
# Hyperparameters
EMBEDDING_DIM = 16  # width of token embeddings; passed to the model as dim_model
NUM_HEADS = 2  # number of attention heads (must divide EMBEDDING_DIM)
FFN_DIM = 64  # hidden width of the feed-forward sub-layer
MAX_SEQ_LEN = 10  # sentences are truncated / zero-padded to this many tokens
# Original note: "will be overridden after the vocabulary is built".
# NOTE(review): in the visible cells this constant is never reassigned or read;
# the model is constructed with len(vocabulary) instead.
VOCAB_SIZE = 250  # будет переопределено после построения словаря
LEARNING_RATE = 0.01  # step size of the plain gradient-descent update in train()
EPOCHS = 20  # number of passes over the dataset
BATCH_SIZE = 4  # mini-batch size used by train()
# Seed NumPy's global RNG, which drives weight initialisation and batch shuffling.
np.random.seed(42)

In [4]:
# Build the word vocabulary from the training dataset
def build_vocabulary(dataset):
    """Map every distinct word found in ``dataset`` to an integer id.

    Each sentence is lower-cased and split on whitespace, and trailing
    punctuation (``.,!?;:``) is stripped from every word. Ids 0 and 1 are
    reserved for ``<PAD>`` and ``<UNK>``; all other ids are handed out in
    order of first appearance.

    Args:
        dataset: iterable of ``(sentence, label)`` pairs; labels are ignored.

    Returns:
        dict mapping token string to integer id.
    """
    vocabulary = {"<PAD>": 0, "<UNK>": 1}
    for sentence, _label in dataset:
        for raw_word in sentence.lower().split():
            # Ids are dense, so the next free id is always the current size.
            vocabulary.setdefault(raw_word.rstrip(".,!?;:"), len(vocabulary))
    return vocabulary

In [5]:
def integer_tokenize(sentence, vocabulary):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is lower-cased, split on whitespace, and trailing
    punctuation (``.,!?;:``) is removed from each word. Words missing from
    ``vocabulary`` are replaced by the id stored under ``"<UNK>"``.

    Args:
        sentence: input text.
        vocabulary: dict mapping token string to integer id.

    Returns:
        list of integer ids, one per whitespace-separated word.
    """
    cleaned_words = (piece.rstrip(".,!?;:") for piece in sentence.lower().split())
    return [
        vocabulary[word] if word in vocabulary else vocabulary["<UNK>"]
        for word in cleaned_words
    ]

In [6]:
# Bring token sequences to a fixed length
def pad_sequence(seq, max_len):
    """Truncate ``seq`` to ``max_len`` items, then right-pad it with zeros.

    Args:
        seq: list of token ids.
        max_len: target length.

    Returns:
        A new list of exactly ``max_len`` items (for non-negative ``max_len``).
    """
    clipped = seq[:max_len]
    missing = max_len - len(clipped)
    return clipped + [0] * missing

In [7]:
def prepare_data(dataset, vocabulary, max_len=MAX_SEQ_LEN):
    """Turn ``(sentence, label)`` pairs into model-ready NumPy arrays.

    Every sentence is tokenized with ``integer_tokenize`` and brought to
    ``max_len`` tokens with ``pad_sequence``.

    Args:
        dataset: iterable of ``(sentence, label)`` pairs.
        vocabulary: dict mapping token string to integer id.
        max_len: fixed sequence length for every row.

    Returns:
        Tuple ``(X, y)``: ``X`` is the array of padded token-id rows and
        ``y`` is the label array reshaped into a single column.
    """
    encoded_pairs = [
        (pad_sequence(integer_tokenize(sentence, vocabulary), max_len), label)
        for sentence, label in dataset
    ]
    token_rows = [row for row, _ in encoded_pairs]
    labels = [label for _, label in encoded_pairs]
    features = np.array(token_rows)
    targets = np.array(labels).reshape(-1, 1)
    return features, targets

In [8]:
# Split the dataset into small portions (mini-batches)
def batch_iter(X, y, batch_size):
    """Lazily yield ``(X_batch, y_batch)`` pairs in a random order.

    A fresh permutation of the sample indices is drawn from NumPy's global
    RNG when iteration starts; the final batch may be smaller than
    ``batch_size``.

    Args:
        X: array of inputs, indexable by an integer index array.
        y: array of targets aligned with ``X``.
        batch_size: maximum number of samples per batch.
    """
    n_samples = len(X)
    order = np.arange(n_samples)
    np.random.shuffle(order)
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        chosen = order[start:stop]
        yield X[chosen], y[chosen]

In [9]:
# Positional encoding
def positional_encoding(seq_len, dim_model):
    """Build a sinusoidal positional-encoding table.

    Channel ``i`` at position ``p`` uses the angle
    ``p / 10000 ** (2 * (i // 2) / dim_model)``; even channels hold its sine
    and odd channels its cosine.

    Args:
        seq_len: number of positions (rows).
        dim_model: number of channels (columns).

    Returns:
        Array of shape ``(seq_len, dim_model)``.
    """
    positions = np.arange(seq_len)[:, None]
    channels = np.arange(dim_model)[None, :]
    exponents = (2 * (channels // 2)) / np.float32(dim_model)
    inverse_freq = 1 / np.power(10000, exponents)
    angles = positions * inverse_freq

    # Every column is written exactly once below, so no zero-fill is needed.
    encoding = np.empty((seq_len, dim_model))
    even, odd = slice(0, None, 2), slice(1, None, 2)
    encoding[:, even] = np.sin(angles[:, even])
    encoding[:, odd] = np.cos(angles[:, odd])
    return encoding

In [10]:
# Normalization
def layer_norm(X, epsilon=1e-6):
    """Standardize ``X`` along its last axis (zero mean, unit variance).

    There is no learnable gain or bias; ``epsilon`` is added to the variance
    before the square root to avoid division by zero.

    Args:
        X: input array.
        epsilon: small stabilising constant.

    Returns:
        Array of the same shape as ``X``.
    """
    feature_mean = np.mean(X, axis=-1, keepdims=True)
    feature_var = np.var(X, axis=-1, keepdims=True)
    centered = X - feature_mean
    scale = np.sqrt(feature_var + epsilon)
    return centered / scale

In [11]:
# Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute ``softmax(Q K^T / sqrt(d_k)) V`` for 4-D inputs.

    The last two axes of ``K`` are swapped via ``transpose(0, 1, 3, 2)``, so
    the inputs must be 4-dimensional.

    Args:
        Q: queries; the size of the last axis is used as ``d_k``.
        K: keys.
        V: values.
        mask: optional array broadcastable to the score matrix; it is
            multiplied by ``-1e9`` and added to the scores, so entries equal
            to 1 are effectively excluded from attention.

    Returns:
        Tuple ``(output, attention_weights)``.
    """
    key_dim = Q.shape[-1]
    keys_transposed = K.transpose(0, 1, 3, 2)
    scores = np.matmul(Q, keys_transposed) / np.sqrt(key_dim)

    if mask is not None:
        scores += mask * -1e9

    attention_weights = softmax(scores, axis=-1)
    return np.matmul(attention_weights, V), attention_weights

In [12]:
# Softmax: turn scores into a probability distribution
def softmax(X, axis=-1):
    """Numerically stable softmax along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiation so that
    large scores do not overflow.

    Args:
        X: array of scores.
        axis: axis along which the outputs sum to 1 (default: last axis).

    Returns:
        Array of the same shape as ``X`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    # Honour the requested axis; it was previously ignored in favour of -1.
    shifted = X - np.max(X, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [13]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Only ``exp`` of a non-positive number is ever evaluated, so large
    negative inputs do not trigger an overflow warning.

    Args:
        x: scalar or array-like of logits.

    Returns:
        Sigmoid of ``x``: a NumPy scalar for scalar input, otherwise an
        array of the same shape.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same function.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps 0-d results to a scalar and is a no-op view
    # for real arrays, matching the scalar-in / scalar-out behaviour.
    return result[()]

In [14]:
# Multi-head attention
class MultiHeadAttention:
    """Multi-head self-attention for 3-D input ``(batch, sequence, dim_model)``.

    Four square projection matrices (query, key, value, output) are drawn
    from ``np.random.rand`` and divided by ``sqrt(dim_model)``.
    """

    def __init__(self, dim_model, num_heads):
        """Create the projections; ``dim_model`` must be divisible by ``num_heads``."""
        assert dim_model % num_heads == 0
        self.num_heads = num_heads
        self.dim_model = dim_model
        self.d_k = dim_model // num_heads

        scale = np.sqrt(dim_model)
        # The draw order (query, key, value, output) fixes the values under a given seed.
        for weight_name in ("W_query", "W_key", "W_value", "W_output"):
            setattr(self, weight_name, np.random.rand(dim_model, dim_model) / scale)

    def split_heads(self, x):
        """Reshape ``(batch, seq, dim_model)`` into ``(batch, heads, seq, d_k)``."""
        batch, sequence, _ = x.shape
        per_head = x.reshape(batch, sequence, self.num_heads, self.d_k)
        return np.swapaxes(per_head, 1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: ``(batch, heads, seq, d_k)`` to ``(batch, seq, heads * d_k)``."""
        batch, num_heads, sequence, d_k = x.shape
        return np.swapaxes(x, 1, 2).reshape(batch, sequence, num_heads * d_k)

    def __call__(self, x):
        """Apply unmasked self-attention to ``x`` and project the result."""
        queries, keys, values = (
            self.split_heads(np.matmul(x, weight))
            for weight in (self.W_query, self.W_key, self.W_value)
        )
        context, _ = scaled_dot_product_attention(queries, keys, values)
        merged = self.combine_heads(context)
        return np.matmul(merged, self.W_output)

In [15]:
# Feed Forward Network
class FeedForward:
    """Position-wise two-layer perceptron with a ReLU in between.

    Weights are drawn from ``np.random.rand`` and divided by the square root
    of their input width; biases start at zero.
    """

    def __init__(self, dim_model, ffn_dim):
        """Allocate ``dim_model -> ffn_dim -> dim_model`` weights and biases."""
        # Both random draws happen first, in W1-then-W2 order, to keep the RNG stream stable.
        first_layer = np.random.rand(dim_model, ffn_dim)
        second_layer = np.random.rand(ffn_dim, dim_model)
        self.W1 = first_layer / np.sqrt(dim_model)
        self.W2 = second_layer / np.sqrt(ffn_dim)
        self.b1 = np.zeros((1, ffn_dim))
        self.b2 = np.zeros((1, dim_model))

    def __call__(self, x):
        """Return ``relu(x @ W1 + b1) @ W2 + b2``."""
        hidden = np.maximum(0, np.matmul(x, self.W1) + self.b1)
        return np.matmul(hidden, self.W2) + self.b2

In [16]:
# Encoder
class Encoder:
    """Single encoder block: self-attention and a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by
    ``layer_norm``.
    """

    def __init__(self, d_model, num_heads, ffn_dim):
        """Build the attention and feed-forward sub-layers."""
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, ffn_dim)

    def __call__(self, x):
        """Run ``x`` through attention, then the feed-forward network."""
        attended = layer_norm(x + self.mha(x))
        return layer_norm(attended + self.ffn(attended))

In [17]:
# Classifier
class Classifier:
    """Binary classification head: mean-pool, linear layer, sigmoid."""

    def __init__(self, d_model):
        """Create a ``(d_model, 1)`` weight column and a zero bias."""
        self.W = np.random.rand(d_model, 1) / np.sqrt(d_model)
        self.b = np.zeros((1, 1))

    def __call__(self, x):
        """Average ``x`` over axis 1 and return the sigmoid of the linear score."""
        sentence_vector = np.mean(x, axis=1)
        return sigmoid(np.matmul(sentence_vector, self.W) + self.b)

In [18]:
# GPT model
class GPT:
    """Token embedding + positional encoding + one Encoder block + Classifier head.

    ``forward`` returns the classifier output for a batch of token-id
    sequences.
    """

    def __init__(self, vocabulary_size, dim_model, num_heads, ffn_dim, max_seq_len):
        """Initialise the embedding table, positional table and sub-modules.

        Args:
            vocabulary_size: number of rows in the embedding table.
            dim_model: embedding / model width.
            num_heads: number of attention heads.
            ffn_dim: hidden width of the feed-forward sub-layer.
            max_seq_len: number of positions in the positional-encoding table.
        """
        self.vocabulary_size = vocabulary_size
        self.dim_model = dim_model
        self.embedding = np.random.rand(vocabulary_size, dim_model) / np.sqrt(vocabulary_size)
        self.pos_encoding = positional_encoding(max_seq_len, dim_model)
        self.encoder = Encoder(dim_model, num_heads, ffn_dim)
        self.classifier = Classifier(dim_model)

    def forward(self, x):
        """Run the model on integer token ids.

        Args:
            x: integer array of token ids, normally ``(batch, seq_len)`` with
                ``seq_len <= max_seq_len``.

        Returns:
            Whatever the classifier head returns for the encoded batch.
        """
        tokens = np.asarray(x)
        # Use only as many positional rows as the input has tokens, so inputs
        # shorter than max_seq_len broadcast correctly instead of failing.
        seq_len = tokens.shape[-1] if tokens.ndim else None
        hidden = self.embedding[tokens] + self.pos_encoding[None, :seq_len, :]
        hidden = self.encoder(hidden)
        return self.classifier(hidden)

    def parameters(self):
        """Return the live trainable arrays (not copies), in a fixed order.

        Callers update the model by mutating these arrays in place.
        """
        params = [self.embedding, self.encoder.mha.W_query, self.encoder.mha.W_key, self.encoder.mha.W_value, self.encoder.mha.W_output,
                  self.encoder.ffn.W1, self.encoder.ffn.b1, self.encoder.ffn.W2, self.encoder.ffn.b2,
                  self.classifier.W, self.classifier.b]
        return params

In [19]:
def binary_cross_entropy(y_pred, y_true):
    """Mean binary cross-entropy between predictions and 0/1 targets.

    Predictions are clipped to ``[1e-7, 1 - 1e-7]`` so the logarithms stay
    finite.

    Args:
        y_pred: predicted probabilities.
        y_true: target labels, broadcastable against ``y_pred``.

    Returns:
        Scalar mean loss.
    """
    tiny = 1e-7
    probs = np.clip(y_pred, tiny, 1 - tiny)
    log_likelihood = y_true * np.log(probs) + (1 - y_true) * np.log(1 - probs)
    return -np.mean(log_likelihood)

In [20]:
def train(model, X, y, epochs=EPOCHS, lr=LEARNING_RATE, batch_size=BATCH_SIZE):
    """Train ``model`` with mini-batch gradient descent and print the epoch loss.

    Gradients come from ``numerical_gradients`` (central finite differences).

    Args:
        model: object exposing ``forward(X)`` and ``parameters()``, where
            ``parameters()`` returns the live NumPy arrays to update.
        X: array of input token ids.
        y: array of target labels aligned with ``X``.
        epochs: number of passes over the data.
        lr: learning rate of the update ``param -= lr * grad``.
        batch_size: samples per mini-batch (defaults to the module-level
            ``BATCH_SIZE``, which used to be hard-wired here).
    """
    for epoch in range(epochs):
        losses = []
        for X_batch, y_batch in batch_iter(X, y, batch_size):
            # Forward pass; the loss is recorded before this batch's update.
            y_pred = model.forward(X_batch)
            losses.append(binary_cross_entropy(y_pred, y_batch))
            # Gradients (numerical, for simplicity)
            grads = numerical_gradients(model, X_batch, y_batch)
            # Parameter update: ``-=`` mutates the arrays held by the model in place.
            for param, grad in zip(model.parameters(), grads):
                param -= lr * grad
        print(f"Epoch {epoch+1}/{epochs}, Loss: {np.mean(losses):.4f}")

In [21]:
def numerical_gradients(model, X, y, eps=1e-4):
    """Estimate the loss gradient for every parameter by central differences.

    Each scalar entry of each array from ``model.parameters()`` is nudged by
    ``+eps`` and ``-eps`` in place, the loss is re-evaluated for both, and
    the entry is restored. This costs two forward passes per scalar.

    Args:
        model: object exposing ``forward(X)`` and ``parameters()``.
        X: batch of inputs.
        y: batch of targets.
        eps: finite-difference step.

    Returns:
        List of gradient arrays, one per parameter, in ``parameters()`` order.
    """
    def current_loss():
        # Loss of the model with whatever parameter values are set right now.
        return binary_cross_entropy(model.forward(X), y)

    gradients = []
    for param in model.parameters():
        grad = np.zeros_like(param)
        walker = np.nditer(param, flags=['multi_index'], op_flags=['readwrite'])
        for _ in walker:
            idx = walker.multi_index
            saved_value = param[idx]
            param[idx] = saved_value + eps
            loss_plus = current_loss()
            param[idx] = saved_value - eps
            loss_minus = current_loss()
            grad[idx] = (loss_plus - loss_minus) / (2 * eps)
            # Restore the entry before moving on to the next one.
            param[idx] = saved_value
        gradients.append(grad)
    return gradients

In [22]:
def predict(model, sentence, vocab):
    """Return the model's output for a single sentence as a Python float.

    The sentence is tokenized with ``vocab``, padded or truncated to
    ``MAX_SEQ_LEN``, and passed through ``model.forward`` as a batch of one.

    Args:
        model: object exposing ``forward(X)``.
        sentence: input text.
        vocab: dict mapping token string to integer id.

    Returns:
        The ``[0, 0]`` entry of the model output, converted to ``float``.
    """
    padded_ids = pad_sequence(integer_tokenize(sentence, vocab), MAX_SEQ_LEN)
    single_batch = np.array([padded_ids])
    probability = model.forward(single_batch)[0, 0]
    return float(probability)

In [23]:
# Usage example: build the vocabulary from the dataset, encode the sentences,
# create the model sized to that vocabulary, and train it.
# NOTE: training uses numerical gradients (two forward passes per scalar
# parameter per batch), so this cell is slow to run.
vocabulary = build_vocabulary(grammar_dataset)
X, y = prepare_data(grammar_dataset, vocabulary)
modelGPT = GPT(len(vocabulary), EMBEDDING_DIM, NUM_HEADS, FFN_DIM, MAX_SEQ_LEN)
train(modelGPT, X, y, epochs=EPOCHS, lr=LEARNING_RATE)

Epoch 1/20, Loss: 0.6998
Epoch 2/20, Loss: 0.6997
Epoch 3/20, Loss: 0.7042
Epoch 4/20, Loss: 0.6983
Epoch 5/20, Loss: 0.6992
Epoch 6/20, Loss: 0.6977
Epoch 7/20, Loss: 0.6975
Epoch 8/20, Loss: 0.6963
Epoch 9/20, Loss: 0.7007
Epoch 10/20, Loss: 0.6991
Epoch 11/20, Loss: 0.6984
Epoch 12/20, Loss: 0.6979
Epoch 13/20, Loss: 0.6988
Epoch 14/20, Loss: 0.6967
Epoch 15/20, Loss: 0.6972
Epoch 16/20, Loss: 0.6981
Epoch 17/20, Loss: 0.6963
Epoch 18/20, Loss: 0.6942
Epoch 19/20, Loss: 0.6965
Epoch 20/20, Loss: 0.7004


In [24]:
# Score two sentences with the trained model (predict returns the model's sigmoid output).
# NOTE(review): "childen" is not in the training vocabulary, so it is encoded
# as <UNK>; confirm whether the misspelling of "children" is intentional.
print("The childen sleep on the sofa:", predict(modelGPT, "The childen sleep on the sofa.", vocabulary))
print("This soup smell wonderful:", predict(modelGPT, "This soup smell wonderful.", vocabulary))

The childen sleep on the sofa: 0.49832744895846887
This soup smell wonderful: 0.4979063430868901
