In [1]:
# This notebook was run in the Google Colab environment.
%pip install catboost fasttext -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


# Домашнее задание "NLP. Часть 1"

In [2]:
import math
import re
import os
import random
import json
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Any

import torch
import numpy as np
import datasets
import fasttext
import fasttext.util
from transformers import BertTokenizer, BertModel

In [3]:
def seed_everything(seed: int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: The integer seed shared by all generators.
    """
    random.seed(seed)
    # Hash randomization of the running interpreter is fixed at startup, so
    # this only takes effect in processes spawned after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it has
    # to be switched off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [4]:
def normalize_pretokenize_text(text: str) -> List[str]:
    """Lower-case ``text`` and split it into word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); everything else acts as a separator and is dropped.

    Args:
        text: Raw input string.

    Returns:
        The tokens in their order of appearance.
    """
    return re.findall(r'\b\w+\b', text.lower())

In [5]:
# This block is for tests only
# Tiny three-document corpus used by the vectorizer self-checks below.
test_corpus = [
    "the quick brown fox jumps over the lazy dog",
    "never jump over the lazy dog quickly",
    "brown foxes are quick and dogs are lazy"
]

def build_vocab(texts: List[str]) -> Tuple[List[str], Dict[str, int]]:
    """Build a sorted vocabulary and its word -> index mapping.

    Args:
        texts: Raw documents; each one is tokenized with
            ``normalize_pretokenize_text``.

    Returns:
        A pair ``(vocab, vocab_index)``: the alphabetically sorted list of
        distinct tokens, and a dict mapping each token to its position in
        that list.
    """
    unique_tokens = set()
    for document in texts:
        unique_tokens.update(normalize_pretokenize_text(document))
    vocab = sorted(unique_tokens)
    vocab_index = dict(zip(vocab, range(len(vocab))))
    return vocab, vocab_index

# Vocabulary and word -> index mapping of the test corpus, shared by the checks below.
vocab, vocab_index = build_vocab(test_corpus)

## Задание 1 (0.5 балла)
Реализовать One-Hot векторизацию текстов

In [6]:
def one_hot_vectorization(
    text: str,
    vocab: List[str] = None,
    vocab_index: Dict[str, int] = None
) -> List[List[int]]:
    """Encode every token of ``text`` as a one-hot vector over the vocabulary.

    Args:
        text: Raw input string; tokenized with ``normalize_pretokenize_text``.
        vocab: Vocabulary words. May be omitted when ``vocab_index`` is given.
        vocab_index: Mapping word -> position in the vector. May be omitted
            when ``vocab`` is given; it is then built by enumerating ``vocab``.

    Returns:
        One vector per token, in token order. A token that is missing from
        ``vocab_index`` gets an all-zero vector.

    Raises:
        ValueError: If neither ``vocab`` nor ``vocab_index`` is provided.
    """
    if vocab is None and vocab_index is None:
        raise ValueError("one_hot_vectorization requires `vocab` and/or `vocab_index`")
    if vocab_index is None:
        vocab_index = {word: idx for idx, word in enumerate(vocab)}
    if vocab is not None:
        size = len(vocab)
    else:
        # Without a vocabulary list, the vector must be long enough to hold
        # the largest index present in the mapping.
        size = max(vocab_index.values(), default=-1) + 1

    result = []
    for word in normalize_pretokenize_text(text):
        vector = [0] * size
        idx = vocab_index.get(word)
        if idx is not None:
            vector[idx] = 1
        result.append(vector)
    return result

def test_one_hot_vectorization(
    vocab: List[str],
    vocab_index: Dict[str, int]
) -> bool:
    """Smoke-test ``one_hot_vectorization`` on a fixed four-word sentence.

    Checks that the result is a list, that its first vector has one slot per
    vocabulary word, and that every known token has a 1 at its own index.

    Returns:
        True when all checks pass; False otherwise, including when any
        exception is raised (its message is printed).
    """
    try:
        text = "the quick brown fox"
        result = one_hot_vectorization(text, vocab, vocab_index)

        if not isinstance(result, list) or len(result[0]) != len(vocab):
            return False

        tokens = normalize_pretokenize_text(text)
        hot_bits_set = all(
            result[position][vocab_index[token]] == 1
            for position, token in enumerate(tokens)
            if token in vocab_index
        )
        if not hot_bits_set:
            return False

        print("One-Hot-Vectors test PASSED")

        return True
    except Exception as e:
        print(f"One-Hot-Vectors test FAILED: {e}")
        return False

In [7]:
# Fail the cell if the one-hot self-check does not pass.
assert test_one_hot_vectorization(vocab, vocab_index)

One-Hot-Vectors test PASSED


## Задание 2 (0.5 балла)
Реализовать Bag-of-Words

In [8]:
def bag_of_words_vectorization(text: str) -> Dict[str, int]:
    """Count how many times each token occurs in ``text``.

    Args:
        text: Raw input string; tokenized with ``normalize_pretokenize_text``.

    Returns:
        A dict mapping each distinct token to its number of occurrences.
        Tokens that do not occur are absent from the dict.
    """
    counts: Dict[str, int] = {}
    for token in normalize_pretokenize_text(text):
        counts[token] = counts.get(token, 0) + 1
    return counts

def test_bag_of_words_vectorization() -> bool:
    """Smoke-test ``bag_of_words_vectorization`` on a sentence with repeats.

    Returns:
        True when the result is a dict with the expected counts; False
        otherwise, including when any exception is raised (its message is
        printed).
    """
    try:
        text = "the the quick brown brown brown"
        result = bag_of_words_vectorization(text)

        if not isinstance(result, dict):
            return False

        # 'nonexistent' checks that absent words are reported as zero.
        expected_counts = {'the': 2, 'quick': 1, 'brown': 3, 'nonexistent': 0}
        for word, count in expected_counts.items():
            if result.get(word, 0) != count:
                return False

        print("Bag-of-Words test PASSED")
        return True
    except Exception as e:
        print(f"Bag-of-Words test FAILED: {e}")
        return False

In [9]:
# Fail the cell if the bag-of-words self-check does not pass.
assert test_bag_of_words_vectorization()

Bad-of-Words test PASSED


## Задание 3 (0.5 балла)
Реализовать TF-IDF

In [10]:
def tf_idf_vectorization(
    text: str,
    corpus: List[str] = None,
    vocab: List[str] = None,
    vocab_index: Dict[str, int] = None
) -> List[float]:
    """Compute the TF-IDF vector of ``text`` over ``vocab``.

    TF is the token count divided by the number of tokens in ``text`` (0.0 for
    an empty text). IDF is the smoothed form ``log((N + 1) / (df + 1)) + 1``,
    where ``N`` is the number of documents in ``corpus`` and ``df`` is the
    number of documents containing the word.

    Args:
        text: Raw input string to vectorize.
        corpus: Raw documents used to compute document frequencies.
        vocab: Words defining the order and length of the output vector.
        vocab_index: Accepted for signature parity with the other
            vectorizers; not used here.

    Returns:
        One float per word of ``vocab``, in ``vocab`` order.
    """
    tokens = normalize_pretokenize_text(text)
    n_tokens = len(tokens)
    n_docs = len(corpus)

    term_counts = Counter(tokens)

    # Each document contributes at most once per word, hence the set().
    doc_freq = Counter()
    for document in corpus:
        doc_freq.update(set(normalize_pretokenize_text(document)))

    weights = []
    for term in vocab:
        if n_tokens > 0:
            tf = term_counts.get(term, 0) / n_tokens
        else:
            tf = 0.0
        idf = math.log((n_docs + 1) / (doc_freq.get(term, 0) + 1)) + 1
        weights.append(float(tf * idf))
    return weights

def test_tf_idf_vectorization(corpus, vocab, vocab_index) -> bool:
    """Smoke-test the shape and element type of ``tf_idf_vectorization``.

    Returns:
        True when the result is a list of ``len(vocab)`` floats; False
        otherwise, including when any exception is raised (its message is
        printed).
    """
    try:
        text = "the quick brown"
        result = tf_idf_vectorization(text, corpus, vocab, vocab_index)

        well_formed = (
            isinstance(result, list)
            and len(result) == len(vocab)
            and all(isinstance(val, float) for val in result)
        )
        if not well_formed:
            return False

        print("TF-IDF test PASSED")
        return True
    except Exception as e:
        print(f"TF-IDF test FAILED: {e}")
        return False

In [11]:
# Fail the cell if the TF-IDF self-check does not pass.
assert test_tf_idf_vectorization(test_corpus, vocab, vocab_index)

TF-IDF test PASSED


## Задание 4 (1 балл)
Реализовать Positive Pointwise Mutual Information (PPMI).  
https://en.wikipedia.org/wiki/Pointwise_mutual_information
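$$PPMI(word, context) = \max(0, PMI(word, context))$$
$$PMI(word, context) = \log \frac{P(word, context)}{P(word) P(context)} = \log \frac{N(word, context) \cdot |(word, context)|}{N(word) N(context)}$$
где $N(word, context)$ -- число вхождений слова $word$ в окно $context$ (размер окна -- гиперпараметр), $N(word)$ и $N(context)$ -- число пар, в которых участвуют соответственно $word$ и $context$, а $|(word, context)|$ -- общее число пар (слово, контекст) в корпусе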

In [12]:
def compute_ppmi_matrices(corpus, vocab, window_size=2):
    """Collect the co-occurrence statistics needed to compute PPMI.

    Every in-vocabulary token is paired with each in-vocabulary token that
    lies at most ``window_size`` positions to its left or right in the same
    document. All four returned statistics are counted over these
    (word, context) pairs, so that ``word_count`` and ``context_count`` are
    the marginals of ``word_context_count`` and each sums to
    ``total_windows``.

    Args:
        corpus: Raw documents; each is tokenized with
            ``normalize_pretokenize_text``.
        vocab: Words to track; tokens outside of it are ignored.
        window_size: Number of neighbours considered on each side of a word.

    Returns:
        A tuple ``(word_count, context_count, word_context_count,
        total_windows)``: pairs per word, pairs per context, pairs per
        (word, context) as a dict of dicts, and the total number of pairs.
    """
    word_count = {word: 0 for word in vocab}
    context_count = {word: 0 for word in vocab}
    # NOTE: dense table with len(vocab) ** 2 entries; memory grows
    # quadratically with the vocabulary size.
    word_context_count = {word: {ctx: 0 for ctx in vocab} for word in vocab}
    total_windows = 0

    for doc in corpus:
        doc_words = normalize_pretokenize_text(doc)
        n_words = len(doc_words)
        for i, word in enumerate(doc_words):
            # Dict lookup instead of scanning the vocab list for every token.
            if word not in word_count:
                continue
            row = word_context_count[word]
            left = max(0, i - window_size)
            right = min(n_words, i + window_size + 1)
            for j in range(left, right):
                if j == i:
                    continue
                context = doc_words[j]
                if context not in context_count:
                    continue
                # Count the word once per pair (not once per token) so that
                # P(word) uses the same denominator as P(word, context).
                word_count[word] += 1
                context_count[context] += 1
                row[context] += 1
                total_windows += 1
    return word_count, context_count, word_context_count, total_windows

def ppmi_vectorization(
    text: str,
    vocab: list,
    word_count,
    context_count,
    word_context_count,
    total_windows
) -> list:
    """Represent ``text`` as the sum of the PPMI rows of its tokens.

    For every context word ``v`` of ``vocab`` the result holds the sum, over
    all in-vocabulary tokens ``w`` of ``text`` (repeats included), of
    ``max(0, PMI(w, v))``. Pairs with a zero count contribute nothing.

    Args:
        text: Raw input string; tokenized with ``normalize_pretokenize_text``.
        vocab: Context words defining the order and length of the output.
        word_count: Mapping word -> count, as returned by
            ``compute_ppmi_matrices``.
        context_count: Mapping context -> count, from the same call.
        word_context_count: Mapping word -> {context -> count}, from the
            same call.
        total_windows: Normalizing total from the same call; a non-positive
            value is treated as 1.

    Returns:
        One float per word of ``vocab``, in ``vocab`` order.
    """
    words = normalize_pretokenize_text(text)
    N = total_windows if total_windows > 0 else 1

    # Resolve the per-token data once, rather than for every context word;
    # the set makes the vocabulary membership test O(1).
    known_words = set(vocab)
    token_rows = [
        (word_context_count[w], word_count.get(w, 0))
        for w in words
        if w in known_words
    ]

    vector = []
    for v in vocab:
        ppmi_sum = 0.0
        n_c = context_count.get(v, 0)
        if n_c != 0:
            p_c = n_c / N
            for row, n_w in token_rows:
                n_wc = row.get(v, 0)
                if n_wc == 0 or n_w == 0:
                    continue
                p_wc = n_wc / N
                p_w = n_w / N
                pmi = math.log(p_wc / (p_w * p_c) + 1e-10)
                ppmi_sum += max(0.0, pmi)
        vector.append(ppmi_sum)
    return vector

def test_ppmi_vectorization(corpus, vocab, vocab_index):
    """Smoke-test the shape and element type of ``ppmi_vectorization``.

    Returns:
        True when the result is a list of ``len(vocab)`` numbers; False
        otherwise, including when any exception is raised (its message is
        printed).
    """
    try:
        # Compute the co-occurrence statistics once.
        statistics = compute_ppmi_matrices(corpus, vocab)
        word_count, context_count, word_context_count, total_windows = statistics
        text = "the quick brown"
        result = ppmi_vectorization(
            text, vocab, word_count, context_count, word_context_count, total_windows
        )
        well_formed = (
            isinstance(result, list)
            and len(result) == len(vocab)
            and all(isinstance(val, float) or isinstance(val, int) for val in result)
        )
        if not well_formed:
            return False
        print("PPMI test PASSED")
        return True
    except Exception as e:
        print(f"PPMI test FAILED: {e}")
        return False

In [13]:
# Fail the cell if the PPMI self-check does not pass.
assert test_ppmi_vectorization(test_corpus, vocab, vocab_index)

PPMI test PASSED


## Задание 5 (1 балл)
Реализовать получение эмбеддингов из fasttext и bert (для bert лучше использовать CLS токен)

In [14]:
!pip install gensim -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from gensim.models import FastText

import datasets

# Train a small FastText model on the first 1% of the IMDB train split.
dataset = datasets.load_dataset("imdb", split="train[:1%]")  # 1% of train
texts = [item['text'] for item in dataset if 'text' in item and item['text'].strip()]
corpus = [normalize_pretokenize_text(text) for text in texts]

# A fixed seed plus a single worker thread keeps training reproducible:
# with several workers the update order depends on thread scheduling.
fasttext_model = FastText(
    sentences=corpus,
    vector_size=300,
    window=3,
    min_count=1,
    epochs=10,
    seed=42,
    workers=1,
)
print("Загрузка FastText модели окончена")

In [16]:
def get_fasttext_embeddings(text: str, model: Any = None) -> list:
    """Look up a FastText vector for every token of ``text``.

    Args:
        text: Raw input string; tokenized with ``normalize_pretokenize_text``.
        model: Object exposing ``wv`` (supporting ``in`` and indexing by word)
            and ``vector_size``. Defaults to the notebook-level
            ``fasttext_model``.

    Returns:
        One vector per token, in token order. A token for which
        ``word in model.wv`` is false gets a zero vector of length
        ``model.vector_size``.
    """
    if model is None:
        # Reading a module-level name needs no `global` declaration.
        model = fasttext_model
    embeddings = []
    for word in normalize_pretokenize_text(text):
        if word in model.wv:
            embeddings.append(model.wv[word])
        else:
            embeddings.append(np.zeros(model.vector_size))
    return embeddings

In [17]:
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so that repeated
# calls do not re-read the weights from disk for every single text.
_BERT_CACHE: Dict[str, tuple] = {}

def _load_bert(model_name: str):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it once."""
    if model_name not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        model.eval()  # inference only: make sure dropout is disabled
        _BERT_CACHE[model_name] = (tokenizer, model)
    return _BERT_CACHE[model_name]

def get_bert_embeddings(
    text: str,
    model_name: str = 'bert-base-uncased',
    pool_method: str = 'cls'
) -> np.ndarray:
    """Embed ``text`` with a BERT encoder.

    Args:
        text: Input string; truncated by the tokenizer when too long.
        model_name: Checkpoint name passed to ``from_pretrained``.
        pool_method: ``'cls'`` returns the hidden state of the first token;
            any other value returns the mean of the hidden states of all
            tokens.

    Returns:
        The pooled last-layer hidden state as a NumPy array, with the batch
        dimension squeezed out for a single input string.
    """
    tokenizer, model = _load_bert(model_name)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    if pool_method == 'cls':
        return outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()
    else:
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

## Задание 6 (1.5 балла)
Реализовать обучение так, чтобы можно было поверх эмбеддингов, реализованных в предыдущих заданиях, обучить какую-то модель (вероятно неглубокую, например, CatBoost) на задаче классификации текстов ([IMDB](https://huggingface.co/datasets/stanfordnlp/imdb)).

In [18]:
def vectorize_dataset(
    dataset_name: str = "imdb",
    vectorizer_type: str = "bow",
    split: str = "train",
    sample_size: int = 10,
    vocab: list = None,
    vocab_index: dict = None
) -> tuple:
    """Load a text-classification split and turn every text into a vector.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``.
        vectorizer_type: One of ``"one_hot"``, ``"bow"``, ``"tfidf"``,
            ``"ppmi"``, ``"fasttext"``, ``"bert"``.
        split: Split passed to ``datasets.load_dataset``.
        sample_size: Number of examples kept after a seeded shuffle; a falsy
            value keeps the whole split.
        vocab: Vocabulary to reuse (e.g. the one built on the train split).
            Built from the loaded texts when omitted.
        vocab_index: Mapping word -> index for ``vocab``. Derived from
            ``vocab`` when omitted.

    Returns:
        A tuple ``(vocab, vectorized_data, labels)`` where ``vectorized_data``
        and ``labels`` are aligned lists with one entry per kept example.

    Raises:
        ValueError: If ``vectorizer_type`` is not one of the supported names.
    """
    import datasets

    # Validate up front instead of failing on the first text of the loop,
    # after the dataset has already been loaded.
    supported_types = ("one_hot", "bow", "tfidf", "ppmi", "fasttext", "bert")
    if vectorizer_type not in supported_types:
        raise ValueError(f"Unknown vectorizer type: {vectorizer_type}")

    dataset = datasets.load_dataset(dataset_name, split=split)

    if sample_size:
        dataset = dataset.shuffle(seed=42).select(range(min(sample_size, len(dataset))))

    # Filter texts and labels with one predicate so they cannot get out of sync.
    texts, labels = [], []
    for item in dataset:
        if 'text' in item and 'label' in item and item['text'].strip():
            texts.append(item['text'])
            labels.append(item['label'])

    if vocab is None:
        vocab, vocab_index = build_vocab(texts)
    elif vocab_index is None:
        # Keep the caller's vocabulary and only derive the missing index.
        vocab_index = {word: idx for idx, word in enumerate(vocab)}

    if vectorizer_type == "ppmi":
        # Co-occurrence statistics are collected on the texts of this split.
        word_count, context_count, word_context_count, total_windows = compute_ppmi_matrices(texts, vocab)

    vectorized_data = []
    for text in texts:
        if vectorizer_type == "one_hot":
            # Sum the per-token one-hot vectors into one document vector.
            mat = one_hot_vectorization(text, vocab, vocab_index)
            if len(mat) > 0:
                summed = np.sum(mat, axis=0)
            else:
                summed = np.zeros(len(vocab))
            vectorized_data.append(summed.tolist())
        elif vectorizer_type == "bow":
            bow_dict = bag_of_words_vectorization(text)
            vector = [bow_dict.get(word, 0) for word in vocab]
            vectorized_data.append(vector)
        elif vectorizer_type == "tfidf":
            # Document frequencies come from the texts of this split and are
            # recomputed inside every call.
            vectorized_data.append(tf_idf_vectorization(text, texts, vocab, vocab_index))
        elif vectorizer_type == "ppmi":
            vectorized_data.append(
                ppmi_vectorization(
                    text, vocab, word_count, context_count, word_context_count, total_windows
                )
            )
        elif vectorizer_type == "fasttext":
            # Average the token vectors into one document vector.
            embeddings = get_fasttext_embeddings(text)
            if embeddings:
                avg_embedding = np.mean(embeddings, axis=0)
                vectorized_data.append(avg_embedding.tolist())
            else:
                # No tokens: fall back to a zero vector of the model's size.
                vectorized_data.append([0] * fasttext_model.vector_size)
        else:  # "bert"
            embedding = get_bert_embeddings(text)
            vectorized_data.append(embedding.tolist())
    return vocab, vectorized_data, labels

In [21]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold

def train(
    embeddings_method="bow",
    cv_folds=5,
    sample_size=10,
):
    """Train and evaluate a CatBoost classifier on vectorized IMDB reviews.

    The train split is vectorized first; its vocabulary is reused for the
    test split. The model is scored with K-fold cross-validation on the train
    sample, then refit on all of it and evaluated on the test sample. All
    metrics are printed.

    Args:
        embeddings_method: Vectorizer name passed to ``vectorize_dataset``.
        cv_folds: Number of cross-validation folds.
        sample_size: Number of examples drawn from each split. The default of
            10 is only enough for a smoke run; metrics on so few examples are
            not meaningful.
    """
    vocab, X, y = vectorize_dataset(
        "imdb", embeddings_method, "train", sample_size=sample_size
    )
    train_vocab_index = {w: i for i, w in enumerate(vocab)}
    _, X_test, y_test = vectorize_dataset(
        "imdb",
        embeddings_method,
        "test",
        sample_size=sample_size,
        vocab=vocab,
        vocab_index=train_vocab_index,
    )

    X = np.array(X)
    y = np.array(y)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    model = CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        verbose=0,
        random_seed=42
    )

    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
    print(f"CV accuracy (mean ± std): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    model.fit(X, y)
    y_pred_test = model.predict(X_test)
    print(f"Test accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
    print(f"Test F1: {f1_score(y_test, y_pred_test, average='weighted', zero_division=0):.4f}")
    print("Classification report (test):")
    print(classification_report(y_test, y_pred_test, zero_division=0))

In [22]:
# Train and evaluate one classifier per embedding method, printing its name first.
for embeddings_method in ("bow", "one_hot", "tfidf", "ppmi", "fasttext", "bert"):
    print(embeddings_method)
    train(embeddings_method=embeddings_method)

bow
CV accuracy (mean ± std): 0.8000 ± 0.4000
Test accuracy: 0.6000
Test F1: 0.4500
Classification report (test):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.60      1.00      0.75         6

    accuracy                           0.60        10
   macro avg       0.30      0.50      0.38        10
weighted avg       0.36      0.60      0.45        10

one_hot
CV accuracy (mean ± std): 0.8000 ± 0.4000
Test accuracy: 0.6000
Test F1: 0.4500
Classification report (test):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.60      1.00      0.75         6

    accuracy                           0.60        10
   macro avg       0.30      0.50      0.38        10
weighted avg       0.36      0.60      0.45        10

tfidf
CV accuracy (mean ± std): 0.8000 ± 0.4000
Test accuracy: 0.5000
Test F1: 0.4000
Classification report (test)