In [0]:
!git clone https://github.com/Samsung-IT-Academy/stepik-dl-nlp.git && pip install -r stepik-dl-nlp/requirements.txt

In [0]:
import sys; sys.path.append('./stepik-dl-nlp')

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score

import numpy as np
import scipy.sparse
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import collections

import torch
from torch import nn
from torch.nn import functional as F

import dlnlputils
from dlnlputils.data import tokenize_text_simple_regex, tokenize_corpus, build_vocabulary, vectorize_texts, SparseFeaturesDataset
from dlnlputils.pipeline import train_eval_loop, predict_with_model, init_random_seed

init_random_seed()

In [0]:
results = []

#VECTORIZATION_MODE: tfidf

In [0]:
VECTORIZATION_MODE = 'tfidf'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [7]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 184.61it/s]
236it [00:01, 180.98it/s]                             


In [8]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.004174976609647274
Доля верных ответов 0.9992929114371575

Среднее значение функции потерь на валидации 0.9288815259933472
Доля верных ответов 0.7712426978226234


In [0]:
results.append({'mode': 'tfidf', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#VECTORIZATION_MODE: tf

In [0]:
VECTORIZATION_MODE = 'tf'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [12]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 180.62it/s]
236it [00:01, 182.84it/s]                             


In [13]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.005827018991112709
Доля верных ответов 0.9992045253668022

Среднее значение функции потерь на валидации 0.6565795540809631
Доля верных ответов 0.8145246946362188


In [0]:
results.append({'mode': 'tf', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#VECTORIZATION_MODE: idf

In [0]:
VECTORIZATION_MODE = 'idf'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [17]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 189.35it/s]
236it [00:01, 188.52it/s]                             


In [18]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.021615294739603996
Доля верных ответов 0.9984090507336044

Среднее значение функции потерь на валидации 1.0027966499328613
Доля верных ответов 0.7525225703664365


In [0]:
results.append({'mode': 'idf', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#VECTORIZATION_MODE: binary

In [0]:
VECTORIZATION_MODE = 'bin'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [22]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 193.50it/s]
236it [00:01, 195.72it/s]                             


In [23]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.0634421557188034
Доля верных ответов 0.9934594307937069

Среднее значение функции потерь на валидации 3.8096024990081787
Доля верных ответов 0.7308815719596389


In [0]:
results.append({'mode': 'bin', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#VECTORIZATION_MODE: pmi

In [0]:
def vectorize_texts_pmi(tokenized_texts, word2id, target):
    word_count_per_label = scipy.sparse.dok_matrix((len(set(target)), len(word2id)), dtype='float32')
    for text_i, text in enumerate(tokenized_texts):
        for token in text:
            if token in word2id:
                word_count_per_label[target[text_i], word2id[token]] += 1

    _, counts_target = np.unique(target, return_counts=True)
    pwl = word_count_per_label / word_count_per_label.sum()
    pw = word_count_per_label.sum(0) / word_count_per_label.sum()
    pl = counts_target / counts_target.sum()
    
    pmi = scipy.sparse.dok_matrix((len(tokenized_texts), len(word2id)), dtype='float32')
    for text_i, text in enumerate(tokenized_texts):
        for token in text:
            if token in word2id:
                pmi[text_i, word2id[token]] = np.log2(pwl[target[text_i], word2id[token]] / (pw[0, word2id[token]] * pl[target[text_i]]))
      
    return scipy.sparse.csr_matrix(pmi)

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts_pmi(train_tokenized, vocabulary, train_source['target'])
test_vectors = vectorize_texts_pmi(test_tokenized, vocabulary, test_source['target'])

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

In [0]:
LR = 1e-1
L2 = 1e-3
BATCH = 32
EPOCH = 200

In [0]:
best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [31]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 178.81it/s]
236it [00:01, 177.61it/s]                             


In [32]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.05344357341527939
Доля верных ответов 0.9992045253668022

Среднее значение функции потерь на валидации 0.317936509847641
Доля верных ответов 0.9358736059479554


In [0]:
results.append({'mode': 'pmi', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#VECTORIZATION_MODE: lsa

In [0]:
def vectorize_texts_lsa(tokenized_texts_train, tokenized_texts_test, word2id, n_components=100, scale=True):

    train_word_count = scipy.sparse.dok_matrix((len(tokenized_texts_train), len(word2id)), dtype='float32')
    for text_i, text in enumerate(tokenized_texts_train):
        for token in text:
            if token in word2id:
                train_word_count[text_i, word2id[token]] += 1

    test_word_count = scipy.sparse.dok_matrix((len(tokenized_texts_test), len(word2id)), dtype='float32')
    for text_i, text in enumerate(tokenized_texts_test):
        for token in text:
            if token in word2id:
                test_word_count[text_i, word2id[token]] += 1

    lsa = sklearn.decomposition.TruncatedSVD(n_components)
    train_vectors = lsa.fit_transform(train_word_count)
    test_vectors = lsa.transform(test_word_count)

    if scale:
        norm = sklearn.preprocessing.Normalizer(copy=False)
        train_vectors = norm.fit_transform(train_vectors)
        test_vectors = norm.transform(test_vectors)

    return scipy.sparse.csr_matrix(train_vectors), scipy.sparse.csr_matrix(test_vectors)

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors, test_vectors = vectorize_texts_lsa(train_tokenized, test_tokenized, vocabulary)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = train_vectors.shape[1]
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

In [0]:
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [38]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 271.31it/s]
236it [00:00, 281.45it/s]                             


In [39]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.9047325849533081
Доля верных ответов 0.72379353013965

Среднее значение функции потерь на валидации 1.313632845878601
Доля верных ответов 0.6072756240042485


In [0]:
results.append({'mode': 'lsa', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

# RESULTS: top vectorization mode

In [53]:
pd.DataFrame(results).sort_values(by=['test_accuracy'], ascending=False)

Unnamed: 0,mode,train_accuracy,test_accuracy,train_loss,test_loss
5,pmi,0.999205,0.935874,0.053444,0.317937
1,tf,0.999205,0.814525,0.005827,0.65658
0,tfidf,0.999293,0.771243,0.004175,0.928882
2,idf,0.998409,0.752523,0.021615,1.002797
3,bin,0.993459,0.730882,0.063442,3.809602
4,lsa,0.723794,0.607276,0.904733,1.313633


#STEMMING

In [0]:
import nltk
# nltk.download('wordnet')
sno = nltk.stem.SnowballStemmer('english')

In [0]:
results1 = []

#STEMMING: VECTORIZATION_MODE: pmi

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

train_tokenized = [[sno.stem(word) for word in doc] for doc in train_tokenized]
test_tokenized = [[sno.stem(word) for word in doc] for doc in test_tokenized]

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts_pmi(train_tokenized, vocabulary, train_source['target'])
test_vectors = vectorize_texts_pmi(test_tokenized, vocabulary, test_source['target'])

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

In [0]:
LR = 1e-1
L2 = 1e-3
BATCH = 32
EPOCH = 200

In [0]:
best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [74]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 202.29it/s]
236it [00:01, 206.04it/s]                             


In [75]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.05469982698559761
Доля верных ответов 0.9993812975075128

Среднее значение функции потерь на валидации 0.3036949932575226
Доля верных ответов 0.942777482740308


In [0]:
results1.append({'mode': 'pmi', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#STEMMING: VECTORIZATION_MODE: tf

In [0]:
VECTORIZATION_MODE = 'tf'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

train_tokenized = [[sno.stem(word) for word in doc] for doc in train_tokenized]
test_tokenized = [[sno.stem(word) for word in doc] for doc in test_tokenized]

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [80]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 189.97it/s]
236it [00:01, 186.77it/s]                             


In [81]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.010306037962436676
Доля верных ответов 0.9992045253668022

Среднее значение функции потерь на валидации 0.6652761101722717
Доля верных ответов 0.8113382899628253


In [0]:
results1.append({'mode': 'tf', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#STEMMING: VECTORIZATION_MODE: tfidf

In [0]:
VECTORIZATION_MODE = 'tfidf'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

train_tokenized = [[sno.stem(word) for word in doc] for doc in train_tokenized]
test_tokenized = [[sno.stem(word) for word in doc] for doc in test_tokenized]

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [91]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

100%|██████████| 354/353.5625 [00:01<00:00, 203.15it/s]
236it [00:01, 208.02it/s]                             


In [92]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

Среднее значение функции потерь на обучении 0.013925576582551003
Доля верных ответов 0.9991161392964468

Среднее значение функции потерь на валидации 0.9857187271118164
Доля верных ответов 0.7636749867233139


In [0]:
results1.append({'mode': 'tfidf', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

# RESULTS: top stemming

In [98]:
pd.DataFrame(results1).sort_values(by=['test_accuracy'], ascending=False)

Unnamed: 0,mode,train_accuracy,test_accuracy,train_loss,test_loss
0,pmi,0.999381,0.942777,0.0547,0.303695
1,tf,0.999205,0.811338,0.010306,0.665276
2,tfidf,0.999116,0.763675,0.013926,0.985719


#N-gram (word 2-gram)

In [0]:
result2 = []

#N-gram: VECTORIZATION_MODE: pmi

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

train_tokenized = [[f"{doc[i_word]}_{doc[i_word+1]}" for i_word in range(len(doc)-1)] for doc in train_tokenized]
test_tokenized = [[f"{doc[i_word]}_{doc[i_word+1]}" for i_word in range(len(doc)-1)] for doc in test_tokenized]

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts_pmi(train_tokenized, vocabulary, train_source['target'])
test_vectors = vectorize_texts_pmi(test_tokenized, vocabulary, test_source['target'])

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

In [0]:
LR = 1e-1
L2 = 1e-3
BATCH = 32
EPOCH = 200

In [0]:
best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

Эпоха 0
Эпоха: 354 итераций, 7.10 сек
Среднее значение функции потерь на обучении 1.0902491315862552
Среднее значение функции потерь на валидации 2.6270006993564508
Новая лучшая модель!

Эпоха 1
Эпоха: 354 итераций, 6.76 сек
Среднее значение функции потерь на обучении 1.1059183079517831
Среднее значение функции потерь на валидации 2.315061278767505
Новая лучшая модель!

Эпоха 2
Эпоха: 354 итераций, 7.07 сек
Среднее значение функции потерь на обучении 1.1590289650958474
Среднее значение функции потерь на валидации 2.2983832215353592
Новая лучшая модель!

Эпоха 3
Эпоха: 354 итераций, 6.84 сек
Среднее значение функции потерь на обучении 1.0675788229215617
Среднее значение функции потерь на валидации 2.729440582505727

Эпоха 4
Эпоха: 354 итераций, 6.82 сек
Среднее значение функции потерь на обучении 1.1533394707735143
Среднее значение функции потерь на валидации 2.5741452092336394

Эпоха 5
Эпоха: 354 итераций, 6.79 сек
Среднее значение функции потерь на обучении 1.1349304500904123
Среднее 

In [0]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

In [0]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

In [0]:
results2.append({'mode': 'pmi', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#N-gram: VECTORIZATION_MODE: tf

In [0]:
VECTORIZATION_MODE = 'tf'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

train_tokenized = [[f"{doc[i_word]}_{doc[i_word+1]}" for i_word in range(len(doc)-1)] for doc in train_tokenized]
test_tokenized = [[f"{doc[i_word]}_{doc[i_word+1]}" for i_word in range(len(doc)-1)] for doc in test_tokenized]

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [0]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

In [0]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

In [0]:
results2.append({'mode': 'tf', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#N-gram: VECTORIZATION_MODE: tfidf

In [0]:
VECTORIZATION_MODE = 'tfidf'
LR = 1e-1
L2 = 0
BATCH = 32
EPOCH = 200

In [0]:
train_source = fetch_20newsgroups(subset='train')
test_source = fetch_20newsgroups(subset='test')

train_tokenized = tokenize_corpus(train_source['data'])
test_tokenized = tokenize_corpus(test_source['data'])

train_tokenized = [[f"{doc[i_word]}_{doc[i_word+1]}" for i_word in range(len(doc)-1)] for doc in train_tokenized]
test_tokenized = [[f"{doc[i_word]}_{doc[i_word+1]}" for i_word in range(len(doc)-1)] for doc in test_tokenized]

MAX_DF = 0.8
MIN_COUNT = 5
vocabulary, word_doc_freq = build_vocabulary(train_tokenized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

train_vectors = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)
test_vectors = vectorize_texts(test_tokenized, vocabulary, word_doc_freq, mode=VECTORIZATION_MODE)

train_dataset = SparseFeaturesDataset(train_vectors, train_source['target'])
test_dataset = SparseFeaturesDataset(test_vectors, test_source['target'])

UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_source['target']))
model = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)

scheduler = lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=5, factor=0.5, verbose=True)

best_val_loss, best_model = train_eval_loop(model=model,
                                            train_dataset=train_dataset,
                                            val_dataset=test_dataset,
                                            criterion=F.cross_entropy,
                                            lr=LR,
                                            epoch_n=EPOCH,
                                            batch_size=BATCH,
                                            l2_reg_alpha=L2,
                                            lr_scheduler_ctor=scheduler)

In [0]:
train_pred = predict_with_model(best_model, train_dataset)
train_loss = F.cross_entropy(torch.from_numpy(train_pred),torch.from_numpy(train_source['target']).long())
test_pred = predict_with_model(best_model, test_dataset)
test_loss = F.cross_entropy(torch.from_numpy(test_pred), torch.from_numpy(test_source['target']).long())

In [0]:
print('Среднее значение функции потерь на обучении', float(train_loss))
print('Доля верных ответов', accuracy_score(train_source['target'], train_pred.argmax(-1)))
print()
print('Среднее значение функции потерь на валидации', float(test_loss))
print('Доля верных ответов', accuracy_score(test_source['target'], test_pred.argmax(-1)))

In [0]:
results2.append({'mode': 'tfidf', 'train_accuracy': accuracy_score(train_source['target'], train_pred.argmax(-1)), 'test_accuracy': accuracy_score(test_source['target'], test_pred.argmax(-1)), 'train_loss': float(train_loss), 'test_loss': float(test_loss)})

#RESULTS: top n-gram

In [0]:
pd.DataFrame(results2).sort_values(by=['test_accuracy'], ascending=False)

In [0]:
import numpy as np
import scipy.sparse
import torch

tokenized_texts = train_tokenized
word2id = vocabulary
word2freq = word_doc_freq

result1 = scipy.sparse.dok_matrix((len(tokenized_texts), len(word2id)), dtype='float32')
for text_i, text in enumerate(tokenized_texts):
    for token in text:
        if token in word2id:
            result1[text_i, word2id[token]] += 1

In [0]:
result = scipy.sparse.dok_matrix((len(set(train_source['target'])), len(word2id)), dtype='float32')
for text_i, text in enumerate(tokenized_texts):
    for token in text:
        if token in word2id:
            result[train_source['target'][text_i], word2id[token]] += 1

In [53]:
result.shape

(20, 21628)

In [0]:
result1_tf = result1.tocsr()
result1_tf = result1_tf.multiply(1 / result1_tf.sum(1))

In [76]:
result1_tf.sum(0).shape

(1, 21628)

In [0]:
_, counts_target = np.unique(train_source['target'], return_counts=True)
# pwl = result / result.sum()
pwl = result / counts_target.reshape(20, 1)
pw = result.sum(0) / result.sum()
pl = counts_target / counts_target.sum()
# pmi = np.log2(pwl / pw / np.array([pl] * pwl.shape[1]).T)

In [55]:
pwl.shape, pw.shape, pl.shape

((20, 21628), (1, 21628), (20,))

In [0]:
pmi = scipy.sparse.dok_matrix((len(tokenized_texts), len(word2id)), dtype='float32')
for text_i, text in enumerate(tokenized_texts):
    for token in text:
        if token in word2id:
            pmi[text_i, word2id[token]] = np.log2(pwl[train_source['target'][text_i], word2id[token]] / (pw[0, word2id[token]] * pl[train_source['target'][text_i]]))
# pmi = pmi.tocoo()

In [78]:
result1_tf[0][0] / (result1_tf[0][0] * pl[0])

TypeError: ignored

In [70]:
pwl[:,0] / (pw[0, 0] * pl.reshape(20, 1))

matrix([[6844.33551162],
        [1472.90225213],
        [1379.60255916],
        [1517.69073931],
        [1529.0496814 ],
        [1897.13707402],
        [ 423.14372555],
        [2052.09164423],
        [1647.71190308],
        [1979.68065522],
        [2057.02841064],
        [3795.60846068],
        [1666.96019066],
        [3065.4003458 ],
        [2368.58131697],
        [6113.77143135],
        [4671.59159706],
        [6043.71757308],
        [8648.6347432 ],
        [8551.44512561]])

In [69]:
pw[0, 0]

0.022657633

In [0]:
print(type(pmi))
print(pmi.shape)

<class 'scipy.sparse.dok.dok_matrix'>
(11314, 21628)


In [0]:
_pmi = pmi.tocsc()
_pmi -= np.array([_pmi.min()] * _pmi.shape[1]).reshape(1, -1)
# _pmi /= (_pmi.max() + 1e-6)

In [0]:
type(_pmi)

numpy.matrix

In [0]:
torch.from_numpy(pmi.toarray()).float().min()

tensor(-7.9936)

In [0]:
pwl / np.array([pl] * pwl.shape[1]).T

matrix([[0.04295434, 0.01653973, 0.01579164, ..., 0.        , 0.        ,
         0.        ],
        [0.0112466 , 0.00941316, 0.0074232 , ..., 0.        , 0.        ,
         0.        ],
        [0.01066046, 0.00707015, 0.00871617, ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.04456745, 0.01430789, 0.0144005 , ..., 0.        , 0.        ,
         0.        ],
        [0.05258175, 0.01990946, 0.01938996, ..., 0.        , 0.        ,
         0.        ],
        [0.04215173, 0.01560341, 0.01378504, ..., 0.        , 0.        ,
         0.        ]])

In [0]:
test = vectorize_texts(train_tokenized, vocabulary, word_doc_freq, mode='tfidf')

In [0]:
test.shape

(11314, 21628)

In [0]:
test = test.tocsc()
print(type(test.min()))
test -= test.min()
test /= (test.max() + 1e-6)

<class 'numpy.float32'>


In [0]:
# torch.from_numpy(pwl.toarray()).max()
pw.max()

0.022657633

In [0]:
pwc = (result > 0).astype('float32').power(-1).multiply(word2freq)
pw = result.sum(0) / result.sum()
pwc1 = result.sum(0) / result.shape[0]

In [0]:
len(set(train_source['target']))

20

In [0]:
torch.from_numpy(pwc.toarray())[0].max()
# pwc1.shape

tensor(0.6015)

In [0]:
word2freq.shape

(21628,)

In [0]:
result = result.tocsr()
p_w_l = result / result.shape[0]

In [0]:
result = result.tocsr()

<11314x21628 sparse matrix of type '<class 'numpy.float32'>'
	with 1126792 stored elements in Compressed Sparse Row format>

In [0]:
result.shape[0]

11314

In [0]:
word2freq.shape

(21628,)

In [0]:
1 / word2freq.max()

1.412484426837751

In [0]:

torch.from_numpy((result > 0).astype('float32').toarray()).float()

tensor([[0., 1., 1.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [0]:
(result > 0).astype('float32').multiply(1 / word2freq).shape

(11314, 21628)

In [0]:
torch.from_numpy((result > 0).astype('float32').multiply(1 / word2freq).toarray()).float()

tensor([[0.0000, 1.6626, 1.6907,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 1.6626, 1.6907,  ..., 0.0000, 0.0000, 0.0000],
        [1.4125, 1.6626, 1.6907,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [1.4125, 1.6626, 1.6907,  ..., 0.0000, 0.0000, 0.0000],
        [1.4125, 1.6626, 1.6907,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [0]:
result.shape

(11314, 21628)

In [0]:
result.sum(0)

matrix([[3.9264e+04, 2.0121e+04, 1.8074e+04, ..., 1.0000e+01, 1.0000e+01,
         8.0000e+00]], dtype=float32)

In [0]:
torch.from_numpy(train_vectors.toarray()).float().shape

torch.Size([11314, 21628])

In [0]:
train_source['target'].shape

(11314,)