In [105]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [10]:
ag_news_dataset = load_dataset("ag_news")
print(ag_news_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [11]:
ag_news_texts = ag_news_dataset["train"]["text"]

In [14]:
stop_words = set(stopwords.words("english"))

In [34]:
def final_ag_news_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
    for word in special_words:
        tokens = tokens.replace(word, '')
    
    pattern = r'[&lt][^<>]*&gt'
    tokens = re.sub(pattern, '', tokens)
    
    # Удаление пунктуации и цифр
    #tokens = ''.join(i if i not in set(string.punctuation) - set('-') | set(string.digits) else ' ' for i in tokens)
    tokens = ''.join(i if i not in set(string.punctuation)  | set(string.digits) else ' ' for i in tokens)
    
    # Токенизация
    tokens = nltk.word_tokenize(tokens)
    
    # Удаление стоп слов
    stop_wordsL = stop_words - {'no','not'}
    tokens = [word for word in tokens if (word not in stop_wordsL and word != '-')]
    
    # Применение лемматизации
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Обработка частей речи 
    tokens = pos_tag(tokens)
    tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or word in {'no', 'not'}]
    
    return tokens

In [35]:
words = set()
for text in ag_news_texts:
    tokens = final_ag_news_preprocess(text)
    for token in tokens:
        if token not in words:
            words.add(token)

In [38]:
token_length = len(words)

## Векторизация слов

#### One-hot encoding


In [43]:
word_list = sorted(words)

In [48]:
words_dict = {}
for idx, word in enumerate(word_list):
    words_dict[word] = idx

In [46]:
for i, text in enumerate(ag_news_texts):
    ag_news_texts[i] = final_ag_news_preprocess(text)

In [53]:
x_data_OHE = np.zeros(shape=(len(ag_news_texts), token_length), dtype=np.int8)
for i, tokens in enumerate(ag_news_texts):
    for token in tokens:
        x_data_OHE[i, words_dict[token]] = 1

In [99]:
x_data_OHE = csr_matrix(x_data_OHE, dtype=np.int8)
train_labels = ag_news_dataset["train"]["label"]

In [68]:
clf_OHE = DecisionTreeClassifier()
clf_OHE = clf_OHE.fit(x_data_OHE, train_labels)

#### Тест

In [73]:
test_texts = ag_news_dataset["test"]["text"]
test_labels = ag_news_dataset["test"]["label"]

In [76]:
for i, text in enumerate(test_texts):
    test_texts[i] = final_ag_news_preprocess(text)

In [80]:
test_data_OHE = np.zeros(shape=(len(test_texts), token_length),dtype=np.int8)
for i, tokens in enumerate(test_texts):
    for token in tokens:
        if token in words:
            test_data_OHE[i, words_dict[token]] = 1

In [100]:
prediction_OHE = clf_OHE.predict(test_data_OHE)

In [134]:
micro_score_OHE = f1_score(test_labels, prediction_OHE, labels=[0,1,2,3], average='micro')
macro_score_OHE = f1_score(test_labels, prediction_OHE, labels=[0,1,2,3], average='macro')
weighted_score_OHE = f1_score(test_labels, prediction_OHE, labels=[0,1,2,3], average='weighted')
print(micro_score_OHE, macro_score_OHE, weighted_score_OHE)

0.8196052631578947 0.8192163840661241 0.8192163840661241


#### CountVectorizer

In [110]:
def dummy(doc):
    return doc

In [119]:
vectorizer = CountVectorizer(vocabulary=words_dict, tokenizer=dummy, preprocessor=dummy, dtype=np.int8)
x_train_COUNT = vectorizer.fit_transform(ag_news_texts)

In [120]:
clf_COUNT = DecisionTreeClassifier()
clf_COUNT = clf_COUNT.fit(x_train_COUNT, train_labels)

#### Тест

In [130]:
test_data_COUNT = vectorizer.transform(test_texts)

In [131]:
prediction_COUNT = clf_COUNT.predict(test_data_COUNT)

In [133]:
micro_score_COUNT = f1_score(test_labels, prediction_COUNT, labels=[0,1,2,3], average='micro')
macro_score_COUNT = f1_score(test_labels, prediction_COUNT, labels=[0,1,2,3], average='macro')
weighted_score_COUNT = f1_score(test_labels, prediction_COUNT, labels=[0,1,2,3], average='weighted')
print(micro_score_COUNT, macro_score_COUNT, weighted_score_COUNT)

0.8218421052631579 0.8214085479144546 0.8214085479144545


#### TFIDF

In [128]:
vectorizer_tfidf = TfidfVectorizer(vocabulary=words_dict, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
x_train_TFIDF = vectorizer_tfidf.fit_transform(ag_news_texts)



In [129]:
clf_TFIDF = DecisionTreeClassifier()
clf_TFIDF = clf_TFIDF.fit(x_train_TFIDF, train_labels)

#### Тест

In [135]:
test_data_TFIDF = vectorizer_tfidf.transform(test_texts)

In [136]:
prediction_TFIDF = clf_TFIDF.predict(test_data_TFIDF)

In [137]:
micro_score_TFIDF = f1_score(test_labels, prediction_TFIDF, labels=[0,1,2,3], average='micro')
macro_score_TFIDF = f1_score(test_labels, prediction_TFIDF, labels=[0,1,2,3], average='macro')
weighted_score_TFIDF = f1_score(test_labels, prediction_TFIDF, labels=[0,1,2,3], average='weighted')
print(micro_score_TFIDF, macro_score_TFIDF, weighted_score_TFIDF)

0.8097368421052632 0.8091861069724451 0.8091861069724451
