In [1]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB 
from sklearn.metrics import f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
dataset = "ag_news"
ag_news_dataset = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

In [4]:
text_preprocess_types = [None]

words_classes = ['ALL']

frequency_filtration_types = ['low']

In [5]:
iterations_num = len(text_preprocess_types) * len(words_classes) * len(frequency_filtration_types)
print(iterations_num)

1


In [6]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation)  else ' ' for i in tokens)
    
    return tokens

In [7]:
txt0 = ag_news_dataset['train']['text'][0]
txt1 = base_preprocess(txt0)

print(txt0)
print(txt1)

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
wall st  bears claw back into the black       short sellers  wall street s dwindling band of ultra cynics  are seeing green again 


In [8]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)  

In [9]:
txt2 = different_preprocess(txt1, 'лемматизация', 'ALL')

print(different_preprocess(txt1, 'ничего', 'ALL'))
print(different_preprocess(txt1, 'стемминг', 'ALL'))
print(txt2)

wall st bears claw back into the black short sellers wall street s dwindling band of ultra cynics are seeing green again
wall st bear claw back into the black short seller wall street s dwindl band of ultra cynic are see green again
wall st bear claw back into the black short seller wall street s dwindling band of ultra cynic are seeing green again


In [10]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 3 ])
    else:
        return words_dictionary

In [11]:
def dummy(doc):
    return doc

In [12]:
def final_preprocess(dataset, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    xtr = [base_preprocess(text) for text in x_train]
    xte = [base_preprocess(text) for text in x_test]
    
    index = 0    
    
    # Части речи + приведение
    xtr = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xtr]
    xte = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xte]
    
    vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=2, dtype=np.float32)
    vectorizer.fit(xtr)
    
    xtr = vectorizer.transform(xtr)
    xte = vectorizer.transform(xte)
        
    # Обучение классификаторов
    clf.fit(xtr, y_train)
    
    # Тестирование
    predictions = clf.predict(xte)
    
    # Метрики
    score = f1_score(y_test, predictions, average='macro')
    
    mlflow.start_run(run_name=f'{model_type}')
    mlflow.log_param('model', clf.__class__.__name__)
    mlflow.log_param('preprocess_type', 'ничего')
    mlflow.log_param('words_class', 'ALL')
    mlflow.log_metric('macro_score', score)
    mlflow.end_run()
    
    index += 1
    print(f'Итерация {index}')

In [13]:
dataset = ag_news_dataset
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
x_train = dataset['train']['text']
y_train = dataset['train']['label']
    
x_test = dataset['test']['text']
y_test = dataset['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [14]:
vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=5, dtype=np.float32)
vectorizer.fit(xtr1)

xtr = vectorizer.transform(xtr1)
xte = vectorizer.transform(xte1) 

In [15]:
print(xtr.shape)

(120000, 104479)


#### BernoulliNB

In [21]:
clf_bnb = BernoulliNB()
clf_bnb.fit(xtr, y_train)

In [22]:
predictions_bnb = clf_bnb.predict(xte)
macro_score_bnb = f1_score(y_test, predictions_bnb, average='macro')
print(macro_score_bnb)

0.9029985699067992


#### MultinomialNB

In [19]:
clf_mnb = MultinomialNB()
clf_mnb.fit(xtr, y_train)

In [23]:
predictions_mnb = clf_mnb.predict(xte)
macro_score_mnb = f1_score(y_test, predictions_mnb, average='macro')
print(macro_score_mnb)

0.9081803865496524


#### ComplementNB

In [24]:
clf_cnb = ComplementNB()
clf_cnb.fit(xtr, y_train)

In [25]:
predictions_cnb = clf_cnb.predict(xte)
macro_score_cnb = f1_score(y_test, predictions_cnb, average='macro')
print(macro_score_cnb)

0.9100047285733461


## Подбор гипер-параметров

#### NB - alpha 

In [26]:
f1_macro_scorer = make_scorer(f1_score, average='macro')

In [27]:
parameters = { # 10
    'alpha':[0.01, 0.1, 0.5, 1, 10]
} 

#### BernoulliNB

In [28]:
# Инициализация RandomizedSearchCV
search_bnb = GridSearchCV(
    BernoulliNB(),
    param_grid=parameters,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [29]:
search_bnb.fit(xtr, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [30]:
print("Лучшие параметры:", search_bnb.best_params_)
print()
print("Лучший F1 macro score:", search_bnb.best_score_)

Лучшие параметры: {'alpha': 0.1}

Лучший F1 macro score: 0.8989902220495362


In [31]:
y_pred_bnb = search_bnb.best_estimator_.predict(xte)
sc_bnb = f1_score(y_test, y_pred_bnb, average='macro')
print(sc_bnb)

0.9052098234087924


#### MultinomialNB

In [32]:
# Инициализация RandomizedSearchCV
search_mnb = GridSearchCV(
    MultinomialNB(),
    param_grid=parameters,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [33]:
search_mnb.fit(xtr, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [34]:
print("Лучшие параметры:", search_mnb.best_params_)
print()
print("Лучший F1 macro score:", search_mnb.best_score_)

Лучшие параметры: {'alpha': 0.5}

Лучший F1 macro score: 0.9038849439007478


In [35]:
y_pred_mnb = search_mnb.best_estimator_.predict(xte)
sc_mnb = f1_score(y_test, y_pred_mnb, average='macro')
print(sc_mnb)

0.9100324176320633


#### ComplementNB

In [40]:
# Инициализация RandomizedSearchCV
search_cnb = GridSearchCV(
    ComplementNB(),
    param_grid=parameters,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [41]:
search_cnb.fit(xtr, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [42]:
print("Лучшие параметры:", search_cnb.best_params_)
print()
print("Лучший F1 macro score:", search_cnb.best_score_)

Лучшие параметры: {'alpha': 0.5}

Лучший F1 macro score: 0.9041486880173781


In [43]:
y_pred_cnb = search_cnb.best_estimator_.predict(xte)
sc_cnb = f1_score(y_test, y_pred_cnb, average='macro')
print(sc_cnb)

0.9100104260389827
