In [1]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB 
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
ag_news_dataset = load_dataset("ag_news")
stop_words = set(stopwords.words("english"))

In [4]:
text_preprocess_types = [None, '—Å—Ç–µ–º–º–∏–Ω–≥', '–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è']
#text_preprocess_types = ['–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è']

#words_classes = ['N', 'NJ', 'NJV', 'ALL']
words_classes = ['ALL']

#frequency_filtration_types = [None, 'low', 'high', 'both']
frequency_filtration_types = [None, 'low']

n_components = [150, 250, 375, 500, 750]

#hidden_layer_size = [375, 500, 750]

In [5]:
def base_ag_news_preprocess(text):
    tokens = text.lower()

    # –£–¥–∞–ª–µ–Ω–∏–µ —Å–ø–µ—Ü —Å–ª–æ–≤
    special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
    for word in special_words:
        tokens = tokens.replace(word, '')
    
    pattern = r'[&lt][^<>]*&gt'
    tokens = re.sub(pattern, '', tokens)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏ –∏ —Ü–∏—Ñ—Ä
    #tokens = ''.join(i if i not in set(string.punctuation) - set('-') | set(string.digits) else ' ' for i in tokens)
    tokens = ''.join(i if i not in set(string.punctuation)  | set(string.digits) else ' ' for i in tokens)
    
    # –¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è
    tokens = nltk.word_tokenize(tokens)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø —Å–ª–æ–≤
    #stop_wordsL = stop_words - {'no','not'}
    stop_wordsL = stop_words
    tokens = [word for word in tokens if (word not in stop_wordsL and word != '-')]
    return tokens

In [6]:
def different_ag_news_preprocess(tokens, preprocess_type, words_class):
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å–ª–æ–≤
    if preprocess_type == '–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == '—Å—Ç–µ–º–º–∏–Ω–≥':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —á–∞—Å—Ç–µ–π —Ä–µ—á–∏
    tokens = pos_tag(tokens)
    if words_class == 'N':
        tokens = [word for word, tag in tokens if tag.startswith('N')]
    elif words_class == 'NJ':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
    elif words_class == 'NJV':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    elif words_class == 'ALL':
        tokens = [word for word, _ in tokens]
    
    return tokens  

In [7]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 10 ])
    elif frequency_filtration_type == 'high':
        return dict([(key, value) for key, value in words_dictionary.items() if value <= 3000])
    elif frequency_filtration_type == 'both':
        return dict([(key, value) for key, value in words_dictionary.items() if 10 <= value <= 3000])
    else:
        return words_dictionary

In [8]:
def dummy(doc):
    return doc

In [9]:
def final_ag_news_preprocess(dataset, model_type):
    # –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # –ë–∞–∑–æ–≤–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
    for i, text in enumerate(x_train):
        x_train[i] = base_ag_news_preprocess(text)
        
    for i, text in enumerate(x_test):
        x_test[i] = base_ag_news_preprocess(text)
    
    index = 0
    # –í–∞—Ä–∏–∞—Ç–∏–≤–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
    for preprocess_type in text_preprocess_types: 
        for words_class in words_classes:         
            words = {}
            xtr = x_train
            xte = x_test
            
            # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
            for i, tokens in enumerate(xtr):
                final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
                xtr[i] = final_tokens
                
                # –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è
                for token in final_tokens:
                    if token not in words:
                        words[token] = 1
                    else:
                        words[token] += 1
            
            xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
            # –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –ø–æ —á–∞—Å—Ç–æ—Ç–µ
            for frequency_filtration_type in frequency_filtration_types: 
                filtered_words = frequency_filtration(words, frequency_filtration_type)
                token_length = len(filtered_words)
                
                # –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è —Å–ª–æ–≤
                word_list = sorted(filtered_words.keys())
                # –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ —Å–ª–æ–≤–∞–º –∏–Ω–¥–µ–∫—Å–æ–≤
                words_indexed = {}
                for idx, word in enumerate(word_list):
                    words_indexed[word] = idx
                
                # TF-IDF
                vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
                x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
                x_test_TFIDF = vectorizer_TFIDF.transform(xte)
                
                #LSI
                #for n_num in n_components:
                    #SVD_TFIDF = TruncatedSVD(n_components=n_num)
                    
                    #SVD_TFIDF.fit(x_train_TFIDF)
                    
                    #x_train_TFIDF_SVD = SVD_TFIDF.transform(x_train_TFIDF)
                    #x_test_TFIDF_SVD = SVD_TFIDF.transform(x_test_TFIDF)

                    # –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–≤
                if frequency_filtration_type is not None:
                    clf_G = GaussianNB()
                    
                    # –û–±—É—á–µ–Ω–∏–µ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–≤
                    clf_G.fit(x_train_TFIDF.toarray(), y_train)
             
                    # –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ
                    prediction_G = clf_G.predict(x_test_TFIDF.toarray())
                        
                    # –ú–µ—Ç—Ä–∏–∫–∏
                    score_G = f1_score(y_test, prediction_G, average='macro')
                            
                    mlflow.start_run(run_name=f'Gaussian_TFIDF_{preprocess_type}_{words_class}_{frequency_filtration_type}')
                    mlflow.log_param('model', clf_G.__class__.__name__)
                    mlflow.log_param('preprocess_type', preprocess_type)
                    mlflow.log_param('words_class', words_class)
                    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
                    mlflow.log_param('token_length', token_length)
                    #mlflow.log_param('n_components', n_num)
                    mlflow.log_param('word vectorizer', 'TFIDF')
                    mlflow.log_metric('macro_score', score_G)
                    mlflow.end_run()
                
                clf_B = BernoulliNB()
                clf_B.fit(x_train_TFIDF, y_train)
                prediction_B = clf_B.predict(x_test_TFIDF)
                score_B = f1_score(y_test, prediction_B, average='macro')
                
                mlflow.start_run(run_name=f'Bernoulli_TFIDF_{preprocess_type}_{words_class}_{frequency_filtration_type}')
                mlflow.log_param('model', clf_B.__class__.__name__)
                mlflow.log_param('preprocess_type', preprocess_type)
                mlflow.log_param('words_class', words_class)
                mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
                mlflow.log_param('token_length', token_length)
                #mlflow.log_param('n_components', n_num)
                mlflow.log_param('word vectorizer', 'TFIDF')
                mlflow.log_metric('macro_score', score_B)
                mlflow.end_run()
                    
                index += 1
                print(f'–ò—Ç–µ—Ä–∞—Ü–∏—è {index}')

In [99]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="agNews_NB")

2025/04/08 17:47:33 INFO mlflow.tracking.fluent: Experiment with name 'agNews_NB' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/729937831637931560', creation_time=1744123653173, experiment_id='729937831637931560', last_update_time=1744123653173, lifecycle_stage='active', name='agNews_NB', tags={}>

In [138]:
final_ag_news_preprocess(ag_news_dataset, 'MLP')

üèÉ View run Bernoulli_TFIDF_None_ALL_None at: http://127.0.0.1:5000/#/experiments/729937831637931560/runs/1ff0c2286500450b950740b5d881cb67
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/729937831637931560
–ò—Ç–µ—Ä–∞—Ü–∏—è 1
üèÉ View run Gaussian_TFIDF_None_ALL_low at: http://127.0.0.1:5000/#/experiments/729937831637931560/runs/8e138d84148f4e9594f90ea7bd91e151
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/729937831637931560
üèÉ View run Bernoulli_TFIDF_None_ALL_low at: http://127.0.0.1:5000/#/experiments/729937831637931560/runs/363c7deb881b43e9bbe0f2be76a09839
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/729937831637931560
–ò—Ç–µ—Ä–∞—Ü–∏—è 2
üèÉ View run Bernoulli_TFIDF_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_None at: http://127.0.0.1:5000/#/experiments/729937831637931560/runs/cf8c1d19761949a487c76d650d078ba2
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/729937831637931560
–ò—Ç–µ—Ä–∞—Ü–∏—è 3
üèÉ View run Gaussian_TFIDF_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_

–†—É—á–Ω—ã–µ —Ç–µ—Å—Ç—ã


In [27]:
dataset = ag_news_dataset
preprocess_type = None
words_class = 'ALL'
frequency_filtration_type = None

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö
x_train = dataset['train']['text']
y_train = dataset['train']['label']
    
x_test = dataset['test']['text']
y_test = dataset['test']['label']
    
# –ë–∞–∑–æ–≤–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
for i, text in enumerate(x_train):
    x_train[i] = base_ag_news_preprocess(text)
        
for i, text in enumerate(x_test):
    x_test[i] = base_ag_news_preprocess(text)
    
words = {}
xtr = x_train
xte = x_test
            
# –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
for i, tokens in enumerate(xtr):
    final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
    xtr[i] = final_tokens
                
    # –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è
    for token in final_tokens:
        if token not in words:
            words[token] = 1
        else:
            words[token] += 1
            
xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
# –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –ø–æ —á–∞—Å—Ç–æ—Ç–µ
filtered_words = frequency_filtration(words, frequency_filtration_type)
token_length = len(filtered_words)
                
# –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è —Å–ª–æ–≤
word_list = sorted(filtered_words.keys())
# –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ —Å–ª–æ–≤–∞–º –∏–Ω–¥–µ–∫—Å–æ–≤
words_indexed = {}
for idx, word in enumerate(word_list):
    words_indexed[word] = idx
    
# TF-IDF
vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed,max_features=5000, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
x_test_TFIDF = vectorizer_TFIDF.transform(xte)



In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [30]:
f1_macro_scorer = make_scorer(f1_score, average='macro')

In [36]:
clf = MultinomialNB()

In [37]:
clf = clf.fit(x_train_TFIDF, y_train)

In [38]:
predictions = clf.predict(x_test_TFIDF.toarray())
macro_score = f1_score(y_test, predictions, average='macro')

In [39]:
print(macro_score)

0.8984204774199408


In [40]:
clf = ComplementNB()

In [41]:
clf = clf.fit(x_train_TFIDF, y_train)

In [42]:
predictions = clf.predict(x_test_TFIDF.toarray())
macro_score = f1_score(y_test, predictions, average='macro')

In [43]:
print(macro_score)

0.9011009276712132


In [29]:
parameters = {
    'alpha':[0.01, 0.1, 0.5, 1.0]
}

In [45]:
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è RandomizedSearchCV
search = GridSearchCV(
    ComplementNB(),
    param_grid=parameters,
    scoring=f1_macro_scorer,          # –ú–µ—Ç—Ä–∏–∫–∞
    n_jobs=-1,                  # –ò—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å –≤—Å–µ —è–¥—Ä–∞
)

In [46]:
search.fit(x_train_TFIDF, y_train)

In [47]:
print("–õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã:", search.best_params_)
print()
print("–õ—É—á—à–∏–π F1 macro score:", search.best_score_)


–õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã: {'alpha': 0.5}

–õ—É—á—à–∏–π F1 macro score: 0.8934474245948845
