### Оптимальная настройка TfidfVectorizer и RandomForest

In [1]:
from sklearn.neural_network import MLPClassifier
import pandas as pd
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.metrics import f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearnex import patch_sklearn
from warnings import filterwarnings
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import scipy.stats as stats
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
ag_news_dataset = load_dataset("ag_news")
stop_words = set(stopwords.words("english"))

In [4]:
#text_preprocess_types = [None, 'стемминг', 'лемматизация']
text_preprocess_types = [None, 'лемматизация']

#words_classes = ['N', 'NJ', 'NJV', 'ALL']
words_classes = ['ALL']

#frequency_filtration_types = [None, 'low', 'high', 'both']
frequency_filtration_types = [None, 'low']

n_components = [250, 375, 500, 750]

hidden_layer_size = [375, 500, 750]

In [5]:
def base_ag_news_preprocess(text):
    """ Удаление спец. слов, html-тегов, пунктуации и цифр"""
    tokens = text.lower()

    # Удаление спец слов
    special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
    for word in special_words:
        tokens = tokens.replace(word, '')
    
    pattern = r'[&lt][^<>]*&gt'
    tokens = re.sub(pattern, '', tokens)
    
    # Удаление пунктуации и цифр
    #tokens = ''.join(i if i not in set(string.punctuation) - set('-') | set(string.digits) else ' ' for i in tokens)
    tokens = ''.join(i if i not in set(string.punctuation)  | set(string.digits) else ' ' for i in tokens)
    
    # Токенизация
    tokens = nltk.word_tokenize(tokens)
    
    # Удаление стоп слов
    #stop_wordsL = stop_words - {'no','not'}
    #stop_wordsL = stop_words
    #tokens = [word for word in tokens if (word not in stop_wordsL and word != '-')]
    return tokens

In [6]:
def different_ag_news_preprocess(tokens, preprocess_type, words_class):
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    # Обработка частей речи
    tokens = pos_tag(tokens)
    if words_class == 'N':
        tokens = [word for word, tag in tokens if tag.startswith('N')]
    elif words_class == 'NJ':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
    elif words_class == 'NJV':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    elif words_class == 'ALL':
        tokens = [word for word, _ in tokens]
    
    return ' '.join(tokens)  

In [7]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 10 ])
    elif frequency_filtration_type == 'high':
        return dict([(key, value) for key, value in words_dictionary.items() if value <= 3000])
    elif frequency_filtration_type == 'both':
        return dict([(key, value) for key, value in words_dictionary.items() if 10 <= value <= 3000])
    else:
        return words_dictionary

In [8]:
def dummy(doc):
    return doc

In [9]:
def final_ag_news_preprocess(dataset, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    for i, text in enumerate(x_train):
        x_train[i] = base_ag_news_preprocess(text)
        
    for i, text in enumerate(x_test):
        x_test[i] = base_ag_news_preprocess(text)
    
    index = 0
    # Вариативная обработка
    for preprocess_type in text_preprocess_types: 
        for words_class in words_classes:         
            words = {}
            xtr = x_train
            xte = x_test
            
            # Обработка текстов
            for i, tokens in enumerate(xtr):
                final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
                xtr[i] = final_tokens
                
                # Заполнение словаря
                for token in final_tokens:
                    if token not in words:
                        words[token] = 1
                    else:
                        words[token] += 1
            
            xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
            # Фильтрация по частоте
            for frequency_filtration_type in frequency_filtration_types: 
                filtered_words = frequency_filtration(words, frequency_filtration_type)
                token_length = len(filtered_words)
                
                # Векторизация слов
                word_list = sorted(filtered_words.keys())
                # Присвоение словам индексов
                words_indexed = {}
                for idx, word in enumerate(word_list):
                    words_indexed[word] = idx
                
                # TF-IDF
                vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
                x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
                x_test_TFIDF = vectorizer_TFIDF.transform(xte)
                
                #LSI
                for n_num in n_components:
                    SVD_TFIDF = TruncatedSVD(n_components=n_num)
                    
                    SVD_TFIDF.fit(x_train_TFIDF)
                    
                    x_train_TFIDF_SVD = SVD_TFIDF.transform(x_train_TFIDF)
                    x_test_TFIDF_SVD = SVD_TFIDF.transform(x_test_TFIDF)

                    # Построение классификаторов
                    for size in hidden_layer_size:
                        mlp_clf_TFIDF = MLPClassifier(hidden_layer_sizes=(size,), activation='relu', solver='adam', max_iter=20, random_state=42)
                    
                        # Обучение классификаторов
                        mlp_clf_TFIDF.fit(x_train_TFIDF_SVD, y_train)
                        
                        # Тестирование
                        predictions_TFIDF = mlp_clf_TFIDF.predict(x_test_TFIDF_SVD)
                        
                        # Метрики
                        macro_score_TFIDF = f1_score(y_test, predictions_TFIDF, average='macro')
                        
                        mlflow.start_run(run_name=f'{model_type}_{size}_SVD{n_num}_TFIDF_Test3')
                        mlflow.log_param('model', mlp_clf_TFIDF.__class__.__name__)
                        mlflow.log_param('preprocess_type', preprocess_type)
                        mlflow.log_param('words_class', words_class)
                        mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
                        mlflow.log_param('token_length', token_length)
                        mlflow.log_param('n_components', n_num)
                        mlflow.log_param('hidden_layer_sizes', size)
                        mlflow.log_param('word vectorizer', 'TFIDF')
                        #mlflow.sklearn.log_model(clf_SVD, 'ADA(forest)')
                        mlflow.log_metric('macro_score', macro_score_TFIDF)
                        mlflow.end_run()
                        
                        index += 1
                        print(f'Итерация {index}')

In [4]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="agNewsHyperParams")

2025/04/23 15:32:23 INFO mlflow.tracking.fluent: Experiment with name 'agNewsHyperParams' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/263959826627955483', creation_time=1745411543826, experiment_id='263959826627955483', last_update_time=1745411543826, lifecycle_stage='active', name='agNewsHyperParams', tags={}>

In [160]:
final_ag_news_preprocess(ag_news_dataset, 'MLP')

🏃 View run MLP_375_SVD250_TFIDF_Test3 at: http://127.0.0.1:5000/#/experiments/277054424505203466/runs/562e69c269354769b098e400f39adfc9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/277054424505203466
Итерация 1
🏃 View run MLP_500_SVD250_TFIDF_Test3 at: http://127.0.0.1:5000/#/experiments/277054424505203466/runs/50811e8238644a10b27c2a71f787fdd3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/277054424505203466
Итерация 2
🏃 View run MLP_750_SVD250_TFIDF_Test3 at: http://127.0.0.1:5000/#/experiments/277054424505203466/runs/b263704760b54eb3a49eca388a5a54da
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/277054424505203466
Итерация 3
🏃 View run MLP_375_SVD375_TFIDF_Test3 at: http://127.0.0.1:5000/#/experiments/277054424505203466/runs/523e7cf83e924540b40f1f112ad9e93b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/277054424505203466
Итерация 4
🏃 View run MLP_500_SVD375_TFIDF_Test3 at: http://127.0.0.1:5000/#/experiments/277054424505203466/runs/180f68

Ручные тесты


In [10]:
dataset = ag_news_dataset
preprocess_type = 'лемматизация'
words_class = 'ALL'

In [11]:
# Подготовка данных
x_train = dataset['train']['text']
y_train = dataset['train']['label']
    
x_test = dataset['test']['text']
y_test = dataset['test']['label']

In [12]:
# Базовая обработка
for i, text in enumerate(x_train):
    x_train[i] = base_ag_news_preprocess(text)
        
for i, text in enumerate(x_test):
    x_test[i] = base_ag_news_preprocess(text)

In [13]:
xtr = x_train
xte = x_test

# Обработка текстов
xtr = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [14]:
TFIDF_parameters = {
    'tfidf__max_df': [0.75, 0.80, 0.85, 0.90, 0.95, 0.99, 1], 
    'tfidf__min_df': stats.randint(1, 10),     # 1-10
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [None, 1000, 5000, 10000, 15000],
}

In [15]:
f1_macro_scorer = make_scorer(f1_score, average='macro')

In [16]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', norm='l2', sublinear_tf=True)),
    ('clf', LogisticRegression(max_iter=500))
])

In [17]:
# Инициализация RandomizedSearchCV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=TFIDF_parameters,
    n_iter=30,                  # Количество итераций
    cv=7,                       # Количество фолдов
    scoring=f1_macro_scorer,          # Метрика
    n_jobs=-1,                  # Использовать все ядра
    random_state=42,
    refit=True 
)

In [18]:
random_search.fit(xtr, y_train)

Fitting 7 folds for each of 30 candidates, totalling 210 fits


In [19]:
print("Лучшие параметры:", random_search.best_params_)
print()
print("Лучший F1 macro score:", random_search.best_score_)


Лучшие параметры: {'tfidf__max_df': 0.85, 'tfidf__max_features': None, 'tfidf__min_df': 4, 'tfidf__ngram_range': (1, 2)}

Лучший F1 macro score: 0.9017956189538373


In [20]:
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tfidf__max_df,param_tfidf__max_features,param_tfidf__min_df,param_tfidf__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,mean_test_score,std_test_score,rank_test_score
0,4.907923,0.091819,0.0,0.0,1.0,10000.0,8,"(1, 1)","{'tfidf__max_df': 1, 'tfidf__max_features': 10...",,,,,,,,,,26
1,53.046672,2.436265,1.766834,0.298872,0.95,1000.0,3,"(1, 3)","{'tfidf__max_df': 0.95, 'tfidf__max_features':...",0.843051,0.864555,0.86183,0.86238,0.877554,0.856636,0.858741,0.860678,0.009523,20
2,12.586709,0.81789,1.403513,0.563617,0.85,5000.0,8,"(1, 1)","{'tfidf__max_df': 0.85, 'tfidf__max_features':...",0.873092,0.89569,0.890724,0.894258,0.903842,0.891507,0.896059,0.892167,0.008739,16
3,11.298095,0.907018,0.739323,0.166041,0.9,5000.0,6,"(1, 1)","{'tfidf__max_df': 0.9, 'tfidf__max_features': ...",0.87304,0.895397,0.890003,0.89391,0.904353,0.891207,0.896096,0.892001,0.008853,18
4,26.522366,1.002697,1.242224,0.203829,0.8,10000.0,6,"(1, 2)","{'tfidf__max_df': 0.8, 'tfidf__max_features': ...",0.881327,0.899163,0.896802,0.897495,0.909234,0.895828,0.902343,0.897456,0.007826,7
5,27.291676,1.420292,1.301739,0.077175,0.9,15000.0,1,"(1, 2)","{'tfidf__max_df': 0.9, 'tfidf__max_features': ...",0.882357,0.901272,0.899286,0.899334,0.909634,0.897433,0.903571,0.898984,0.007735,3
6,11.589283,0.683653,0.751458,0.044401,0.99,15000.0,9,"(1, 1)","{'tfidf__max_df': 0.99, 'tfidf__max_features':...",0.87883,0.900699,0.894927,0.897107,0.907159,0.896648,0.901439,0.896687,0.008198,10
7,46.212748,1.272055,1.568787,0.098498,0.85,5000.0,3,"(1, 3)","{'tfidf__max_df': 0.85, 'tfidf__max_features':...",0.873509,0.895059,0.89192,0.892442,0.903145,0.891789,0.896659,0.892075,0.00843,17
8,12.413423,1.620551,0.944015,0.238046,0.9,,3,"(1, 1)","{'tfidf__max_df': 0.9, 'tfidf__max_features': ...",0.879375,0.901032,0.896087,0.897564,0.908992,0.896184,0.90136,0.897228,0.008376,8
9,53.997126,2.622517,2.223571,0.509859,0.85,15000.0,9,"(1, 3)","{'tfidf__max_df': 0.85, 'tfidf__max_features':...",0.881892,0.901777,0.899276,0.899469,0.910114,0.897435,0.90327,0.899033,0.007972,2


In [34]:
vectorizer0 = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
vectorizer0.fit(xtr)

xtrain = vectorizer0.transform(xtr)
modell = LogisticRegression()
modell.fit(xtrain, y_train)

xtest = vectorizer0.transform(xte)
y_predl = modell.predict(xtest)

sc = f1_score(y_test, y_predl, average='macro')

In [35]:
print(sc)

0.9112595241853377
