In [4]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import mlflow
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [5]:
filterwarnings("ignore")

In [6]:
ag_news_dataset = load_dataset("ag_news")
stop_words = set(stopwords.words("english"))

In [7]:
#text_preprocess_types = [None, 'стемминг', 'лемматизация']
text_preprocess_types = [None, 'лемматизация']
#words_classes = ['N', 'NJ', 'NJV', 'ALL']
words_classes = ['NJ', 'ALL']
#n_components = [10, 25, 50, 100, 250 ,500, 1000]
n_components = [10, 50, 100, 250]

#frequency_filtration_types = [None, 'low', 'high', 'both']
frequency_filtration_types = [None, 'low']

vector_representation_types = ['binary', 'count', 'tfidf']

In [8]:
iterations_num = len(text_preprocess_types) * len(words_classes) * len(frequency_filtration_types)
print(iterations_num)

8


In [9]:
def base_ag_news_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
    for word in special_words:
        tokens = tokens.replace(word, '')
    
    pattern = r'[&lt][^<>]*&gt'
    tokens = re.sub(pattern, '', tokens)
    
    # Удаление пунктуации и цифр
    #tokens = ''.join(i if i not in set(string.punctuation) - set('-') | set(string.digits) else ' ' for i in tokens)
    tokens = ''.join(i if i not in set(string.punctuation)  | set(string.digits) else ' ' for i in tokens)
    
    # Токенизация
    tokens = nltk.word_tokenize(tokens)
    
    # Удаление стоп слов
    #stop_wordsL = stop_words - {'no','not'}
    stop_wordsL = stop_words
    tokens = [word for word in tokens if (word not in stop_wordsL and word != '-')]
    return tokens

In [10]:
def different_ag_news_preprocess(tokens, preprocess_type, words_class):
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    # Обработка частей речи
    tokens = pos_tag(tokens)
    if words_class == 'N':
        tokens = [word for word, tag in tokens if tag.startswith('N')]
    elif words_class == 'NJ':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
    elif words_class == 'NJV':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    elif words_class == 'ALL':
        tokens = [word for word, _ in tokens]
    
    return tokens  

In [11]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 10 ])
    elif frequency_filtration_type == 'high':
        return dict([(key, value) for key, value in words_dictionary.items() if value <= 3000])
    elif frequency_filtration_type == 'both':
        return dict([(key, value) for key, value in words_dictionary.items() if 10 <= value <= 3000])
    else:
        return words_dictionary

In [12]:
def dummy(doc):
    return doc

In [13]:
def final_ag_news_preprocess(dataset, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    for i, text in enumerate(x_train):
        x_train[i] = base_ag_news_preprocess(text)
        
    for i, text in enumerate(x_test):
        x_test[i] = base_ag_news_preprocess(text)
    
    index = 0
    # Вариативная обработка
    for preprocess_type in text_preprocess_types: 
        for words_class in words_classes:         
            words = {}
            xtr = x_train
            xte = x_test
            
            # Обработка текстов
            for i, tokens in enumerate(xtr):
                final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
                xtr[i] = final_tokens
                
                # Заполнение словаря
                for token in final_tokens:
                    if token not in words:
                        words[token] = 1
                    else:
                        words[token] += 1
            
            xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
            # Фильтрация по частоте
            for frequency_filtration_type in frequency_filtration_types: 
                filtered_words = frequency_filtration(words, frequency_filtration_type)
                token_length = len(filtered_words)
                
                # Векторизация слов
                word_list = sorted(filtered_words.keys())
                # Присвоение словам индексов
                words_indexed = {}
                for idx, word in enumerate(word_list):
                    words_indexed[word] = idx
                
                # TF-IDF
                vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
                x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
                x_test_TFIDF = vectorizer_TFIDF.transform(xte)
                
                #LSA
                for n_num in n_components:
                    SVD = TruncatedSVD(n_components=n_num)
                    
                    SVD.fit(x_train_TFIDF)
                    
                    x_train_SVD = SVD.transform(x_train_TFIDF)
                    x_test_SVD = SVD.transform(x_test_TFIDF)

                    # Построение классификаторов
                    if model_type == 'ADA':
                        clf_SVD =  AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
                    elif model_type=='GB':
                        clf_SVD = HistGradientBoostingClassifier()
                    
                    # Обучение классификаторов
                    clf_SVD.fit(x_train_SVD, y_train)
                    
                    # Тестирование
                    predictions_SVD = clf_SVD.predict(x_test_SVD)
                    
                    # Метрики
                    macro_score_SVD = f1_score(y_test, predictions_SVD, average='macro')
                    
                    mlflow.start_run(run_name=f'{model_type}_{preprocess_type}_{words_class}_{frequency_filtration_type}_SVD')
                    mlflow.log_param('model', clf_SVD.__class__.__name__)
                    mlflow.log_param('preprocess_type', preprocess_type)
                    mlflow.log_param('words_class', words_class)
                    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
                    mlflow.log_param('token_length', token_length)
                    mlflow.log_param('n_components', n_num)
                    #mlflow.sklearn.log_model(clf_SVD, 'ADA(forest)')
                    mlflow.log_metric('macro_score', macro_score_SVD)
                    mlflow.end_run()
                    
                    index += 1
                    print(f'Итерация {index} / 42')

In [10]:
def one_run(dataset, preprocess_type, words_class, frequency_filtration_type, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    for i, text in enumerate(x_train):
        x_train[i] = base_ag_news_preprocess(text)
        
    for i, text in enumerate(x_test):
        x_test[i] = base_ag_news_preprocess(text)
    
    words = {}
    xtr = x_train
    xte = x_test
            
    # Обработка текстов
    for i, tokens in enumerate(xtr):
        final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
        xtr[i] = final_tokens
                
        # Заполнение словаря
        for token in final_tokens:
            if token not in words:
                words[token] = 1
            else:
                words[token] += 1
            
    xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
    # Фильтрация по частоте
    filtered_words = frequency_filtration(words, frequency_filtration_type)
    token_length = len(filtered_words)
                
    # Векторизация слов
    word_list = sorted(filtered_words.keys())
    # Присвоение словам индексов
    words_indexed = {}
    for idx, word in enumerate(word_list):
        words_indexed[word] = idx
        
    # OHE
    vectorizer_OHE = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8, binary=True)
    x_train_OHE = vectorizer_OHE.fit_transform(xtr)
    x_test_OHE = vectorizer_OHE.transform(xte)
                
    # COUNT
    vectorizer_COUNT = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8)
    x_train_COUNT = vectorizer_COUNT.fit_transform(xtr)
    x_test_COUNT = vectorizer_COUNT.transform(xte)
                
    # TF-IDF
    vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
    x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
    x_test_TFIDF = vectorizer_TFIDF.transform(xte)
    
    # Построение классификаторов
    if model_type == 'DT':
        clf_OHE = DecisionTreeClassifier()
        clf_COUNT = DecisionTreeClassifier()
        clf_TFIDF = DecisionTreeClassifier()
    elif model_type == 'RF':
        clf_OHE = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        clf_COUNT = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        clf_TFIDF = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    
    # Обучение классификаторов
    clf_OHE = clf_OHE.fit(x_train_OHE, y_train)
    clf_COUNT = clf_COUNT.fit(x_train_COUNT, y_train)
    clf_TFIDF = clf_TFIDF.fit(x_train_TFIDF, y_train)
                
    # Тестирование
    predictions_OHE = clf_OHE.predict(x_test_OHE)
    predictions_COUNT = clf_COUNT.predict(x_test_COUNT)
    predictions_TFIDF = clf_TFIDF.predict(x_test_TFIDF)
                
    # Метрики
    macro_score_OHE = f1_score(y_test, predictions_OHE, average='macro')
    macro_score_COUNT = f1_score(y_test, predictions_COUNT, average='macro')
    macro_score_TFIDF = f1_score(y_test, predictions_TFIDF, average='macro')
    
    mlflow.start_run(run_name=f'test_{preprocess_type}_{words_class}_{frequency_filtration_type}_OHE')
    mlflow.log_param('model', clf_OHE.__class__.__name__)
    mlflow.log_param('tree_depth', clf_OHE.get_depth())
    mlflow.log_param('preprocess_type', preprocess_type)
    mlflow.log_param('words_class', words_class)
    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
    mlflow.log_param('token_length', token_length)
    mlflow.sklearn.log_model(clf_OHE, 'DecisionTreeClassifier')
    mlflow.log_metric('macro_score', macro_score_OHE)
    mlflow.end_run()
    
    mlflow.start_run(run_name=f'test_{preprocess_type}_{words_class}_{frequency_filtration_type}_COUNT')
    mlflow.log_param('model', clf_COUNT.__class__.__name__)
    mlflow.log_param('tree_depth', clf_TFIDF.get_depth())
    mlflow.log_param('preprocess_type', preprocess_type)
    mlflow.log_param('words_class', words_class)
    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
    mlflow.log_param('token_length', token_length)
    mlflow.sklearn.log_model(clf_COUNT, 'DecisionTreeClassifier')
    mlflow.log_metric('macro_score', macro_score_COUNT)
    mlflow.end_run()
    
    mlflow.start_run(run_name=f'test_{preprocess_type}_{words_class}_{frequency_filtration_type}_TFIDF')
    mlflow.log_param('model', clf_TFIDF.__class__.__name__)
    mlflow.log_param('tree_depth', clf_TFIDF.get_depth())
    mlflow.log_param('preprocess_type', preprocess_type)
    mlflow.log_param('words_class', words_class)
    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
    mlflow.log_param('token_length', token_length)
    mlflow.sklearn.log_model(clf_TFIDF, 'DecisionTreeClassifier')
    mlflow.log_metric('macro_score', macro_score_TFIDF)
    mlflow.end_run()
    

In [14]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="agNewsENSEMBLE")

<Experiment: artifact_location='mlflow-artifacts:/799133258121577462', creation_time=1742305645886, experiment_id='799133258121577462', last_update_time=1742305645886, lifecycle_stage='active', name='agNewsENSEMBLE', tags={}>

In [15]:
final_ag_news_preprocess(ag_news_dataset, model_type='GB')

🏃 View run GB_None_NJ_None_SVD at: http://127.0.0.1:5000/#/experiments/799133258121577462/runs/b47595abea4e46c9b4a2ed3d3533f002
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/799133258121577462
Итерация 1 / 42
🏃 View run GB_None_NJ_None_SVD at: http://127.0.0.1:5000/#/experiments/799133258121577462/runs/285104c9b68c456089b0e8e06bf9791f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/799133258121577462
Итерация 2 / 42
🏃 View run GB_None_NJ_None_SVD at: http://127.0.0.1:5000/#/experiments/799133258121577462/runs/35eaf735db5f4d02ae760a9bc7f50ade
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/799133258121577462
Итерация 3 / 42
🏃 View run GB_None_NJ_None_SVD at: http://127.0.0.1:5000/#/experiments/799133258121577462/runs/2a5b6a42989d4d77bc36932429eef5cc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/799133258121577462
Итерация 4 / 42
🏃 View run GB_None_NJ_low_SVD at: http://127.0.0.1:5000/#/experiments/799133258121577462/runs/289b8d0168fd450a8e91f9

In [14]:
#one_run(ag_news_dataset, 'лемматизация', 'NJ', 'low', 'RF')

In [16]:
cc = DecisionTreeClassifier()

In [15]:
dataset = ag_news_dataset
preprocess_type = None
words_class = 'ALL'
frequency_filtration_type = None

# Подготовка данных
x_train = dataset['train']['text']
y_train = dataset['train']['label']
    
x_test = dataset['test']['text']
y_test = dataset['test']['label']
    
# Базовая обработка
for i, text in enumerate(x_train):
    x_train[i] = base_ag_news_preprocess(text)
        
for i, text in enumerate(x_test):
    x_test[i] = base_ag_news_preprocess(text)
    
words = {}
xtr = x_train
xte = x_test
            
# Обработка текстов
for i, tokens in enumerate(xtr):
    final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
    xtr[i] = final_tokens
                
    # Заполнение словаря
    for token in final_tokens:
        if token not in words:
            words[token] = 1
        else:
            words[token] += 1
            
xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
# Фильтрация по частоте
filtered_words = frequency_filtration(words, frequency_filtration_type)
token_length = len(filtered_words)
                
# Векторизация слов
word_list = sorted(filtered_words.keys())
# Присвоение словам индексов
words_indexed = {}
for idx, word in enumerate(word_list):
    words_indexed[word] = idx
    
"""# OHE
vectorizer_OHE = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8, binary=True)
x_train_OHE = vectorizer_OHE.fit_transform(xtr)
x_test_OHE = vectorizer_OHE.transform(xte)
            
# COUNT
vectorizer_COUNT = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8)
x_train_COUNT = vectorizer_COUNT.fit_transform(xtr)
x_test_COUNT = vectorizer_COUNT.transform(xte)"""
            
# TF-IDF
vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
x_test_TFIDF = vectorizer_TFIDF.transform(xte)

In [None]:
k=1

In [298]:
SVD = TruncatedSVD(n_components=1000, random_state=42)

In [299]:
SVD.fit(x_train_TFIDF)

In [300]:
x_train_SVD = SVD.transform(x_train_TFIDF)
x_test_SVD = SVD.transform(x_test_TFIDF)

In [304]:
clf_SVD = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=15, n_jobs=-1), n_estimators=50, learning_rate=0.3)
#clf_SVD = HistGradientBoostingClassifier()
clf_SVD = clf_SVD.fit(x_train_SVD, y_train)

In [305]:
predictions_ADA_SVD = clf_SVD.predict(x_test_SVD)
macro_score_ADA_SVD = f1_score(y_test, predictions_ADA_SVD, average='macro')

In [306]:
print(macro_score_ADA_SVD)

0.86164617144008


873

In [26]:
mlflow.start_run(run_name=f'test_{k}_SVD')
mlflow.log_param('model', clf_SVD.__class__.__name__)
mlflow.log_param('preprocess_type', preprocess_type)
mlflow.log_param('words_class', words_class)
mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
mlflow.log_param('token_length', token_length)
mlflow.sklearn.log_model(clf_SVD)
mlflow.log_metric('macro_score', macro_score_ADA_SVD)
mlflow.end_run()
k+=1



🏃 View run test_1_TFIDF at: http://127.0.0.1:5000/#/experiments/799133258121577462/runs/bafba86b133149248c1c83ef61f5feec
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/799133258121577462


In [177]:
clf_OHE_ADA.__class__.__name__

'AdaBoostClassifier'