In [1]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import mlflow
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
ag_news_dataset = load_dataset("ag_news")
stop_words = set(stopwords.words("english"))

#### –ù—É–∂–Ω–æ: (–Ω–∏—á–µ–≥–æ, –°—Ç–µ–º–º–∏–Ω–≥, –õ–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è) * (binary, count, tf-idf)) -> F1_Macro

In [4]:
text_preprocess_types = [None, '—Å—Ç–µ–º–º–∏–Ω–≥', '–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è']
#words_classes = ['N', 'NJ', 'NJV', 'ALL']
words_classes = ['NJ', 'NJV', 'ALL']

#frequency_filtration_types = [None, 'low', 'high', 'both']
frequency_filtration_types = [None]

vector_representation_types = ['binary', 'count', 'tfidf']

In [5]:
iterations_num = len(text_preprocess_types) * len(words_classes) * len(frequency_filtration_types)
print(iterations_num)

24


In [6]:
def base_ag_news_preprocess(text):
    tokens = text.lower()

    # –£–¥–∞–ª–µ–Ω–∏–µ —Å–ø–µ—Ü —Å–ª–æ–≤
    special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
    for word in special_words:
        tokens = tokens.replace(word, '')
    
    pattern = r'[&lt][^<>]*&gt'
    tokens = re.sub(pattern, '', tokens)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏ –∏ —Ü–∏—Ñ—Ä
    #tokens = ''.join(i if i not in set(string.punctuation) - set('-') | set(string.digits) else ' ' for i in tokens)
    tokens = ''.join(i if i not in set(string.punctuation)  | set(string.digits) else ' ' for i in tokens)
    
    # –¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è
    tokens = nltk.word_tokenize(tokens)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø —Å–ª–æ–≤
    #stop_wordsL = stop_words - {'no','not'}
    stop_wordsL = stop_words
    tokens = [word for word in tokens if (word not in stop_wordsL and word != '-')]
    return tokens

In [7]:
def different_ag_news_preprocess(tokens, preprocess_type, words_class):
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å–ª–æ–≤
    if preprocess_type == '–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == '—Å—Ç–µ–º–º–∏–Ω–≥':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —á–∞—Å—Ç–µ–π —Ä–µ—á–∏
    tokens = pos_tag(tokens)
    if words_class == 'N':
        tokens = [word for word, tag in tokens if tag.startswith('N')]
    elif words_class == 'NJ':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
    elif words_class == 'NJV':
        tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    elif words_class == 'ALL':
        tokens = [word for word, _ in tokens]
    
    return tokens  

In [8]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 10 ])
    elif frequency_filtration_type == 'high':
        return dict([(key, value) for key, value in words_dictionary.items() if value <= 3000])
    elif frequency_filtration_type == 'both':
        return dict([(key, value) for key, value in words_dictionary.items() if 10 <= value <= 3000])
    else:
        return words_dictionary

In [9]:
def dummy(doc):
    return doc

In [178]:
def final_ag_news_preprocess(dataset, model_type):
    # –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # –ë–∞–∑–æ–≤–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
    for i, text in enumerate(x_train):
        x_train[i] = base_ag_news_preprocess(text)
        
    for i, text in enumerate(x_test):
        x_test[i] = base_ag_news_preprocess(text)
    
    index = 2
    # –í–∞—Ä–∏–∞—Ç–∏–≤–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
    for preprocess_type in text_preprocess_types: # 3 –≤–∞—Ä–∏–∞–Ω—Ç–∞
        for words_class in words_classes:         # 4 –≤–∞—Ä–∏–∞–Ω—Ç–∞
            words = {}
            xtr = x_train
            xte = x_test
            
            # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
            for i, tokens in enumerate(xtr):
                final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
                xtr[i] = final_tokens
                
                # –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è
                for token in final_tokens:
                    if token not in words:
                        words[token] = 1
                    else:
                        words[token] += 1
            
            xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
            # –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –ø–æ —á–∞—Å—Ç–æ—Ç–µ
            for frequency_filtration_type in frequency_filtration_types: # 4 –≤–∞—Ä–∏–∞–Ω—Ç–∞
                filtered_words = frequency_filtration(words, frequency_filtration_type)
                token_length = len(filtered_words)
                
                # –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è —Å–ª–æ–≤
                word_list = sorted(filtered_words.keys())
                # –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ —Å–ª–æ–≤–∞–º –∏–Ω–¥–µ–∫—Å–æ–≤
                words_indexed = {}
                for idx, word in enumerate(word_list):
                    words_indexed[word] = idx
                
                # OHE
                vectorizer_OHE = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8, binary=True)
                x_train_OHE = vectorizer_OHE.fit_transform(xtr)
                x_test_OHE = vectorizer_OHE.transform(xte)
                
                # COUNT
                vectorizer_COUNT = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8)
                x_train_COUNT = vectorizer_COUNT.fit_transform(xtr)
                x_test_COUNT = vectorizer_COUNT.transform(xte)
                
                # TF-IDF
                vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
                x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
                x_test_TFIDF = vectorizer_TFIDF.transform(xte)

                # –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–≤
                if model_type == 'DT':
                    clf_OHE = DecisionTreeClassifier()
                    clf_COUNT = DecisionTreeClassifier()
                    clf_TFIDF = DecisionTreeClassifier()
                elif model_type == 'RF':
                    clf_OHE = RandomForestClassifier(n_estimators=50, n_jobs=-1)
                    clf_COUNT = RandomForestClassifier(n_estimators=50, n_jobs=-1)
                    clf_TFIDF = RandomForestClassifier(n_estimators=50, n_jobs=-1)
                elif model_type == 'ADA':
                    clf_OHE =  AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
                    clf_COUNT =  AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
                    clf_TFIDF =  AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
                
                # –û–±—É—á–µ–Ω–∏–µ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–≤
                clf_OHE = clf_OHE.fit(x_train_OHE, y_train)
                clf_COUNT = clf_COUNT.fit(x_train_COUNT, y_train)
                clf_TFIDF = clf_TFIDF.fit(x_train_TFIDF, y_train)
                
                # –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ
                predictions_OHE = clf_OHE.predict(x_test_OHE)
                predictions_COUNT = clf_COUNT.predict(x_test_COUNT)
                predictions_TFIDF = clf_TFIDF.predict(x_test_TFIDF)
                
                # –ú–µ—Ç—Ä–∏–∫–∏
                macro_score_OHE = f1_score(y_test, predictions_OHE, average='macro')
                macro_score_COUNT = f1_score(y_test, predictions_COUNT, average='macro')
                macro_score_TFIDF = f1_score(y_test, predictions_TFIDF, average='macro')
                
                mlflow.start_run(run_name=f'{model_type}_{preprocess_type}_{words_class}_{frequency_filtration_type}_OHE')
                mlflow.log_param('model', clf_OHE.__class__.__name__)
                mlflow.log_param('preprocess_type', preprocess_type)
                mlflow.log_param('words_class', words_class)
                mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
                mlflow.log_param('token_length', token_length)
                mlflow.sklearn.log_model(clf_OHE, 'DecisionTreeClassifier')
                mlflow.log_metric('macro_score', macro_score_OHE)
                mlflow.end_run()
                
                mlflow.start_run(run_name=f'{model_type}_{preprocess_type}_{words_class}_{frequency_filtration_type}_COUNT')
                mlflow.log_param('model', clf_COUNT.__class__.__name__)
                mlflow.log_param('preprocess_type', preprocess_type)
                mlflow.log_param('words_class', words_class)
                mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
                mlflow.log_param('token_length', token_length)
                mlflow.sklearn.log_model(clf_COUNT, 'DecisionTreeClassifier')
                mlflow.log_metric('macro_score', macro_score_COUNT)
                mlflow.end_run()
                
                mlflow.start_run(run_name=f'{model_type}_{preprocess_type}_{words_class}_{frequency_filtration_type}_TFIDF')
                mlflow.log_param('model', clf_TFIDF.__class__.__name__)
                mlflow.log_param('preprocess_type', preprocess_type)
                mlflow.log_param('words_class', words_class)
                mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
                mlflow.log_param('token_length', token_length)
                mlflow.sklearn.log_model(clf_TFIDF, 'DecisionTreeClassifier')
                mlflow.log_metric('macro_score', macro_score_TFIDF)
                mlflow.end_run()
                
                index += 1
                print(f'–ò—Ç–µ—Ä–∞—Ü–∏—è {index} / {iterations_num}')

In [11]:
def one_run(dataset, preprocess_type, words_class, frequency_filtration_type, model_type):
    # –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # –ë–∞–∑–æ–≤–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
    for i, text in enumerate(x_train):
        x_train[i] = base_ag_news_preprocess(text)
        
    for i, text in enumerate(x_test):
        x_test[i] = base_ag_news_preprocess(text)
    
    words = {}
    xtr = x_train
    xte = x_test
            
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
    for i, tokens in enumerate(xtr):
        final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
        xtr[i] = final_tokens
                
        # –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è
        for token in final_tokens:
            if token not in words:
                words[token] = 1
            else:
                words[token] += 1
            
    xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
    # –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –ø–æ —á–∞—Å—Ç–æ—Ç–µ
    filtered_words = frequency_filtration(words, frequency_filtration_type)
    token_length = len(filtered_words)
                
    # –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è —Å–ª–æ–≤
    word_list = sorted(filtered_words.keys())
    # –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ —Å–ª–æ–≤–∞–º –∏–Ω–¥–µ–∫—Å–æ–≤
    words_indexed = {}
    for idx, word in enumerate(word_list):
        words_indexed[word] = idx
        
    # OHE
    vectorizer_OHE = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8, binary=True)
    x_train_OHE = vectorizer_OHE.fit_transform(xtr)
    x_test_OHE = vectorizer_OHE.transform(xte)
                
    # COUNT
    vectorizer_COUNT = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8)
    x_train_COUNT = vectorizer_COUNT.fit_transform(xtr)
    x_test_COUNT = vectorizer_COUNT.transform(xte)
                
    # TF-IDF
    vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
    x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
    x_test_TFIDF = vectorizer_TFIDF.transform(xte)
    
    # –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–≤
    if model_type == 'DT':
        clf_OHE = DecisionTreeClassifier()
        clf_COUNT = DecisionTreeClassifier()
        clf_TFIDF = DecisionTreeClassifier()
    elif model_type == 'RF':
        clf_OHE = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        clf_COUNT = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        clf_TFIDF = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    
    # –û–±—É—á–µ–Ω–∏–µ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–æ–≤
    clf_OHE = clf_OHE.fit(x_train_OHE, y_train)
    clf_COUNT = clf_COUNT.fit(x_train_COUNT, y_train)
    clf_TFIDF = clf_TFIDF.fit(x_train_TFIDF, y_train)
                
    # –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ
    predictions_OHE = clf_OHE.predict(x_test_OHE)
    predictions_COUNT = clf_COUNT.predict(x_test_COUNT)
    predictions_TFIDF = clf_TFIDF.predict(x_test_TFIDF)
                
    # –ú–µ—Ç—Ä–∏–∫–∏
    macro_score_OHE = f1_score(y_test, predictions_OHE, average='macro')
    macro_score_COUNT = f1_score(y_test, predictions_COUNT, average='macro')
    macro_score_TFIDF = f1_score(y_test, predictions_TFIDF, average='macro')
    
    mlflow.start_run(run_name=f'test_{preprocess_type}_{words_class}_{frequency_filtration_type}_OHE')
    mlflow.log_param('model', clf_OHE.__class__.__name__)
    mlflow.log_param('tree_depth', clf_OHE.get_depth())
    mlflow.log_param('preprocess_type', preprocess_type)
    mlflow.log_param('words_class', words_class)
    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
    mlflow.log_param('token_length', token_length)
    mlflow.sklearn.log_model(clf_OHE, 'DecisionTreeClassifier')
    mlflow.log_metric('macro_score', macro_score_OHE)
    mlflow.end_run()
    
    mlflow.start_run(run_name=f'test_{preprocess_type}_{words_class}_{frequency_filtration_type}_COUNT')
    mlflow.log_param('model', clf_COUNT.__class__.__name__)
    mlflow.log_param('tree_depth', clf_TFIDF.get_depth())
    mlflow.log_param('preprocess_type', preprocess_type)
    mlflow.log_param('words_class', words_class)
    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
    mlflow.log_param('token_length', token_length)
    mlflow.sklearn.log_model(clf_COUNT, 'DecisionTreeClassifier')
    mlflow.log_metric('macro_score', macro_score_COUNT)
    mlflow.end_run()
    
    mlflow.start_run(run_name=f'test_{preprocess_type}_{words_class}_{frequency_filtration_type}_TFIDF')
    mlflow.log_param('model', clf_TFIDF.__class__.__name__)
    mlflow.log_param('tree_depth', clf_TFIDF.get_depth())
    mlflow.log_param('preprocess_type', preprocess_type)
    mlflow.log_param('words_class', words_class)
    mlflow.log_param('frequency_filtration_type', frequency_filtration_type)
    mlflow.log_param('token_length', token_length)
    mlflow.sklearn.log_model(clf_TFIDF, 'DecisionTreeClassifier')
    mlflow.log_metric('macro_score', macro_score_TFIDF)
    mlflow.end_run()
    

In [179]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="agNewsADA")

2025/03/12 00:10:31 INFO mlflow.tracking.fluent: Experiment with name 'agNewsADA' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/793654491394688587', creation_time=1741727431630, experiment_id='793654491394688587', last_update_time=1741727431630, lifecycle_stage='active', name='agNewsADA', tags={}>

In [180]:
final_ag_news_preprocess(ag_news_dataset, model_type='ADA')



üèÉ View run ADA_None_N_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/2f9f6fa7205b4302bafd231eb383eb56
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_N_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/ec6294b3c04c46d2ac594ce3b832cc5d
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_N_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/9c0733dc616d4630844eb12fa3b37fdf
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 3 / 24




üèÉ View run ADA_None_N_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/9c7e4d7fec274f929cd6ed2e0219d865
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_N_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/7caf2601c201439da7f761f23e107801
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_N_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/23d71cd0da82418c9cbe69fd642ec542
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 4 / 24




üèÉ View run ADA_None_NJ_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/802d0353191e41b39aab4b34b169c318
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJ_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/dd3848289984405cb180bf98c2735d29
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJ_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/ea69445e48994e15a3c9d6192f09af82
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 5 / 24




üèÉ View run ADA_None_NJ_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/26dd64ba75544d12a7abd6ec6d583f1e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJ_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/b7979e4db2e649b4bd04ff76ee6ef252
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJ_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/b2e2ee2190f249c1b2dbc2f5a49881dc
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 6 / 24




üèÉ View run ADA_None_NJV_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/084cd3785fb041b28bacc6b0528613a9
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJV_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/c08d774cb8af44d0ae7547a2ce2fcad9
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJV_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/fae72237056641d4b36ded1db04667ab
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 7 / 24




üèÉ View run ADA_None_NJV_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/fa1bb6a43f3342f484203f1070bdc299
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJV_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/2798134494094a2888ff42ced64a1c61
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_NJV_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/c7f6131001b848fd9c2262dee583ec1e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 8 / 24




üèÉ View run ADA_None_ALL_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/a403402143564422bbbb74375e19c583
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_ALL_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/9322e06559094478bcaa0db5f6233606
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_ALL_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/0fff36a3664341c3a062e6c32d5ccca3
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 9 / 24




üèÉ View run ADA_None_ALL_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/2510a3fc87e04bacb19bcf3283e5cc1e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_ALL_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/efcf3bff68cb4b6d8479531ac1c9c668
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_None_ALL_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/77777c300f4a4b0b914506657f17cf7d
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 10 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_N_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/91b068c92b534931a63e27b414b24098
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_N_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/1ef5bcf520b646a290d72e69a6dbdaaa
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_N_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/9d846df31b894f0ea2cd548ebad31406
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 11 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_N_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/a98c601d6add445586f1abbdece53b1f
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_N_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/ccdfb3ef28e04bd68d10fa5fe1766052
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_N_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/819cda4d0ea14750a2d5413e10dc41f4
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 12 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJ_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/403fb031d8f040069fe825dc2f91ea07
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJ_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/678e2e20ad96487ebe1d0ff30927eebb
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJ_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/43e1dc8bef17452e83bb42d6450832f1
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 13 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJ_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/37dd6eb846e34ff4a3d26a793b174976
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJ_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/7dc2984a53dd466eacab5c1c027415d6
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJ_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/6c291ce385574ae09d4eba291da5a896
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 14 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJV_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/622498ff51a9404982fc4711ca99391a
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJV_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/58dbc6ca8bb64431a83bcbf55903e69f
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJV_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/2451314ae85948c9b8554f8ca7692d71
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 15 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJV_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/dc7bbf0c81db4ca88d276d715d19b442
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJV_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/c911e131b4134142bb27ea7e26a63449
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_NJV_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/94bd8e174ed046dbbc923efd2c253736
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 16 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/2a11d27eb4fe4f529043bce2fa0cabd6
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/84f90be712f4483abcb69c073c303911
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/b4a36391369e418694724e82075b48c3
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 17 / 24




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/54d091dac38b43229a43f0f9c0f10e55
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/2356fa1e025c487d87e85d18666e0926
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_—Å—Ç–µ–º–º–∏–Ω–≥_ALL_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/172cf01416fa40a982d0bd91c524add0
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 18 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_N_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/597d31e5cc4547f0b5a459a96eb7d158
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_N_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/3a016c3fc3d54154a90ddafe2e1b11ce
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_N_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/aa13ebc1b42e4fdbafb0c010a0baa395
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 19 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_N_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/0aa067f71167451197f61426c6d72451
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_N_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/66b2eda4bc85451ebf382df8dc16ac0c
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_N_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/6bb32740b0484d258479fcfa9a702c75
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 20 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/0b349b622ba046ffb18b0f8d5092d92e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/8b2f9bc15c48434dad5de6c87618c334
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/27ecd5c9afbc45099cec89ef7c551993
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 21 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/a50c04fcd394427c9451e4e30ee3d90e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/6950eace46eb4c1bb4574e6f56f8c89a
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/3a01abf5f19641dca25598c962256f4a
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 22 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJV_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/eaaa668022f149aa9082fe4349b07b2d
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJV_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/9e802cbd2ad84f8d9e06de75df983391
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJV_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/878e2c140dba48d798da5c15f67696db
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 23 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJV_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/edaa3041328f49da86018c0e7030c7d8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJV_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/3db149e7c62b407f98008f1a6dfdb152
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJV_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/13834928ee64497c86d3e9c7ac96af58
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 24 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_ALL_None_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/1f693c8b55464a6ab7d9788ff20ed2ab
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_ALL_None_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/485197ffc05849ed8e54af80bbb74fef
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_ALL_None_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/24a1351c781747ed9a8f7edd53531d73
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 25 / 24




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_ALL_low_OHE at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/103cc6a184484693bbfbcd7aaf57a018
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_ALL_low_COUNT at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/1873838f0ce74569a3186c5ee45539e3
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587




üèÉ View run ADA_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_ALL_low_TFIDF at: http://127.0.0.1:5000/#/experiments/793654491394688587/runs/75580dcd2a114361921b0faaebbcd608
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/793654491394688587
–ò—Ç–µ—Ä–∞—Ü–∏—è 26 / 24


In [13]:
one_run(ag_news_dataset, '–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è', 'NJ', 'low', 'RF')



üèÉ View run test_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_low_OHE at: http://127.0.0.1:5000/#/experiments/537408113752698407/runs/23ea94598c4143bfb298c45c0cf0974a
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/537408113752698407




üèÉ View run test_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_low_COUNT at: http://127.0.0.1:5000/#/experiments/537408113752698407/runs/9198223fdacc4638acea6f8344de3f75
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/537408113752698407




üèÉ View run test_–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è_NJ_low_TFIDF at: http://127.0.0.1:5000/#/experiments/537408113752698407/runs/4b6ae5d8cddc46358b79466b1a52e854
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/537408113752698407


In [16]:
cc = DecisionTreeClassifier()

In [21]:
dataset = ag_news_dataset
preprocess_type = '–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è'
words_class = 'NJ'
frequency_filtration_type = 'low'

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö
x_train = dataset['train']['text']
y_train = dataset['train']['label']
    
x_test = dataset['test']['text']
y_test = dataset['test']['label']
    
# –ë–∞–∑–æ–≤–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞
for i, text in enumerate(x_train):
    x_train[i] = base_ag_news_preprocess(text)
        
for i, text in enumerate(x_test):
    x_test[i] = base_ag_news_preprocess(text)
    
words = {}
xtr = x_train
xte = x_test
            
# –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤
for i, tokens in enumerate(xtr):
    final_tokens = different_ag_news_preprocess(tokens, preprocess_type, words_class)
    xtr[i] = final_tokens
                
    # –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è
    for token in final_tokens:
        if token not in words:
            words[token] = 1
        else:
            words[token] += 1
            
xte = [different_ag_news_preprocess(tokens, preprocess_type, words_class) for tokens in xte]
                    
# –§–∏–ª—å—Ç—Ä–∞—Ü–∏—è –ø–æ —á–∞—Å—Ç–æ—Ç–µ
filtered_words = frequency_filtration(words, frequency_filtration_type)
token_length = len(filtered_words)
                
# –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è —Å–ª–æ–≤
word_list = sorted(filtered_words.keys())
# –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ —Å–ª–æ–≤–∞–º –∏–Ω–¥–µ–∫—Å–æ–≤
words_indexed = {}
for idx, word in enumerate(word_list):
    words_indexed[word] = idx
    
# OHE
vectorizer_OHE = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8, binary=True)
x_train_OHE = vectorizer_OHE.fit_transform(xtr)
x_test_OHE = vectorizer_OHE.transform(xte)
            
# COUNT
vectorizer_COUNT = CountVectorizer(vocabulary=words_indexed, tokenizer=dummy, preprocessor=dummy, dtype=np.int8)
x_train_COUNT = vectorizer_COUNT.fit_transform(xtr)
x_test_COUNT = vectorizer_COUNT.transform(xte)
            
# TF-IDF
vectorizer_TFIDF = TfidfVectorizer(vocabulary=words_indexed, preprocessor=dummy, tokenizer=dummy, dtype=np.float32)
x_train_TFIDF = vectorizer_TFIDF.fit_transform(xtr)
x_test_TFIDF = vectorizer_TFIDF.transform(xte)

In [73]:
clf_OHE = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clf_OHE = clf_OHE.fit(x_train_OHE, y_train)

In [74]:
predictions_OHE = clf_OHE.predict(x_test_OHE)
macro_score_OHE = f1_score(y_test, predictions_OHE, average='macro')

In [75]:
print(macro_score_OHE)

0.8811401647147361


In [12]:
clf_COUNT = RandomForestClassifier(n_estimators=50, n_jobs=-1)
#clf_COUNT = clf_COUNT.fit(x_train_COUNT, y_train)

In [79]:
predictions_COUNT = clf_COUNT.predict(x_test_OHE)
macro_score_COUNT = f1_score(y_test, predictions_COUNT, average='macro')

In [80]:
print(macro_score_COUNT)

0.8793384460507578


In [81]:
clf_TFIDF = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clf_TFIDF = clf_TFIDF.fit(x_train_TFIDF, y_train)

In [82]:
predictions_TFIDF = clf_TFIDF.predict(x_test_TFIDF)
macro_score_TFIDF = f1_score(y_test, predictions_TFIDF, average='macro')

In [84]:
print(macro_score_TFIDF)

0.8800442265813493


In [72]:
clf_OHE_GB = GradientBoostingClassifier()
clf_OHE_GB = clf_OHE_GB.fit(x_train_OHE, y_train)

In [75]:
predictions_GB_OHE = clf_OHE_GB.predict(x_test_OHE)
macro_score_OHE_GB = f1_score(y_test, predictions_GB_OHE, average='macro')

In [76]:
print(macro_score_OHE_GB)

0.8202423408128906


In [174]:
clf_OHE_ADA = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
clf_OHE_ADA = clf_OHE_ADA.fit(x_train_OHE, y_train)

In [175]:
predictions_ADA_OHE = clf_OHE_ADA.predict(x_test_OHE)
macro_score_ADA_OHE = f1_score(y_test, predictions_ADA_OHE, average='macro')

In [176]:
print(macro_score_ADA_OHE)

0.8810723137667106


In [171]:
clf_COUNT_ADA = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
clf_COUNT_ADA = clf_OHE_ADA.fit(x_train_COUNT, y_train)

In [172]:
predictions_ADA_COUNT = clf_COUNT_ADA.predict(x_test_COUNT)
macro_score_ADA_COUNT = f1_score(y_test, predictions_ADA_COUNT, average='macro')

In [173]:
print(macro_score_ADA_COUNT)

0.8812967726235373


In [168]:
clf_TFIDF_ADA = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
clf_TFIDF_ADA = clf_OHE_ADA.fit(x_train_TFIDF, y_train)

In [169]:
predictions_ADA_TFIDF = clf_TFIDF_ADA.predict(x_test_TFIDF)
macro_score_ADA_TFIDF = f1_score(y_test, predictions_ADA_TFIDF, average='macro')

In [170]:
print(macro_score_ADA_TFIDF)

0.8857382437564187


In [177]:
clf_OHE_ADA.__class__.__name__

'AdaBoostClassifier'