In [1]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
dataset = "imdb"
ag_news_dataset = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

In [4]:
text_preprocess_types = [None]

words_classes = ['ALL']

frequency_filtration_types = ['low']

In [5]:
iterations_num = len(text_preprocess_types) * len(words_classes) * len(frequency_filtration_types)
print(iterations_num)

1


In [6]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation)  else ' ' for i in tokens)
    
    return tokens

In [7]:
txt0 = ag_news_dataset['train']['text'][0]
txt1 = base_preprocess(txt0)

print(txt0)
print(txt1)

I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, eve

In [8]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)  

In [9]:
txt2 = different_preprocess(txt1, 'лемматизация', 'ALL')

print(different_preprocess(txt1, 'ничего', 'ALL'))
print(different_preprocess(txt1, 'стемминг', 'ALL'))
print(txt2)

i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in 1967 i also heard that at first it was seized by u s customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for myself the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married men what kills me about i am curious yellow is that 40 years ago this was considered pornographic really the sex and nudity scenes are few and far between even then it s not shot like some cheaply

In [10]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 3 ])
    else:
        return words_dictionary

In [11]:
def dummy(doc):
    return doc

In [12]:
def final_preprocess(dataset, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    xtr = [base_preprocess(text) for text in x_train]
    xte = [base_preprocess(text) for text in x_test]
    
    index = 0    
    
    # Части речи + приведение
    xtr = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xtr]
    xte = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xte]
    
    vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=2, dtype=np.float32)
    vectorizer.fit(xtr)
    
    xtr = vectorizer.transform(xtr)
    xte = vectorizer.transform(xte)

    # Построение классификаторов
    if model_type == 'ADA':
        clf = AdaBoostClassifier()
       # clf =  AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
    elif model_type=='GBM':
        clf = GradientBoostingClassifier()
    elif model_type == 'XGB':
        clf = XGBClassifier()
        
    # Обучение классификаторов
    clf.fit(xtr, y_train)
    
    # Тестирование
    predictions = clf.predict(xte)
    
    # Метрики
    score = f1_score(y_test, predictions, average='macro')
    
    mlflow.start_run(run_name=f'{model_type}')
    mlflow.log_param('model', clf.__class__.__name__)
    mlflow.log_param('preprocess_type', 'ничего')
    mlflow.log_param('words_class', 'ALL')
    mlflow.log_metric('macro_score', score)
    mlflow.end_run()
    
    index += 1
    print(f'Итерация {index}')

In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="agNewsENSEMBLE")

In [18]:
final_preprocess(ag_news_dataset, model_type='GB')

In [60]:
dataset = ag_news_dataset
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
x_train = dataset['train']['text']
y_train = dataset['train']['label']
    
x_test = dataset['test']['text']
y_test = dataset['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=3, dtype=np.float32, max_features=100000)
vectorizer.fit(xtr)

xtr = vectorizer.transform(xtr)
xte = vectorizer.transform(xte) 

In [61]:
print(xtr.shape)

(25000, 100000)


In [65]:
clf_ada = AdaBoostClassifier(estimator=RandomForestClassifier(max_depth=30, n_estimators=50, min_samples_split=10, n_jobs=-1), n_estimators=100, learning_rate=0.2)

clf_ada.fit(xtr, y_train)

In [66]:
predictions_ada = clf_ada.predict(xte)
macro_score_ada = f1_score(y_test, predictions_ada)

In [67]:
print(macro_score_ada)

0.879231731428802


In [None]:
[871, 876, 879]

In [83]:
clf_gbm = GradientBoostingClassifier(n_estimators=25)
clf_gbm.fit(xtr, y_train)

MemoryError: Unable to allocate 44.7 GiB for an array with shape (120000, 100000) and data type float32

In [75]:
predictions_gbm = clf_gbm.predict(xte)
macro_score_gbm = f1_score(y_test, predictions_gbm, average='macro')

In [76]:
print(macro_score_gbm)

0.7398679601660276


In [86]:
clf_xgb = XGBClassifier(device='cuda')
clf_xgb.fit(xtr, y_train)

In [87]:
predictions_xgb = clf_xgb.predict(xte)
macro_score_xgb = f1_score(y_test, predictions_xgb, average='macro')

In [88]:
print(macro_score_xgb)

0.8933065363459356


## Подбор гипер-параметров

#### XGB - max_depth, colsample_bytree, subsample, reg_lambda, learning_rate

In [20]:
f1_macro_scorer = f1_score

In [42]:
parameters = {
    'max_depth': [5, 7, 10],              # Максимальная глубина дерева
    'colsample_bytree': [0.3,0.4, 0.75],        # Процент задействованных столбцов на дерево
    'reg_lambda': [0.1, 0.5, 1],                  # Коэффициент L2 регуляризации
    'learning_rate': [0.5 ,0.25, 0.1],              # Скорость обучения 
    'num_parallel_tree' : [1, 2]    
}

In [43]:
# Инициализация RandomizedSearchCV
random_search = RandomizedSearchCV(
    XGBClassifier(device='cuda'),
    param_distributions=parameters,
    n_iter=7,                  # Количество итераций
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
   # n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True,
    random_state=42,
)

In [44]:
random_search.fit(xtr, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


XGBoostError: [23:07:23] C:\actions-runner\_work\xgboost\xgboost\src\common\device_vector.cu:23: Memory allocation error on worker 0: bad allocation: cudaErrorMemoryAllocation: out of memory
- Free memory: 0B
- Requested memory: 7.04516GB


In [38]:
print("Лучшие параметры:", random_search.best_params_)
print()
print("Лучший F1 macro score:", random_search.best_score_)

Лучшие параметры: {'reg_lambda': 0.1, 'num_parallel_tree': 2, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.3}

Лучший F1 macro score: nan


In [39]:
y_pred = random_search.best_estimator_.predict(xte)
sc = f1_score(y_test, y_pred)
print(sc)

0.818383542066268


#### GBM - max_depth, learning_rate, subsample, max_features

In [26]:
GBM_parameters = {
    'max_depth': [7, 10],              # Максимальная глубина дерева
    'learning_rate': [0.1, 0.13, 0.15],              # Скорость обучения 
    'max_features':['sqrt', 'log2', 0.1, 0.2],
    'subsample':[0.5,0.6, 0.7],
}

In [27]:
# Инициализация RandomizedSearchCV
GBM_search = RandomizedSearchCV(
    GradientBoostingClassifier(n_estimators=50),
    param_distributions=GBM_parameters,
    n_iter=5,                  # Количество итераций
    cv=4,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True
)

In [28]:
GBM_search.fit(xtr, y_train)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


In [29]:
print("Лучшие параметры:", GBM_search.best_params_)
print()
print("Лучший F1 macro score:", GBM_search.best_score_)

Лучшие параметры: {'subsample': 0.6, 'max_features': 0.1, 'max_depth': 10, 'learning_rate': 0.13}

Лучший F1 macro score: nan


In [34]:
y_pred = GBM_search.best_estimator_.predict(xte)
sc = f1_score(y_test, y_pred)
print(sc)

0.8301251956181533


#### ADA - base_estimator__max_depth, n_estimators, learning_rate

In [None]:
ADA_parameters = {
    'estimator__max_depth': [3, 5, 7],              # Максимальная глубина дерева
    'n_estimators': [10, 25, 50],              # Скорость обучения 
    'learning_rate': [0.1, 0.5, 0.05],
}

In [None]:
# Инициализация RandomizedSearchCV
ADA_search = RandomizedSearchCV(
    AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=7)),
    param_distributions=ADA_parameters,
    n_iter=5,  # Количество итераций
    cv=4,  # Количество фолдов
    scoring=f1_macro_scorer,  # Метрика
    # n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True
)

In [None]:
ADA_search.fit(xtr, y_train)

In [None]:
print("Лучшие параметры:", ADA_search.best_params_)
print()
print("Лучший F1 macro score:", ADA_search.best_score_)

In [None]:
y_pred = ADA_search.best_estimator_.predict(xte)
sc = f1_score(y_test, y_pred, average='macro')
print(sc)