In [18]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB 
from sklearn.metrics import f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [19]:
filterwarnings("ignore")

In [20]:
dataset = "imdb"
ag_news_dataset = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

In [21]:
text_preprocess_types = [None]

words_classes = ['ALL']

frequency_filtration_types = ['low']

In [22]:
iterations_num = len(text_preprocess_types) * len(words_classes) * len(frequency_filtration_types)
print(iterations_num)

1


In [23]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation)  else ' ' for i in tokens)
    
    return tokens

In [24]:
txt0 = ag_news_dataset['train']['text'][0]
txt1 = base_preprocess(txt0)

print(txt0)
print(txt1)

I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, eve

In [25]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)  

In [26]:
txt2 = different_preprocess(txt1, 'лемматизация', 'ALL')

print(different_preprocess(txt1, 'ничего', 'ALL'))
print(different_preprocess(txt1, 'стемминг', 'ALL'))
print(txt2)

i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in 1967 i also heard that at first it was seized by u s customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for myself the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married men what kills me about i am curious yellow is that 40 years ago this was considered pornographic really the sex and nudity scenes are few and far between even then it s not shot like some cheaply

In [27]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 3 ])
    else:
        return words_dictionary

In [28]:
def dummy(doc):
    return doc

In [29]:
def final_preprocess(dataset, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    xtr = [base_preprocess(text) for text in x_train]
    xte = [base_preprocess(text) for text in x_test]
    
    index = 0    
    
    # Части речи + приведение
    xtr = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xtr]
    xte = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xte]
    
    vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=2, dtype=np.float32)
    vectorizer.fit(xtr)
    
    xtr = vectorizer.transform(xtr)
    xte = vectorizer.transform(xte)
        
    # Обучение классификаторов
    clf.fit(xtr, y_train)
    
    # Тестирование
    predictions = clf.predict(xte)
    
    # Метрики
    score = f1_score(y_test, predictions, average='macro')
    
    mlflow.start_run(run_name=f'{model_type}')
    mlflow.log_param('model', clf.__class__.__name__)
    mlflow.log_param('preprocess_type', 'ничего')
    mlflow.log_param('words_class', 'ALL')
    mlflow.log_metric('macro_score', score)
    mlflow.end_run()
    
    index += 1
    print(f'Итерация {index}')

In [30]:
dataset1 = ag_news_dataset
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
x_train = dataset1['train']['text']
y_train = dataset1['train']['label']
    
x_test = dataset1['test']['text']
y_test = dataset1['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [31]:
if dataset == 'ag_news':
    vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=5, dtype=np.float32)
elif dataset == 'imdb':
    vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=3, dtype=np.float32)
vectorizer.fit(xtr1)

xtr = vectorizer.transform(xtr1)
xte = vectorizer.transform(xte1) 

In [32]:
print(xtr.shape)

(25000, 161634)


#### BernoulliNB

In [33]:
clf_bnb = BernoulliNB()
clf_bnb.fit(xtr, y_train)

In [34]:
predictions_bnb = clf_bnb.predict(xte)
macro_score_bnb = f1_score(y_test, predictions_bnb, average='macro')
print(macro_score_bnb)

0.8516967383377632


#### MultinomialNB

In [35]:
clf_mnb = MultinomialNB()
clf_mnb.fit(xtr, y_train)

In [36]:
predictions_mnb = clf_mnb.predict(xte)
macro_score_mnb = f1_score(y_test, predictions_mnb, average='macro')
print(macro_score_mnb)

0.8514314724198603


#### ComplementNB

In [37]:
clf_cnb = ComplementNB()
clf_cnb.fit(xtr, y_train)

In [38]:
predictions_cnb = clf_cnb.predict(xte)
macro_score_cnb = f1_score(y_test, predictions_cnb, average='macro')
print(macro_score_cnb)

0.8514314724198603


## Подбор гипер-параметров

#### NB - alpha 

In [39]:
f1_macro_scorer = make_scorer(f1_score, average='macro')

In [40]:
parameters = { # 10
    'alpha':[0.01, 0.1, 0.5, 1, 10]
} 

#### BernoulliNB

In [41]:
# Инициализация RandomizedSearchCV
search_bnb = GridSearchCV(
    BernoulliNB(),
    param_grid=parameters,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [42]:
search_bnb.fit(xtr, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [43]:
print("Лучшие параметры:", search_bnb.best_params_)
print()
print("Лучший F1 macro score:", search_bnb.best_score_)

Лучшие параметры: {'alpha': 10}

Лучший F1 macro score: 0.8355490453859211


In [44]:
y_pred_bnb = search_bnb.best_estimator_.predict(xte)
sc_bnb = f1_score(y_test, y_pred_bnb, average='macro')
print(sc_bnb)

0.8461815820809724


#### MultinomialNB

In [45]:
# Инициализация RandomizedSearchCV
search_mnb = GridSearchCV(
    MultinomialNB(),
    param_grid=parameters,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [46]:
search_mnb.fit(xtr, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [47]:
print("Лучшие параметры:", search_mnb.best_params_)
print()
print("Лучший F1 macro score:", search_mnb.best_score_)

Лучшие параметры: {'alpha': 10}

Лучший F1 macro score: 0.8266591026401808


In [48]:
y_pred_mnb = search_mnb.best_estimator_.predict(xte)
sc_mnb = f1_score(y_test, y_pred_mnb, average='macro')
print(sc_mnb)

0.8438440022103378


#### ComplementNB

In [49]:
# Инициализация RandomizedSearchCV
search_cnb = GridSearchCV(
    ComplementNB(),
    param_grid=parameters,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [50]:
search_cnb.fit(xtr, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [51]:
print("Лучшие параметры:", search_cnb.best_params_)
print()
print("Лучший F1 macro score:", search_cnb.best_score_)

Лучшие параметры: {'alpha': 10}

Лучший F1 macro score: 0.8266591026401808


In [52]:
y_pred_cnb = search_cnb.best_estimator_.predict(xte)
sc_cnb = f1_score(y_test, y_pred_cnb, average='macro')
print(sc_cnb)

0.8438440022103378
