In [1]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
dataset = "ag_news"
ag_news_dataset = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

In [4]:
text_preprocess_types = [None]

words_classes = ['ALL']

frequency_filtration_types = ['low']

In [5]:
iterations_num = len(text_preprocess_types) * len(words_classes) * len(frequency_filtration_types)
print(iterations_num)

1


In [6]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation)  else ' ' for i in tokens)
    
    return tokens

In [7]:
txt0 = ag_news_dataset['train']['text'][0]
txt1 = base_preprocess(txt0)

print(txt0)
print(txt1)

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
wall st  bears claw back into the black       short sellers  wall street s dwindling band of ultra cynics  are seeing green again 


In [8]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)  

In [9]:
txt2 = different_preprocess(txt1, 'лемматизация', 'ALL')

print(different_preprocess(txt1, 'ничего', 'ALL'))
print(different_preprocess(txt1, 'стемминг', 'ALL'))
print(txt2)

wall st bears claw back into the black short sellers wall street s dwindling band of ultra cynics are seeing green again
wall st bear claw back into the black short seller wall street s dwindl band of ultra cynic are see green again
wall st bear claw back into the black short seller wall street s dwindling band of ultra cynic are seeing green again


In [10]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 3 ])
    else:
        return words_dictionary

In [11]:
def dummy(doc):
    return doc

In [12]:
def final_preprocess(dataset, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    xtr = [base_preprocess(text) for text in x_train]
    xte = [base_preprocess(text) for text in x_test]
    
    index = 0    
    
    # Части речи + приведение
    xtr = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xtr]
    xte = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xte]
    
    vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=2, dtype=np.float32)
    vectorizer.fit(xtr)
    
    xtr = vectorizer.transform(xtr)
    xte = vectorizer.transform(xte)

    # Построение классификаторов
    if model_type == 'ADA':
        clf = AdaBoostClassifier()
       # clf =  AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=12, max_depth=25, n_jobs=-1), n_estimators=50, learning_rate=0.3)
    elif model_type=='GBM':
        clf = GradientBoostingClassifier()
    elif model_type == 'XGB':
        clf = XGBClassifier()
        
    # Обучение классификаторов
    clf.fit(xtr, y_train)
    
    # Тестирование
    predictions = clf.predict(xte)
    
    # Метрики
    score = f1_score(y_test, predictions, average='macro')
    
    mlflow.start_run(run_name=f'{model_type}')
    mlflow.log_param('model', clf.__class__.__name__)
    mlflow.log_param('preprocess_type', 'ничего')
    mlflow.log_param('words_class', 'ALL')
    mlflow.log_metric('macro_score', score)
    mlflow.end_run()
    
    index += 1
    print(f'Итерация {index}')

In [54]:
dataset = ag_news_dataset
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
x_train = dataset['train']['text']
y_train = dataset['train']['label']
    
x_test = dataset['test']['text']
y_test = dataset['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [60]:
vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=5, dtype=np.float32)
vectorizer.fit(xtr1)

xtr = vectorizer.transform(xtr1)
xte = vectorizer.transform(xte1) 

In [61]:
print(xtr.shape)

(120000, 104479)


In [62]:
clf_svm = LinearSVC()
clf_svm.fit(xtr, y_train)

In [63]:
predictions_svm = clf_svm.predict(xte)
macro_score_svm = f1_score(y_test, predictions_svm, average='macro')

In [64]:
print(macro_score_svm)

0.9251280248036662


In [65]:
from sklearn.svm import SVC

In [66]:
clf = SVC()
clf.fit(xtr, y_train)

In [67]:
pred = clf.predict(xte)
score = f1_score(y_test, pred, average='macro')

In [68]:
print(score)

0.9255195847875842


## Подбор гипер-параметров

#### SVM - C, class_weight, loss, penalty, dual, multi_class  

In [73]:
f1_macro_scorer = make_scorer(f1_score, average='macro')

In [69]:
parameters1 = { # 10
    'C': [0.01, 0.1, 1, 10, 100],            #  5
    'class_weight': ['balanced', None],      #  2 
    'multi_class':['crammer_singer'],        #  1
} 

In [70]:
parameters2 = { # 40
    'C': [0.01, 0.1, 1, 10, 100],            #  5
    'class_weight': ['balanced', None],      #  2
    'loss': ['hinge', 'squared_hinge'],      #  2     
    'penalty': ['l2'],                       #  1
    'dual' : [True, False],                  #  2
    'multi_class':['ovr'],                   #  1
} 

In [71]:
parameters3 = { # 20
    'C': [0.01, 0.1, 1, 10, 100],            #  5
    'class_weight': ['balanced', None],      #  2
    'loss': ['squared_hinge'],               #  1     
    'penalty': ['l1'],                       #  1
    'dual' : [True, False],                  #  2
    'multi_class':['ovr'],                   #  1
} 

In [74]:
# Инициализация RandomizedSearchCV
search_svc3 = GridSearchCV(
    LinearSVC(),
    param_grid=parameters3,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=-1,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [75]:
search_svc3.fit(xtr, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [76]:
print("Лучшие параметры:", search_svc3.best_params_)
print()
print("Лучший F1 macro score:", search_svc3.best_score_)

Лучшие параметры: {'C': 1, 'class_weight': None, 'dual': False, 'loss': 'squared_hinge', 'multi_class': 'ovr', 'penalty': 'l1'}

Лучший F1 macro score: 0.9069483165135812


In [77]:
y_pred = search_svc3.best_estimator_.predict(xte)
sc = f1_score(y_test, y_pred, average='macro')
print(sc)

0.9233766747723486


In [78]:
# Инициализация RandomizedSearchCV
search_svc2 = GridSearchCV(
    LinearSVC(),
    param_grid=parameters2,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=6,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [79]:
search_svc2.fit(xtr, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [80]:
print("Лучшие параметры:", search_svc2.best_params_)
print()
print("Лучший F1 macro score:", search_svc2.best_score_)

Лучшие параметры: {'C': 1, 'class_weight': 'balanced', 'dual': True, 'loss': 'hinge', 'multi_class': 'ovr', 'penalty': 'l2'}

Лучший F1 macro score: 0.9117427787159066


In [81]:
y_pred2 = search_svc2.best_estimator_.predict(xte)
sc2 = f1_score(y_test, y_pred2, average='macro')
print(sc2)

0.9254725127452094


In [84]:
# Инициализация RandomizedSearchCV
search_svc1 = GridSearchCV(
    LinearSVC(),
    param_grid=parameters1,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=6,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [85]:
search_svc1.fit(xtr, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [86]:
print("Лучшие параметры:", search_svc1.best_params_)
print()
print("Лучший F1 macro score:", search_svc1.best_score_)

Лучшие параметры: {'C': 0.1, 'class_weight': 'balanced', 'multi_class': 'crammer_singer'}

Лучший F1 macro score: 0.9099054088139049


In [87]:
y_pred1 = search_svc1.best_estimator_.predict(xte)
sc1 = f1_score(y_test, y_pred1, average='macro')
print(sc1)

0.918998039432901


In [93]:
parameters2v2 = { # 6
    'C': [0.3, 0.5, 2],                      #  3
    'class_weight': ['balanced'],            #  1
    'loss': ['hinge', 'squared_hinge'],      #  2     
    'penalty': ['l2'],                       #  1
    'dual' : [True],                         #  1
    'multi_class':['ovr'],                   #  1
} 

In [94]:
search_svc2v2 = GridSearchCV(
    LinearSVC(),
    param_grid=parameters2v2,
    cv=5,                       # Количество фолдов
    scoring=f1_macro_scorer,    # Метрика
    n_jobs=6,                  # Использовать все ядра
    verbose=1,
    refit=True 
)

In [95]:
search_svc2v2.fit(xtr, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [96]:
print("Лучшие параметры:", search_svc2v2.best_params_)
print()
print("Лучший F1 macro score:", search_svc2v2.best_score_)

Лучшие параметры: {'C': 0.5, 'class_weight': 'balanced', 'dual': True, 'loss': 'hinge', 'multi_class': 'ovr', 'penalty': 'l2'}

Лучший F1 macro score: 0.9130865965349404


In [97]:
y_pred2v2 = search_svc2v2.best_estimator_.predict(xte)
sc2v2 = f1_score(y_test, y_pred2v2, average='macro')
print(sc2v2)

0.9245471198445654
