In [56]:
import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import LinearSVC
import mlflow
from gensim.models import Word2Vec
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
dataset = "ag_news"
ag_news_dataset = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

In [4]:
text_preprocess_types = [None]

words_classes = ['ALL']

frequency_filtration_types = ['low']

In [7]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation)  else ' ' for i in tokens)
    
    return tokens

In [8]:
txt0 = ag_news_dataset['train']['text'][0]
txt1 = base_preprocess(txt0)

print(txt0)
print(txt1)

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
wall st  bears claw back into the black       short sellers  wall street s dwindling band of ultra cynics  are seeing green again 


In [17]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens  

In [10]:
txt2 = different_preprocess(txt1, 'лемматизация', 'ALL')

print(different_preprocess(txt1, 'ничего', 'ALL'))
print(different_preprocess(txt1, 'стемминг', 'ALL'))
print(txt2)

wall st bears claw back into the black short sellers wall street s dwindling band of ultra cynics are seeing green again
wall st bear claw back into the black short seller wall street s dwindl band of ultra cynic are see green again
wall st bear claw back into the black short seller wall street s dwindling band of ultra cynic are seeing green again


In [11]:
def frequency_filtration(words_dictionary, frequency_filtration_type):
    if frequency_filtration_type == 'low':
        return dict([(key, value) for key, value in words_dictionary.items() if value >= 3 ])
    else:
        return words_dictionary

In [12]:
def dummy(doc):
    return doc

In [13]:
def final_preprocess(dataset, model_type):
    # Подготовка данных
    x_train = dataset['train']['text']
    y_train = dataset['train']['label']
    
    x_test = dataset['test']['text']
    y_test = dataset['test']['label']
    
    # Базовая обработка
    xtr = [base_preprocess(text) for text in x_train]
    xte = [base_preprocess(text) for text in x_test]
    
    index = 0    
    
    # Части речи + приведение
    xtr = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xtr]
    xte = [different_preprocess(tokens, text_preprocess_types, 'ALL') for tokens in xte]
    
    vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1,2), min_df=2, dtype=np.float32)
    vectorizer.fit(xtr)
    
    xtr = vectorizer.transform(xtr)
    xte = vectorizer.transform(xte)
        
    # Обучение классификаторов
    clf.fit(xtr, y_train)
    
    # Тестирование
    predictions = clf.predict(xte)
    
    # Метрики
    score = f1_score(y_test, predictions, average='macro')
    
    mlflow.start_run(run_name=f'{model_type}')
    mlflow.log_param('model', clf.__class__.__name__)
    mlflow.log_param('preprocess_type', 'ничего')
    mlflow.log_param('words_class', 'ALL')
    mlflow.log_metric('macro_score', score)
    mlflow.end_run()
    
    index += 1
    print(f'Итерация {index}')

In [18]:
dataset1 = ag_news_dataset # Без очистки от стоп-слов
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
x_train = dataset1['train']['text']
y_train = dataset1['train']['label']
    
x_test = dataset1['test']['text']
y_test = dataset1['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte1 = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [19]:
print(xtr1[0])

['wall', 'st', 'bear', 'claw', 'back', 'into', 'the', 'black', 'short', 'seller', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynic', 'are', 'seeing', 'green', 'again']


In [59]:
model = Word2Vec(
    sentences=xtr1,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=10,
    negative=5,
    sample=1e-3,
)

In [76]:
words = ['company',
         'government',
         'president',
         'technology',
         'market',
         'space',
         'movie',
         'war',
         'election',
         'financial'
         ]

In [77]:
for word in words:
    similar_words = model.wv.most_similar(word)
    print(f"\nСлово: '{word}'")
    print("Топ-10 ближайших:")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity:.3f}")
    print('\n')


Слово: 'company'
Топ-10 ближайших:
firm: 0.767
giant: 0.682
manufacturer: 0.658
industry: 0.627
vendor: 0.623
carrier: 0.620
supplier: 0.619
provider: 0.618
maker: 0.617
unit: 0.616



Слово: 'government'
Топ-10 ближайших:
authority: 0.605
administration: 0.569
congress: 0.560
opposition: 0.528
commission: 0.517
syria: 0.511
cabinet: 0.510
coalition: 0.507
military: 0.504
parliament: 0.503



Слово: 'president'
Топ-10 ближайших:
administration: 0.658
tyrant: 0.574
voltchkov: 0.555
governor: 0.539
doctrine: 0.538
leadership: 0.524
leader: 0.516
adwatch: 0.507
regime: 0.501
presidency: 0.495



Слово: 'technology'
Топ-10 ближайших:
cabilities: 0.679
architecture: 0.624
rfid: 0.619
device: 0.598
product: 0.596
computing: 0.594
networking: 0.592
chip: 0.591
technique: 0.590
cability: 0.578



Слово: 'market'
Топ-10 ближайших:
sector: 0.680
boom: 0.561
sale: 0.527
listing: 0.518
trend: 0.515
index: 0.514
growth: 0.513
momentum: 0.509
economy: 0.508
business: 0.500



Слово: 'space'
Топ-10 

In [61]:
def text_to_vector(tokens, model, vector_size):
    # Инициализируем нулевой вектор
    vector = np.zeros(vector_size)
    count = 0
    
    # Суммируем векторы всех слов в тексте
    for word in tokens:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    
    # Усредняем (чтобы избежать влияния длины текста)
    if count > 0:
        vector /= count
    
    return vector

In [63]:
# Преобразуем все тексты в векторы
X_train_vec = np.array([text_to_vector(text, model, 100) for text in xtr])
X_test_vec = np.array([text_to_vector(text, model, 100) for text in xte])

In [64]:
print(X_train_vec[0])

[-0.80815233 -0.19333562  0.2882349   0.35271781 -0.3430605  -0.1597423
  0.18682618 -0.33848218  0.42078045  0.50048873 -0.33110976 -0.6055087
  1.23823712 -0.16140241  0.35532136 -0.08891205  0.40009164  0.64110277
  0.3362271  -0.65411718  1.07277837 -0.15126366  0.45740098  0.12451344
  0.6053802   0.2392165  -0.42635354 -0.13693942 -0.12494822 -0.71354578
  0.11098836  0.72782723 -0.22933599  0.57843022 -0.07163183  1.45161162
 -0.52701171 -0.49506452  1.78272183 -0.69607961 -0.90094915  0.57114143
  0.56665688 -0.39060227  0.56118453  0.31770595  0.22018699 -0.62512825
 -0.90515638 -0.4140596  -0.63596746  0.20034514 -0.13541063  0.46488812
 -0.76655374 -0.71173644  0.44103987  0.94213649  0.60540897 -0.02566092
  0.32702463 -0.00544454 -0.50007798  0.24403984  0.08146099  0.76063252
 -0.56051885 -0.84373458  0.59298517  0.01252231 -0.67625714  1.16361518
  0.23749742  0.52514986  0.25292787 -0.09779385 -0.22713234 -1.30790416
  0.51929935  0.19791537 -0.06214432 -0.91423749 -1.6

In [65]:
clf = LinearSVC()
clf.fit(X_train_vec, y_train)

In [66]:
predictions = clf.predict(X_test_vec)
score = f1_score(y_test, predictions, average='macro')
print(score)

0.49100333038752986
