In [194]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Model
from keras.src.layers import Softmax
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Concatenate, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import F1Score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from tensorflow.keras.initializers import Constant

import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import mlflow
from sklearnex import patch_sklearn
from warnings import filterwarnings
from gensim.models import Word2Vec
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [171]:
filterwarnings("ignore")

In [172]:
dataset = "ag_news"
data = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

DataFilesNotFoundError: No (supported) data files found in ag_news

In [4]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation)  else ' ' for i in tokens)
    
    return tokens

In [173]:
txt1 = base_preprocess(data['train']['text'][0])
print(txt1)

wall st  bears claw back into the black       short sellers  wall street s dwindling band of ultra cynics  are seeing green again 


In [174]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    
    # Обработка стоп-слов
    tokens = [token for token in tokens if token not in stop_words]
    
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)  

In [175]:
txt2 = different_preprocess(txt1,'лемматизация', 'ALL')
print(txt2)

wall st bear claw back black short seller wall street dwindling band ultra cynic seeing green


In [178]:
dataset1 = data 
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
shuffled_train = data["train"].shuffle(seed=42)
x_train = shuffled_train['text']
y_train = shuffled_train['label']
    
x_test = dataset1['test']['text']
y_test = dataset1['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [179]:
print(xtr[0])

bangladesh paralysed strike opposition activist brought many town city bangladesh halt day 18 people died explosion political rally


In [180]:
xtr1 = [x.split() for x in xtr]
xte1 = [x.split() for x in xte]

In [181]:
print(xtr1[0])

['bangladesh', 'paralysed', 'strike', 'opposition', 'activist', 'brought', 'many', 'town', 'city', 'bangladesh', 'halt', 'day', '18', 'people', 'died', 'explosion', 'political', 'rally']


#### Блок Word2Vec

In [182]:
model = Word2Vec(
    sentences=xtr1,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=5,
    negative=7,
    sample=1e-3,
)

In [183]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(xtr)
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, model.vector_size))
for word, i in tokenizer.word_index.items():
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]

In [212]:
embedding_dim = model.vector_size

lstm = Sequential()
embedding_layer = Embedding(
    input_dim=vocab_size,   
    output_dim=embedding_dim, 
    embeddings_initializer=Constant(embedding_matrix),
    input_length=125,
    trainable=True,     # дообучение эмбеддингов
    mask_zero=True
)
lstm.add(embedding_layer)
lstm.add(SpatialDropout1D(0.2))
lstm.add(LSTM(128,dropout=0.2, recurrent_dropout=0.2))
lstm.add(Dense(4, activation='softmax'))  # Для бинарной классификации

lstm.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01),
metrics=[F1Score(average='macro')])
lstm.summary()

In [213]:
X = tokenizer.texts_to_sequences(xtr)
X = pad_sequences(X, maxlen=125, padding='post')

In [214]:
XT = tokenizer.texts_to_sequences(xte)
XT = pad_sequences(XT, maxlen=125, padding='post')

In [215]:
print(X[0])

[ 1799 19137   348   688  2194  1337   254   793    98  1799  1546    29
   468    55   714   820   489   580     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]


In [216]:
le = LabelBinarizer()
y_train1 = le.fit_transform(np.array(y_train).reshape(-1,1))
y_train1 = np.array(y_train1)

In [217]:
y_test1 = le.transform(np.array(y_test).reshape(-1,1))

In [218]:
print(y_train1[:5])

[[1 0 0 0]
 [0 1 0 0]
 [1 0 0 0]
 [0 0 0 1]
 [1 0 0 0]]


In [219]:
lstm.fit(X, y_train1, epochs=3, batch_size=128, validation_split=0.1)

Epoch 1/3
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 197ms/step - f1_score: 0.8658 - loss: 0.3911 - val_f1_score: 0.9109 - val_loss: 0.2716
Epoch 2/3
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 195ms/step - f1_score: 0.9261 - loss: 0.2291 - val_f1_score: 0.9129 - val_loss: 0.2797
Epoch 3/3
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 199ms/step - f1_score: 0.9401 - loss: 0.1813 - val_f1_score: 0.9103 - val_loss: 0.3027


<keras.src.callbacks.history.History at 0x1c361cb42c0>

In [220]:
y_pred = lstm.predict(XT)

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step


In [221]:
y_predtr = le.inverse_transform(y_pred)
print(y_predtr.shape)

(7600,)


In [222]:
f1 = f1_score(y_predtr, y_test, average='macro')
print("F1-score:", f1)

F1-score: 0.9075636728452149


In [211]:
# Без до обучения - [883, 888] 1 эпоха, 3 эпохи
# С дообучением - [907, 909] 3 эпохи, 1 эпоха