In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.src.layers import Softmax
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import F1Score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer

import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import mlflow
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
dataset = "ag_news"
data = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

In [4]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation)  else ' ' for i in tokens)
    
    return tokens

In [5]:
txt1 = base_preprocess(data['train']['text'][0])
print(txt1)

wall st  bears claw back into the black       short sellers  wall street s dwindling band of ultra cynics  are seeing green again 


In [6]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    
    # Обработка стоп-слов
    tokens = [token for token in tokens if token not in stop_words]
    
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)  

In [7]:
txt2 = different_preprocess(txt1,'лемматизация', 'ALL')
print(txt2)

wall st bear claw back black short seller wall street dwindling band ultra cynic seeing green


In [10]:
dataset1 = data 
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
shuffled_train = data["train"].shuffle(seed=42)
x_train = shuffled_train['text']
y_train = shuffled_train['label']
    
x_test = dataset1['test']['text']
y_test = dataset1['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [11]:
xtr = np.array(xtr)
xte = np.array(xte)

In [12]:
print(xtr[0])

bangladesh paralysed strike opposition activist brought many town city bangladesh halt day 18 people died explosion political rally


In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(xtr)
x = tokenizer.texts_to_sequences(xtr)
x = pad_sequences(x, maxlen=125)

print(x.shape)

(120000, 125)


In [14]:
x_test = tokenizer.texts_to_sequences(xte)
x_test = pad_sequences(x_test, maxlen=125)

In [15]:
print(x[1])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0 38708  4215  1887   153  2116  4479  1169   150  5306   358  5048
   341 31256  4036  4215   122]


In [16]:
print(y_train[:100])

[0, 1, 0, 3, 0, 3, 0, 3, 3, 2, 1, 2, 3, 0, 2, 1, 1, 2, 2, 3, 3, 2, 3, 2, 0, 1, 3, 2, 1, 1, 3, 1, 0, 1, 2, 0, 0, 0, 2, 2, 0, 3, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 0, 3, 1, 1, 2, 2, 3, 2, 3, 3, 0, 0, 1, 3, 1, 1, 1, 2, 0, 2, 2, 3, 2, 1, 1, 3, 1, 0, 0, 0, 2, 0, 1, 2, 0, 3, 1, 0, 1, 0, 3, 3, 2, 1, 2, 1, 2, 1]


In [17]:
le = LabelBinarizer()
y_train2 = le.fit_transform(np.array(y_train).reshape(-1,1))
y_train2 = np.array(y_train2)

In [18]:
y_test2 = le.transform(np.array(y_test).reshape(-1,1))

In [19]:
print(y_train2[:5])

[[1 0 0 0]
 [0 1 0 0]
 [1 0 0 0]
 [0 0 0 1]
 [1 0 0 0]]


In [26]:
# Build the LSTM model
model = Sequential([
    Embedding(len(tokenizer.word_index) + 1, output_dim = 100, input_length=125),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(4, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01),
metrics=[F1Score(average='macro')])
model.summary()

In [27]:
# Train the model
model.fit(x, y_train2, epochs=3, batch_size=128, validation_split=0.1)

Epoch 1/3
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 197ms/step - f1_score: 0.8393 - loss: 0.4510 - val_f1_score: 0.9151 - val_loss: 0.2740
Epoch 2/3
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 194ms/step - f1_score: 0.9358 - loss: 0.1977 - val_f1_score: 0.9134 - val_loss: 0.2781
Epoch 3/3
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 191ms/step - f1_score: 0.9476 - loss: 0.1532 - val_f1_score: 0.9023 - val_loss: 0.3287


<keras.src.callbacks.history.History at 0x11a1b2dd460>

In [28]:
#Step 4: Evaluate the Model with F1-Score
# Predict on test data
y_pred = model.predict(x_test)

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step


In [29]:
y_predtr = le.inverse_transform(y_pred)
print(y_predtr.shape)

(7600,)


In [30]:
# Calculate F1 Score
f1 = f1_score(y_predtr, y_test, average='macro')
print("F1-score:", f1)

F1-score: 0.9061861295149627


In [31]:
[901, 905, 907, 908, 912] # 912 после перемешивания 

[901, 905, 907, 908, 912]