In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.src.layers import Softmax
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import F1Score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelBinarizer

import nltk
import string
from datasets import load_dataset
import re
from nltk.corpus import stopwords
from nltk import pos_tag
import mlflow
from sklearnex import patch_sklearn
from warnings import filterwarnings
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
filterwarnings("ignore")

In [3]:
dataset = "imdb"
data = load_dataset(dataset)
stop_words = set(stopwords.words("english"))

In [16]:
def base_preprocess(text):
    tokens = text.lower()

    # Удаление спец слов
    if dataset == 'ag_news':
        special_words = ['reuters', 'afp', 'ap', 'usatoday.com', 'forbes.com', 'target=/stocks/quickinfo/fullquote"' ]
        for word in special_words:
            tokens = tokens.replace(word, '')
        
        pattern = r'[&lt][^<>]*&gt'
        tokens = re.sub(pattern, '', tokens)
    elif dataset == 'imdb':
        special_words = ['<br /><br />'] 
        for word in special_words:
            tokens = tokens.replace(word, '')
    
    # Удаление пунктуации
    tokens = ''.join(i if i not in set(string.punctuation) | set('0123456789')  else ' ' for i in tokens)
    
    return tokens

In [17]:
txt1 = base_preprocess(data['train']['text'][0])
print(txt1)

i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in       i also heard that at first it was seized by u s  customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  i really had to see this for myself the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life  in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states  in between asking politicians and ordinary denizens of stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men what kills me about i am curious yellow is that    years ago  this was considered pornographic  really  the sex and nudity scenes are few and far between  even then it s not shot lik

In [18]:
def different_preprocess(tokens, preprocess_type, words_class):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(tokens)
    
    # Обработка стоп-слов
    #tokens = [token for token in tokens if token not in stop_words]
    
    # Обработка частей речи
    if words_class != 'ALL':
        tokens = pos_tag(tokens)
        if words_class == 'N':
            tokens = [word for word, tag in tokens if tag.startswith('N')]
        elif words_class == 'NJ':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J')]
        elif words_class == 'NJV':
            tokens = [word for word, tag in tokens if tag.startswith('N') or tag.startswith('J') or tag.startswith('V')]
    
    # Обработка слов
    if preprocess_type == 'лемматизация':
        lemmatizer = nltk.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif preprocess_type == 'стемминг':
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)  

In [19]:
txt2 = different_preprocess(txt1,'лемматизация', 'ALL')
print(txt2)

i rented i am curious yellow from my video store because of all the controversy that surrounded it when it wa first released in i also heard that at first it wa seized by u s custom if it ever tried to enter this country therefore being a fan of film considered controversial i really had to see this for myself the plot is centered around a young swedish drama student named lena who want to learn everything she can about life in particular she want to focus her attention to making some sort of documentary on what the average swede thought about certain political issue such a the vietnam war and race issue in the united state in between asking politician and ordinary denizen of stockholm about their opinion on politics she ha sex with her drama teacher classmate and married men what kill me about i am curious yellow is that year ago this wa considered pornographic really the sex and nudity scene are few and far between even then it s not shot like some cheaply made porno while my country

In [20]:
dataset1 = data 
preprocess_type = 'лемматизация'
words_class = 'ALL'

# Подготовка данных
shuffled_train = data["train"].shuffle(seed=42)
x_train = shuffled_train['text']
y_train = shuffled_train['label']
    
x_test = dataset1['test']['text']
y_test = dataset1['test']['label']
    
# Базовая обработка
xtr = [base_preprocess(text) for text in x_train]
xte = [base_preprocess(text) for text in x_test]
    
# Части речи + приведение
xtr = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xtr]
xte = [different_preprocess(tokens, preprocess_type, words_class) for tokens in xte]

In [21]:
xtr = np.array(xtr)
xte = np.array(xte)

In [22]:
print(xtr[0])

there is no relation at all between fortier and profiler but the fact that both are police series about violent crime profiler look crispy fortier look classic profiler plot are quite simple fortier s plot are far more complicated fortier look more like prime suspect if we have to spot similarity the main character is weak and weirdo but have clairvoyance people like to compare to judge to evaluate how about just enjoying funny thing too people writing fortier look american but on the other hand arguing they prefer american series maybe it s the language or the spirit but i think this series is more english than american by the way the actor are really good and funny the acting is not superficial at all


In [23]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(xtr)
X = tokenizer.texts_to_sequences(xtr)
X = pad_sequences(X, maxlen=300, padding='post')

print(X.shape)

(25000, 300)


In [24]:
XT = tokenizer.texts_to_sequences(xte)
XT = pad_sequences(XT, maxlen=300, padding='post')

In [25]:
print(X[1])

[   10    13     6     2    85     1   110     6    55   289     5     1
   236    64     6     2   327   405    32   785 13733     1    13   237
     4    17     2    74   122  2056  3001     2   344    17     2   740
     4   229   452    53    20 25686   132  5521    21     1  1726     7
  1753    71     4  2050    12   344   315   377     7     6   254     3
  5655     1   228     6    85   477     3    61   463   344     6  4871
    32     1   598  2056  7585  5720     3  2561 21678 41276   451     2
    85   203    13    40    59     2    85  1122    13    10     6     2
    13    20    48   105   123     3   123   179     1  2378   266    32
 11001  8416     6  1467     9   111    10    13    45    20   432  1592
  9728     8     1  2326 11622    96    20    80   418    35    10    13
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [26]:
y_train2 = np.array(y_train).reshape(-1, 1)

In [27]:
y_test2 = np.array(y_test).reshape(-1, 1)

In [28]:
print(y_train2[0])

[1]


In [29]:
print(y_train2[:5])

[[1]
 [1]
 [0]
 [1]
 [0]]


In [47]:
# Build the LSTM model
model = Sequential([
    Embedding(len(tokenizer.word_index) + 1, output_dim = 100, input_length=300, mask_zero = True),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01),
metrics=[F1Score])
model.summary()

In [48]:
# Train the model
model.fit(X, y_train2, epochs=2, batch_size=32, validation_split=0.1)

Epoch 1/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 241ms/step - f1_score: 0.6641 - loss: 0.5584 - val_f1_score: 0.6818 - val_loss: 0.3022
Epoch 2/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 206ms/step - f1_score: 0.6679 - loss: 0.2068 - val_f1_score: 0.6818 - val_loss: 0.3382


<keras.src.callbacks.history.History at 0x2d33f184650>

In [49]:
#Step 4: Evaluate the Model with F1-Score
# Predict on test data
y_pred = model.predict(XT)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 45ms/step


In [50]:
y_pred[:5]

array([[0.02559253],
       [0.0107685 ],
       [0.02478401],
       [0.04488847],
       [0.85800225]], dtype=float32)

In [51]:
y_pred_label = y_pred = (y_pred > 0.5).astype(int).reshape(-1)

In [52]:
print(y_pred_label)

[0 0 0 ... 0 0 1]


In [53]:
print(y_test2.reshape(-1))

[0 0 0 ... 1 1 1]


In [54]:
# Calculate F1 Score
f1 = f1_score(y_pred_label, y_test2.reshape(-1))
print("F1-score:", f1)

F1-score: 0.8567674113009198


In [None]:
# [849, 838, 870] 1 - эпоха, 3 - эпохи, 1 - эпоха, BS = 64