# Классификация текстов с применением RNN

In [2]:
import pandas as pd

# NOTE(review): absolute, machine-specific path — point DATA_PATH at your local
# copy of the dataset before running the notebook on another machine.
DATA_PATH = 'D:/PythonWork/OldZmiy/NLPWork/data/Petitions.csv'
N_ROWS = 10_000  # number of petitions (first rows of the file) used below

# `nrows` stops parsing after the first N_ROWS records instead of loading the
# whole file into memory and slicing it afterwards.
data = pd.read_csv(DATA_PATH, nrows=N_ROWS)
data.head()

Unnamed: 0,id,public_petition_text,reason_category
0,3168490,снег на дороге,Благоустройство
1,3219678,очистить кабельный киоск от рекламы,Благоустройство
2,2963920,"Просим убрать все деревья и кустарники, которы...",Благоустройство
3,3374910,Неудовлетворительное состояние парадной - надп...,Содержание МКД
4,3336285,Граффити,Благоустройство


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Make sure the NLTK resources used in this notebook are available locally.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Russian stop words as a set, so membership checks during filtering are cheap.
stop_words = {*stopwords.words('russian')}

def preprocess(text):
    text = text.lower()
    tokens = word_tokenize(text)
    filtered_tokens = [w for w in tokens if w.isalpha() and w not in stop_words]
    stemmer = SnowballStemmer("russian")
    stems = [stemmer.stem(w) for w in filtered_tokens]
    return ' '.join(stems)

# Run the normalisation over every petition and attach the result as a new column.
stemmed_texts = data['public_petition_text'].apply(preprocess)
data = data.assign(preprocessed_text=stemmed_texts)

# Save the enriched table (without the index column) and preview it.
data.to_csv('preprocessed_dataset.csv', index=False)
data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sheld\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sheld\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sheld\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,public_petition_text,reason_category,preprocessed_text
0,3168490,снег на дороге,Благоустройство,снег дорог
1,3219678,очистить кабельный киоск от рекламы,Благоустройство,очист кабельн киоск реклам
2,2963920,"Просим убрать все деревья и кустарники, которы...",Благоустройство,прос убра дерев кустарник котор вышл предел га...
3,3374910,Неудовлетворительное состояние парадной - надп...,Содержание МКД,неудовлетворительн состоян парадн надпис двер ...
4,3336285,Граффити,Благоустройство,граффит


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# NOTE(review): the vectorizer is fitted on the whole corpus, and the train/test
# split happens afterwards on the resulting matrix, so the vocabulary and IDF
# weights also see the test documents. Fit on the training part only if a
# leakage-free evaluation is needed.
tfidf_matrix = vectorizer.fit_transform(data['preprocessed_text'])

feature_names = vectorizer.get_feature_names_out()

# Only a small slice is densified for display; converting the entire matrix
# with `.toarray()` just to print it would allocate documents x vocabulary floats.
print("Размер TF-IDF матрицы:", tfidf_matrix.shape)
print(tfidf_matrix[:5].toarray())
print("Имена признаков:", feature_names)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Имена признаков: ['covid' 'dns' 'err' ... 'ясельн' 'яхтен' 'ящик']


In [5]:
# Give every distinct category a 1-based integer code (codes follow the order
# of the list returned by `unique()`), then replace the text labels by codes.
unique_values = data['reason_category'].unique().tolist()
mapping_dict = {label: code for code, label in enumerate(unique_values, start=1)}
categories_mapping = {'reason_category': mapping_dict}

data['reason_category'] = data['reason_category'].map(mapping_dict)

data, mapping_dict

(           id                               public_petition_text  \
 0     3168490                                     снег на дороге   
 1     3219678                очистить кабельный киоск от рекламы   
 2     2963920  Просим убрать все деревья и кустарники, которы...   
 3     3374910  Неудовлетворительное состояние парадной - надп...   
 4     3336285                                           Граффити   
 ...       ...                                                ...   
 9995  3236509                                      очистите о  о   
 9996  3213091  На фасаде дома, незаконно и без разрешающих до...   
 9997  3242261  Товарный переулок. Мусор. В администрацию Цент...   
 9998  3311922  В проезжей части просел канализационный люк.\n...   
 9999  3049014                                  Надписи на будке.   
 
       reason_category                                  preprocessed_text  
 0                   1                                         снег дорог  
 1                

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    data['reason_category'],
    test_size=0.2,
    random_state=42,
)
y_test

6252     9
4684     2
1731     5
4742     2
4521     1
        ..
6412     1
8285     1
7853    14
1095     1
6929     2
Name: reason_category, Length: 2000, dtype: int64

In [7]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# Cast to float32 while the matrices are still sparse and densify once:
# this avoids building an intermediate float64 dense copy, and `.toarray()`
# yields a plain ndarray rather than the `np.matrix` returned by `.todense()`.
X_train_tensor = torch.from_numpy(X_train.astype(np.float32).toarray())
# Class ids are stored as int64 ("long"), the dtype CrossEntropyLoss expects for targets.
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

X_test_tensor = torch.from_numpy(X_test.astype(np.float32).toarray())
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [8]:
# Pair each feature tensor with its label tensor so they are batched together.
train_dataset, test_dataset = (
    TensorDataset(features, labels)
    for features, labels in (
        (X_train_tensor, y_train_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

In [9]:
batch_size = 128

# Training batches are reshuffled on every pass; the test loader keeps the
# dataset order so predictions stay aligned with `y_test_tensor`.
loader_kwargs = {'batch_size': batch_size}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest-neighbours with default settings, trained directly on
# the sparse TF-IDF features.
classifier = KNeighborsClassifier()
classifier.fit(X=X_train, y=y_train)

# `score` reports mean accuracy on the held-out split.
accuracy = classifier.score(X=X_test, y=y_test)
print(f"Точность модели: {accuracy:.2f}")

Точность модели: 0.74


## RNN

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

In [12]:
class RNN(nn.Module):
    """Classifier made of one ``nn.RNN`` layer followed by a linear head."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the output projection.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the recurrent hidden state.
            output_size: Number of logits produced per sample.
        """
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return logits computed from the recurrent output at the last time step.

        ``x`` is laid out as (batch, sequence, features) because the layer is
        built with ``batch_first=True``.
        """
        per_step_output, _ = self.rnn(x)
        final_step = per_step_output[:, -1, :]
        logits = self.fc(final_step)
        return logits

In [13]:
# Hyper-parameters. The notebook-level names are kept unchanged.
vocab_size = len(vectorizer.get_feature_names_out())
# Each sample is one TF-IDF vector, so the recurrent layer's input width must
# equal the vocabulary size; derive it instead of hard-coding the number.
embed_dim = vocab_size
hidden_size = 128
# Labels are 1-based codes (1..len(mapping_dict)), so one extra logit is
# needed for the unused index 0.
output_size = len(mapping_dict) + 1
learning_rate = 0.001
num_epochs = 15

model = RNN(embed_dim, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Explicitly select training mode in case the model was switched to eval mode.
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()

        # Every TF-IDF vector is fed as a sequence of length 1: (batch, 1, features).
        outputs = model(inputs.unsqueeze(1))
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    average_loss = total_loss / len(train_dataloader)
    print(f'Эпоха {epoch+1}: Средняя потеря - {average_loss:.4f}')

Эпоха 1: Средняя потеря - 1.9613
Эпоха 2: Средняя потеря - 1.0111
Эпоха 3: Средняя потеря - 0.7133
Эпоха 4: Средняя потеря - 0.5379
Эпоха 5: Средняя потеря - 0.4204
Эпоха 6: Средняя потеря - 0.3351
Эпоха 7: Средняя потеря - 0.2757
Эпоха 8: Средняя потеря - 0.2292
Эпоха 9: Средняя потеря - 0.1958
Эпоха 10: Средняя потеря - 0.1704
Эпоха 11: Средняя потеря - 0.1502
Эпоха 12: Средняя потеря - 0.1334
Эпоха 13: Средняя потеря - 0.1204
Эпоха 14: Средняя потеря - 0.1099
Эпоха 15: Средняя потеря - 0.1022


In [14]:
from sklearn.metrics import classification_report

model.eval()

with torch.no_grad():
    # Same length-1 sequence layout as during training.
    outputs = model(X_test_tensor.unsqueeze(1))
    # Softmax is monotonic, so the arg-max of the logits is already the
    # predicted class; no need to compute probabilities first.
    predictions = outputs.argmax(dim=-1)

y_true = y_test_tensor.cpu().numpy()
y_pred = predictions.cpu().numpy()

# zero_division=0 reports 0.0 for classes that were never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           1       0.93      0.95      0.94      1221
           2       0.82      0.90      0.86       453
           3       0.96      0.91      0.93        55
           4       0.76      0.53      0.63        49
           5       0.80      0.89      0.84        18
           6       0.98      0.86      0.92        65
           7       0.92      0.90      0.91        39
           8       0.96      0.77      0.85        30
           9       0.75      0.26      0.39        23
          10       0.00      0.00      0.00         5
          11       1.00      0.60      0.75         5
          12       0.88      0.64      0.74        11
          13       0.88      0.47      0.61        15
          14       1.00      0.50      0.67         8
          15       1.00      0.67      0.80         3

    accuracy                           0.90      2000
   macro avg       0.84      0.66      0.72      2000
weighted avg       0.90   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
class MyLSTM(nn.Module):
    """Classifier made of one ``nn.LSTM`` layer followed by a linear head."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the LSTM layer and the output projection.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the LSTM hidden state.
            output_size: Number of logits produced per sample.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return logits computed from the LSTM output at the last time step.

        ``x`` is laid out as (batch, sequence, features) because the layer is
        built with ``batch_first=True``.
        """
        # The second return value (final hidden/cell states) is not needed here.
        step_outputs = self.lstm(x)[0]
        return self.fc(step_outputs[:, -1, :])

In [16]:
# Hyper-parameters. The notebook-level names are kept unchanged.
vocab_size = len(vectorizer.get_feature_names_out())
# The LSTM consumes whole TF-IDF vectors, so its input width is the vocabulary
# size; derive it from the fitted vectorizer instead of hard-coding it.
embed_dim = vocab_size
hidden_size = 128
# Category codes start at 1, hence one extra (unused) logit for index 0.
output_size = len(mapping_dict) + 1
learning_rate = 0.001
num_epochs = 20

model2 = MyLSTM(embed_dim, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model2.parameters(), lr=learning_rate)

# Explicitly select training mode in case the model was switched to eval mode.
model2.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()

        # Add a sequence axis of length 1: (batch, features) -> (batch, 1, features).
        outputs = model2(inputs.unsqueeze(1))
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    average_loss = total_loss / len(train_dataloader)
    print(f'Эпоха {epoch+1}: Средняя потеря - {average_loss:.4f}')

Эпоха 1: Средняя потеря - 2.3944
Эпоха 2: Средняя потеря - 1.3540
Эпоха 3: Средняя потеря - 0.9898
Эпоха 4: Средняя потеря - 0.7488
Эпоха 5: Средняя потеря - 0.5894
Эпоха 6: Средняя потеря - 0.4812
Эпоха 7: Средняя потеря - 0.3939
Эпоха 8: Средняя потеря - 0.3283
Эпоха 9: Средняя потеря - 0.2783
Эпоха 10: Средняя потеря - 0.2390
Эпоха 11: Средняя потеря - 0.2059
Эпоха 12: Средняя потеря - 0.1803
Эпоха 13: Средняя потеря - 0.1595
Эпоха 14: Средняя потеря - 0.1425
Эпоха 15: Средняя потеря - 0.1285
Эпоха 16: Средняя потеря - 0.1168
Эпоха 17: Средняя потеря - 0.1074
Эпоха 18: Средняя потеря - 0.1001
Эпоха 19: Средняя потеря - 0.0941
Эпоха 20: Средняя потеря - 0.0884


In [17]:
model2.eval()

with torch.no_grad():
    # Same length-1 sequence layout as during training.
    outputs = model2(X_test_tensor.unsqueeze(1))
    # The arg-max of the logits equals the arg-max of their softmax, so the
    # probabilities do not have to be computed.
    predictions = outputs.argmax(dim=-1)

y_true = y_test_tensor.cpu().numpy()
y_pred = predictions.cpu().numpy()

# zero_division=0 keeps the report free of undefined-metric warnings when
# some class receives no predictions.
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           1       0.94      0.95      0.94      1221
           2       0.82      0.91      0.86       453
           3       0.96      0.91      0.93        55
           4       0.72      0.53      0.61        49
           5       0.80      0.89      0.84        18
           6       1.00      0.88      0.93        65
           7       0.92      0.92      0.92        39
           8       0.88      0.70      0.78        30
           9       0.78      0.30      0.44        23
          10       0.50      0.40      0.44         5
          11       1.00      0.60      0.75         5
          12       0.78      0.64      0.70        11
          13       0.86      0.40      0.55        15
          14       1.00      0.50      0.67         8
          15       1.00      0.67      0.80         3

    accuracy                           0.90      2000
   macro avg       0.86      0.68      0.74      2000
weighted avg       0.90   

In [None]:
class MyGRU(nn.Module):
    """Classifier made of one ``nn.GRU`` layer followed by a linear head."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the GRU layer and the output projection.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the GRU hidden state.
            output_size: Number of logits produced per sample.
        """
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return logits computed from the GRU output at the last time step.

        ``x`` is laid out as (batch, sequence, features) because the layer is
        built with ``batch_first=True``.
        """
        all_steps, _unused_state = self.gru(x)
        last_step = all_steps[:, -1, :]
        return self.fc(last_step)

In [19]:
# Hyper-parameters. The notebook-level names are kept unchanged.
vocab_size = len(vectorizer.get_feature_names_out())
# The GRU takes whole TF-IDF vectors as input, so the input width is the
# vocabulary size; derive it rather than repeating a hard-coded number.
embed_dim = vocab_size
hidden_size = 1024
# Category codes run from 1 to len(mapping_dict); index 0 stays unused.
output_size = len(mapping_dict) + 1
learning_rate = 0.001
num_epochs = 5

model3 = MyGRU(embed_dim, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model3.parameters(), lr=learning_rate)

# Explicitly select training mode in case the model was switched to eval mode.
model3.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()

        # Add a sequence axis of length 1: (batch, features) -> (batch, 1, features).
        outputs = model3(inputs.unsqueeze(1))
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    average_loss = total_loss / len(train_dataloader)
    print(f'Эпоха {epoch+1}: Средняя потеря - {average_loss:.4f}')

Эпоха 1: Средняя потеря - 1.6181
Эпоха 2: Средняя потеря - 0.6433
Эпоха 3: Средняя потеря - 0.3777
Эпоха 4: Средняя потеря - 0.2473
Эпоха 5: Средняя потеря - 0.1777


In [20]:
model3.eval()

with torch.no_grad():
    # Same length-1 sequence layout as during training.
    outputs = model3(X_test_tensor.unsqueeze(1))
    # Softmax does not change which logit is largest, so take the arg-max directly.
    predictions = outputs.argmax(dim=-1)

y_true = y_test_tensor.cpu().numpy()
y_pred = predictions.cpu().numpy()

# zero_division=0 reports 0.0 for classes that were never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           1       0.93      0.95      0.94      1221
           2       0.83      0.89      0.86       453
           3       0.93      0.91      0.92        55
           4       0.77      0.55      0.64        49
           5       0.80      0.89      0.84        18
           6       0.98      0.86      0.92        65
           7       0.92      0.90      0.91        39
           8       0.88      0.77      0.82        30
           9       0.75      0.26      0.39        23
          10       0.00      0.00      0.00         5
          11       1.00      0.20      0.33         5
          12       1.00      0.64      0.78        11
          13       0.83      0.33      0.48        15
          14       1.00      0.38      0.55         8
          15       1.00      0.67      0.80         3

    accuracy                           0.90      2000
   macro avg       0.84      0.61      0.68      2000
weighted avg       0.90   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


По accuracy все три модели показали одинаковый результат (0.90); по macro avg F1 LSTM (0.74) и RNN (0.72) отработали лучше, чем GRU (0.68).

### Реализация GRU

In [21]:
# z - update gate
# r - reset gate
# h with a tilde - candidate hidden state
# h - final hidden state

class MyGRUBlock(nn.Module):
    """A single hand-written GRU step built from six linear layers.

    For each of the update gate, the reset gate and the candidate state there
    is one layer applied to the input (``W*``) and one applied to the hidden
    state (``U*``).
    """

    def __init__(self, input_size, hidden_size):
        """Create the input and hidden projections for both gates and the candidate.

        Args:
            input_size: Width of the input vector at one time step.
            hidden_size: Width of the hidden state.
        """
        super().__init__()

        # Update gate projections.
        self.Wz = nn.Linear(input_size, hidden_size)
        self.Uz = nn.Linear(hidden_size, hidden_size)

        # Reset gate projections.
        self.Wr = nn.Linear(input_size, hidden_size)
        self.Ur = nn.Linear(hidden_size, hidden_size)

        # Candidate state projections.
        self.Wh = nn.Linear(input_size, hidden_size)
        self.Uh = nn.Linear(hidden_size, hidden_size)

    def forward(self, xt, ht_prev):
        """Compute the next hidden state from the input ``xt`` and ``ht_prev``."""
        update_gate = torch.sigmoid(self.Wz(xt) + self.Uz(ht_prev))
        reset_gate = torch.sigmoid(self.Wr(xt) + self.Ur(ht_prev))

        # The reset gate scales the previous state before it enters the candidate.
        candidate = torch.tanh(self.Wh(xt) + self.Uh(reset_gate * ht_prev))

        # Blend: the update gate keeps the old state, its complement takes the candidate.
        return (1 - update_gate) * candidate + update_gate * ht_prev

In [None]:
class MyGRUModel(nn.Module):
    """Embedding -> hand-written GRU cell unrolled over time -> linear head."""

    def __init__(self, embed_dim, hidden_size, output_size, num_embeddings=None):
        """Build the embedding table, the GRU cell and the output projection.

        Args:
            embed_dim: Width of each embedding vector (input width of the cell).
            hidden_size: Width of the recurrent hidden state.
            output_size: Number of logits produced per sample.
            num_embeddings: Number of rows in the embedding table. When omitted,
                the notebook-level ``vocab_size`` is used, which keeps existing
                three-argument calls working.
        """
        super().__init__()
        if num_embeddings is None:
            # Backward-compatible default: fall back to the notebook-level name.
            num_embeddings = vocab_size
        # Remember the size here so `forward` does not depend on a global
        # `hidden_size` that other cells may overwrite.
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings, embed_dim)
        self.gru_block = MyGRUBlock(embed_dim, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, inputs):
        """Return logits for a batch of index sequences.

        Args:
            inputs: Integer tensor indexed as (batch, step); it is passed to
                ``nn.Embedding``, so it must hold valid row indices.

        Returns:
            Tensor with one row of ``output_size`` logits per batch element.
        """
        embedded = self.embedding(inputs)
        # Size the initial state from the actual batch (the last batch of a
        # loader is usually smaller) and create it on the same device and
        # dtype as the embeddings.
        ht_prev = embedded.new_zeros(inputs.size(0), self.hidden_size)
        for step in range(inputs.size(1)):
            ht_prev = self.gru_block(embedded[:, step, :], ht_prev)
        return self.fc(ht_prev)

: 

In [None]:
# Hyper-parameters for the hand-written GRU model.
vocab_size = len(vectorizer.get_feature_names_out())
embed_dim = 512
hidden_size = 1024
output_size = 16
learning_rate = 0.001
num_epochs = 5

my_model = MyGRUModel(embed_dim, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(my_model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    batch_losses = []
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()

        # NOTE(review): the loader yields float TF-IDF rows; `.long()` truncates
        # them to integers before they are used as embedding indices, so they
        # are not token ids — confirm this is the intended input for the model.
        inputs = inputs.long().to('cpu')
        targets = targets.to('cpu')

        outputs = my_model(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses over the epoch.
    total_loss = sum(batch_losses)
    average_loss = total_loss / len(train_dataloader)
    print(f'Эпоха {epoch+1}: Средняя потеря - {average_loss:.4f}')

In [None]:
my_model.eval()

batch_predictions = []
with torch.no_grad():
    # Evaluate batch by batch with the same input conversion as the training
    # loop: the embedding layer needs integer indices, so the float tensor
    # with an extra axis cannot be passed to it, and batching keeps the
    # per-step embedding tensor small. The test loader does not shuffle, so
    # the predictions stay aligned with `y_test_tensor`.
    # NOTE(review): as in training, `.long()` truncates TF-IDF values rather
    # than producing token ids — confirm the intended input representation.
    for inputs, _ in test_dataloader:
        outputs = my_model(inputs.long())
        # The arg-max of the logits is the predicted class; softmax is not needed.
        batch_predictions.append(outputs.argmax(dim=-1))

predictions = torch.cat(batch_predictions)

y_true = y_test_tensor.cpu().numpy()
y_pred = predictions.cpu().numpy()

# zero_division=0 reports 0.0 for classes that were never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_true, y_pred, zero_division=0))