### Подключение библиотек

In [1]:
import pandas as pd
import jsonlines
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import FastText
import fasttext
import fasttext.util
import fasttext.util
import torch
import torch.nn as nn
import torch.optim as optim

### Загрузка и предварительный анализ данных

In [2]:
data = []

with jsonlines.open('C:/data.jsonl', 'r') as reader:
    for line in reader:
        data.append(line)
        
# Создание DataFrame
df = pd.DataFrame(data)

print(df.head())

                                                text  label
0  i feel awful about it too because it s my job ...      0
1                              im alone i feel awful      0
2  ive probably mentioned this before but i reall...      1
3           i was feeling a little low few days back      0
4  i beleive that i am much more sensitive to oth...      2


In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Предобработка текстовых данных в DataFrame

In [4]:
df['text'] = df['text'].str.replace(r'[^a-zA-Z]', ' ')
df['text'] = df['text'].str.lower()
df['text'] = df['text'].apply(word_tokenize)
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join(word for word in text if word not in stop_words)

df['text'] = df['text'].apply(remove_stopwords)

  df['text'] = df['text'].str.replace(r'[^a-zA-Z]', ' ')


### Векторизация текста с использованием TF-IDF

In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['text'])
y = df['label']

### Разделение на обучающую и тестовую выборки

In [6]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

### Оценка модели и предоставление метрик

In [7]:
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred)

print(f'F1 Score: {f1}')
print(report)

F1 Score: 0.8988322173496703
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     24504
           1       0.92      0.93      0.93     28247
           2       0.80      0.77      0.79      6853
           3       0.89      0.90      0.90     11339
           4       0.85      0.85      0.85      9376
           5       0.79      0.71      0.75      3043

    accuracy                           0.90     83362
   macro avg       0.86      0.85      0.86     83362
weighted avg       0.90      0.90      0.90     83362



In [8]:
fasttext.util.download_model('en', if_exists='ignore')  # English
fasttext_model = fasttext.load_model('cc.en.300.bin')



In [11]:
data = []

with jsonlines.open('C:/data.jsonl', 'r') as reader:
    for line in reader:
        data.append(line)
        
df = pd.DataFrame(data)

print(df.head())

                                                text  label
0  i feel awful about it too because it s my job ...      0
1                              im alone i feel awful      0
2  ive probably mentioned this before but i reall...      1
3           i was feeling a little low few days back      0
4  i beleive that i am much more sensitive to oth...      2


### Создание эмбеддингов для текстовых данных с использованием модели FastText

In [12]:
def text_to_vector(text):
    words = text.split()
    vectors = [fasttext_model[word] for word in words if word in fasttext_model]
    if not vectors:
        return [0] * fasttext_model.vector_size
    return sum(vectors) / len(vectors)

# Преобразование текстов в эмбеддинги
df['text_vector'] = df['text'].apply(text_to_vector)

### Нейронная сеть и LSTM слой

In [13]:
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=1, batch_first=True)
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x, _ = self.lstm(x)
        x = self.fc2(x)
        return x

X = torch.Tensor(df['text_vector'].tolist())
y = torch.LongTensor(df['label'].values)

# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)

  X = torch.Tensor(df['text_vector'].tolist())


### Cоздание, обучение и оценка нейронной сети. Определение архитектуры модели, обучение модели на обучающих данных с использованием DataLoader, оценка производительности модели на тестовых данных и формирование отчета о классификации.

In [14]:
input_size = X_train.shape[1]
hidden_size = 128
output_size = len(df['label'].unique())

model = SimpleNN(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    _, predicted = torch.max(test_outputs, 1)
    
y_test_numpy = y_test.numpy()
predicted_numpy = predicted.numpy()

# Вычисление метрик и создание отчета о классификации
f1 = f1_score(y_test_numpy, predicted_numpy, average='weighted')
report = classification_report(y_test_numpy, predicted_numpy)

print(f'F1 Score: {f1}')
print(report)

F1 Score: 0.7047659248975529
              precision    recall  f1-score   support

           0       0.73      0.77      0.75     24504
           1       0.72      0.82      0.77     28247
           2       0.64      0.49      0.55      6853
           3       0.73      0.62      0.67     11339
           4       0.65      0.58      0.61      9376
           5       0.59      0.42      0.49      3043

    accuracy                           0.71     83362
   macro avg       0.68      0.62      0.64     83362
weighted avg       0.71      0.71      0.70     83362



## Итоги:
Первая модель продемонстрировала наилучшие результаты по сравнению со второй.
Плохие результаты второй модели могут быть обусловлены различиями в архитектуре моделей и отсутствием предобработки данных во второй модели. Для улучшения её результатов рекомендуется внедрить предобработку данных и произвести тщательный подбор гиперпараметров.