In [None]:
# Импорт необходимых библиотек
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/numerical warnings coming from
# torch, pandas and sklearn — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Make the project's `src` directory importable. The path is built from the
# current working directory, so the notebook has to be started from the
# directory that contains `src/`.
import sys
import os
sys.path.append(os.path.join(os.getcwd(), 'src'))

# Project-local modules are imported right after this point; they are expected
# to be found through the `src` path appended above.
# (Earlier alternative, kept for reference: sys.path.append('./src'))

from data_preprocessing import download_and_load_data, preprocess_data, create_sequence_samples
from model import EnhancedLSTMModel
from training import train_with_gradient_clipping, custom_collate_fn
from evaluation import evaluate_model_on_test, test_model_predictions, save_test_results

# Load the raw tweets and preprocess them.
# The dataset is addressed by URL and handed to the project helper
# download_and_load_data; the result is indexed by a 'text' column below.
print("Загрузка данных...")
url = "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip"
tweets_df = download_and_load_data(url)

# Bare row count, printed without a label (the next message repeats it).
print(len(tweets_df['text']))

# NOTE(review): the message says the tweets were saved to raw_dataset.csv, but
# no save happens in this cell — presumably download_and_load_data writes the
# file; confirm.
print(f"Сохранено {len(tweets_df['text'])} твитов в raw_dataset.csv")

# preprocess_data returns the cleaned frame together with the tokenizer;
# save_path='./data/' is forwarded to it (presumably where it stores its
# outputs — confirm against the helper).
print("Предобработка данных...")
clean_df, tokenizer = preprocess_data(tweets_df, save_path='./data/')

print(f"После очистки: {len(clean_df)} твитов")


# Build the training sequences
print("Создание обучающих последовательностей...")
def convert_to_list(input_str):
    """Parse a string-serialized list of integers, e.g. ``"[1, 2, 3]"``.

    Args:
        input_str: Either a string holding comma-separated integers that may be
            wrapped in square brackets (and padded with whitespace), or any
            other object.

    Returns:
        A list of ints when ``input_str`` is a string (an empty list for
        ``""`` or ``"[]"``); otherwise ``input_str`` itself, unchanged.

    Raises:
        ValueError: If a non-empty item of the string is not a valid integer.
    """
    if not isinstance(input_str, str):
        # Already deserialized (or not a string at all): pass it through.
        return input_str
    # Trim whitespace first so the brackets really sit at both ends; otherwise
    # a value like " [1, 2]\n" keeps its brackets and int() fails on " [1".
    inner = input_str.strip().strip('[]')
    # Skip empty fragments so "" and "[]" yield an empty list.
    return [int(item) for item in inner.split(',') if item.strip()]

# Turn string-serialized token-id lists back into real lists of ints
# (values that are not strings are left as they are by convert_to_list).
parsed_input_ids = clean_df['input_ids'].apply(convert_to_list)
clean_df['input_ids'] = parsed_input_ids

# Sequence length handed to create_sequence_samples.
# NOTE(review): whether 23 counts the target token is defined by that helper — confirm.
sequence_length = 23
token_id_lists = clean_df['input_ids'].tolist()
X_data, y_data = create_sequence_samples(token_id_lists, sequence_length)

# Quick sanity report: sample count, array shapes and the first example.
print(f"Создано {len(X_data)} примеров")
print(f"Форма X: {X_data.shape}, Форма y: {y_data.shape}")
print(f"Пример X: {X_data[0][:5]}... → y: {y_data[0]}")

# Split the samples into train / validation / test (80% / 10% / 10%).
# Both splits use the same seed and shuffling options.
split_options = {'random_state': 42, 'shuffle': True}

# Step 1: hold out 20% of all samples.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_data, y_data, test_size=0.2, **split_options
)

# Step 2: cut the held-out part in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, **split_options
)

# Report the size of every split.
print(f"Размеры выборок:")
print(f"Train: {len(X_train)} примеров")
print(f"Validation: {len(X_val)} примеров")
print(f"Test: {len(X_test)} примеров")

# Persist the three splits as CSV files under ./data/.
# The directory is created up front because DataFrame.to_csv does not create
# missing parent directories and would raise OSError instead.
os.makedirs('./data', exist_ok=True)


def _split_to_frame(features, targets):
    """Pack one split into a DataFrame with columns 'X' and 'y'."""
    return pd.DataFrame({'X': features.tolist(), 'y': targets.tolist()})


train_df = _split_to_frame(X_train, y_train)
train_df.to_csv('./data/train_dataset.csv', index=False)

val_df = _split_to_frame(X_val, y_val)
val_df.to_csv('./data/validation_dataset.csv', index=False)

test_df = _split_to_frame(X_test, y_test)
test_df.to_csv('./data/test_dataset.csv', index=False)

# Wrap the three splits in tensors, datasets and batch loaders.
batch_size = 256
pad_token_id = 0


def _to_long_tensor(values):
    """Convert one split array into a torch.long tensor."""
    return torch.tensor(values, dtype=torch.long)


def _collate(batch):
    """Collate a batch through custom_collate_fn with the current pad_token_id."""
    return custom_collate_fn(batch, pad_token_id)


def _build_loader(dataset, shuffle):
    """Create a DataLoader that uses the shared batch size and collate function."""
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=_collate,
    )


# Tensors for inputs (X) and targets (y) of every split.
X_train_tensor, y_train_tensor = _to_long_tensor(X_train), _to_long_tensor(y_train)
X_val_tensor, y_val_tensor = _to_long_tensor(X_val), _to_long_tensor(y_val)
X_test_tensor, y_test_tensor = _to_long_tensor(X_test), _to_long_tensor(y_test)

# Paired (input, target) datasets.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = _build_loader(train_dataset, shuffle=True)
val_loader = _build_loader(val_dataset, shuffle=False)
test_loader = _build_loader(test_dataset, shuffle=False)



In [None]:
# Build the model from an explicit hyperparameter dictionary.
print("Инициализация модели...")
model_config = {
    # NOTE(review): presumably the vocabulary size of the tokenizer returned by
    # preprocess_data — confirm instead of relying on the literal.
    'vocab_size': 30522,
    'embedding_dim': 128,
    'hidden_dim': 128,
    'num_layers': 2,
    'pad_token_id': 0,
    'dropout': 0.5,
    'use_layer_norm': True,
    'use_mean_pooling': False,
}
model = EnhancedLSTMModel(**model_config)


In [None]:

# Train the model
print("Начало обучения...")

# Create the output directory BEFORE the training run: torch.save does not
# create missing parent directories, and failing on a missing './models'
# only after training would throw the whole run away.
os.makedirs('./models', exist_ok=True)

# The helper returns the model plus three histories that are plotted afterwards
# (training loss, validation loss, learning rate).
model, train_losses, val_losses, learning_rates = train_with_gradient_clipping(
    model, train_loader, val_loader,
    num_epochs=3,
    learning_rate=0.001,
    max_grad_norm=1.0,
    weight_decay=1e-5,
    model_save_path='./models/trained_model_weights_ELSTM_loc1.pth'
)

# Save the weights of the model as returned by the training helper.
torch.save(model.state_dict(), './models/final_model.pth')

# Visualize the training run: loss curves (left) and learning rate (right).

# savefig does not create missing directories, so make sure './results' exists.
os.makedirs('./results', exist_ok=True)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, (loss_ax, lr_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(train_losses, label='Train Loss')
loss_ax.plot(val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.set_title('Loss during Training')

lr_ax.plot(learning_rates)
lr_ax.set_xlabel('Epoch')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Learning Rate Schedule')

fig.tight_layout()
fig.savefig('./results/training_curves.png')
plt.show()


In [None]:

import torch
from model import EnhancedLSTMModel

# Evaluate the trained model on the test split
print("Тестирование модели...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Re-create the architecture with the same hyperparameters used for training
# and load the saved weights into it.
# NOTE(review): this loads './models/final_model.pth' (the weights saved after
# training), not the checkpoint passed as model_save_path to the training
# helper — confirm which one is meant to be the "best" model.
best_model = EnhancedLSTMModel(
    vocab_size=30522,
    embedding_dim=128,
    hidden_dim=128,
    num_layers=2,
    pad_token_id=0,
    dropout=0.5,
    use_layer_norm=True,
    use_mean_pooling=False
)
# map_location remaps the stored tensors onto the available device, so weights
# saved from a CUDA run can still be loaded on a CPU-only machine.
state_dict = torch.load('./models/final_model.pth', map_location=device)
best_model.load_state_dict(state_dict)
best_model.to(device)
# Switch to inference mode so dropout (p=0.5) is disabled during evaluation.
best_model.eval()

# Metrics on the test loader
metrics, predictions, targets = evaluate_model_on_test(best_model, test_loader, device)

print("\nРЕЗУЛЬТАТЫ ТЕСТИРОВАНИЯ:")
for key, value in metrics.items():
    print(f"{key}: {value}")

# Inspect individual predictions on examples from the test dataset
test_model_predictions(best_model, test_dataset, tokenizer, num_examples=20, device=device)

# Store the results; make sure the output directory exists first.
os.makedirs('./results', exist_ok=True)
save_test_results(metrics, predictions, targets, './results/test_results.csv')

print("Все этапы завершены успешно!")
