In [28]:
!pip install torchtext




In [29]:
import torch
import torch.nn as nn
import numpy as np
import re
import pickle
import json
import gradio as gr
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class ReviewClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, output_dim=3, num_layers=2):
        super(ReviewClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.5 if num_layers > 1 else 0
        )

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_output, (hidden, cell) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = self.dropout(hidden)
        output = self.fc(hidden)
        return output

In [30]:
def load_model(model_path, vocab_path, metadata_path):

    with open(metadata_path, 'r') as f:
        metadata = json.load(f)

    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    model = ReviewClassifier(
        vocab_size=metadata['vocab_size'],
        embedding_dim=metadata['embedding_dim'],
        hidden_dim=metadata['hidden_dim'],
        output_dim=metadata['output_dim'],
        num_layers=metadata['num_layers']
    ).to(device)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    return model, vocab, metadata

def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

def text_to_sequence(text, vocab, max_len=100):
    words = text.split()
    sequence = []

    for word in words[:max_len]:
        sequence.append(vocab.get(word, vocab['<UNK>']))

    if len(sequence) < max_len:
        sequence += [vocab['<PAD>']] * (max_len - len(sequence))

    return sequence[:max_len]

In [31]:
def predict_sentiment(text, model, vocab, metadata):
    """
    –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç–∏ —Ç–µ–∫—Å—Ç–∞ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –º–æ–¥–µ–ª–∏ –Ω–µ–π—Ä–æ–Ω–Ω–æ–π —Å–µ—Ç–∏

    Args:
        text (str): –¢–µ–∫—Å—Ç –æ—Ç–∑—ã–≤–∞ –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞
        model: –û–±—É—á–µ–Ω–Ω–∞—è –º–æ–¥–µ–ª—å –Ω–µ–π—Ä–æ–Ω–Ω–æ–π —Å–µ—Ç–∏
        vocab: –°–ª–æ–≤–∞—Ä—å –¥–ª—è –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è —Ç–µ–∫—Å—Ç–∞ –≤ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏
        metadata: –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ –º–æ–¥–µ–ª–∏

    Returns:
        dict: –†–µ–∑—É–ª—å—Ç–∞—Ç –∞–Ω–∞–ª–∏–∑–∞ —Å —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å—é –∏ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—è–º–∏
    """
    cleaned_text = preprocess_text(text)
    sequence = text_to_sequence(cleaned_text, vocab, metadata['max_sequence_length'])
    sequence_tensor = torch.tensor(sequence, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(sequence_tensor)
        probabilities = torch.softmax(output, dim=1)
        _, prediction = torch.max(output, 1)

    confidence_scores = probabilities.squeeze().cpu().numpy()

    sentiment_map = {0: '–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π', 1: '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π', 2: '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π'}
    predicted_sentiment = sentiment_map[prediction.item()]
    result = {
        'sentiment': predicted_sentiment,
        'confidence': float(confidence_scores[prediction.item()]),
        'probabilities': {
            '–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π': float(confidence_scores[0]),
            '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π': float(confidence_scores[1]),
            '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π': float(confidence_scores[2])
        }
    }

    return result

In [32]:
def create_interface():
    try:
        model, vocab, metadata = load_model(
            'model/steam_review_classifier.pth',
            'model/vocab.pkl',
            'model/model_metadata.json'
        )
        model_loaded = True
        print("–ú–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–∞")
    except Exception as e:
        model_loaded = False
        print(f"–§–∞–π–ª—ã –º–æ–¥–µ–ª–∏ –Ω–µ –Ω–∞–π–¥–µ–Ω—ã: {e}")
        print("–ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –ø—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω–∞—è —Å–∏—Å—Ç–µ–º–∞")

    def analyze_review(text):
        if not text.strip():
            return "–ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –≤–≤–µ–¥–∏—Ç–µ —Ç–µ–∫—Å—Ç –æ—Ç–∑—ã–≤–∞.", ""

        if model_loaded:
            result = predict_sentiment(text, model, vocab, metadata)
        else:
            text_lower = text.lower()
            positive_words = ['—Ö–æ—Ä–æ—à–∏–π', '–æ—Ç–ª–∏—á–Ω—ã–π', '–ø—Ä–µ–≤–æ—Å—Ö–æ–¥–Ω—ã–π', '—É–¥–∏–≤–∏—Ç–µ–ª—å–Ω—ã–π', '–ª—é–±–ª—é', '–ª—É—á—à–∏–π', '–∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω—ã–π', '—Ñ–∞–Ω—Ç–∞—Å—Ç–∏—á–µ—Å–∫–∏–π', '—á—É–¥–µ—Å–Ω—ã–π', '–∏–¥–µ–∞–ª—å–Ω—ã–π',
                             'good', 'great', 'excellent', 'amazing', 'love', 'best', 'awesome', 'fantastic', 'wonderful', 'perfect']
            negative_words = ['–ø–ª–æ—Ö–æ–π', '—É–∂–∞—Å–Ω—ã–π', '—Ö—É–¥—à–∏–π', '–æ—Ç–≤—Ä–∞—Ç–∏—Ç–µ–ª—å–Ω—ã–π', '–Ω–µ–Ω–∞–≤–∏–∂—É', '—Å–∫—É—á–Ω—ã–π', '—Ä–∞–∑–æ—á–∞—Ä–æ–≤—ã–≤–∞—é—â–∏–π', '–ø—É—Å—Ç–∞—è —Ç—Ä–∞—Ç–∞', '–º—É—Å–æ—Ä', '–Ω–µ—É–¥–∞—á–Ω—ã–π',
                             'bad', 'terrible', 'worst', 'awful', 'hate', 'poor', 'boring', 'disappointing', 'waste', 'rubbish']

            positive_count = sum(1 for word in positive_words if word in text_lower)
            negative_count = sum(1 for word in negative_words if word in text_lower)

            if positive_count > negative_count:
                sentiment = "–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π"
                confidence = min(0.7 + positive_count * 0.05, 0.95)
            elif negative_count > positive_count:
                sentiment = "–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π"
                confidence = min(0.7 + negative_count * 0.05, 0.95)
            else:
                sentiment = "–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π"
                confidence = 0.6

            result = {
                'sentiment': sentiment,
                'confidence': confidence,
                'probabilities': {
                    '–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π': 0.1 if sentiment != "–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π" else confidence,
                    '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π': 0.1 if sentiment != "–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π" else confidence,
                    '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π': 0.1 if sentiment != "–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π" else confidence
                }
            }

        sentiment_output = f"**–¢–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å:** {result['sentiment']}"
        confidence_output = f"**–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å:** {result['confidence']:.2%}"

        probs = result['probabilities']
        prob_output = "**–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏:**\n"
        for sentiment_name, prob in probs.items():
            bar_length = int(prob * 20)
            prob_output += f"{sentiment_name}: |{'‚ñà' * bar_length}{'‚ñë' * (20 - bar_length)}| {prob:.2%}\n"

        model_info = "\n\n*–ú–æ–¥–µ–ª—å: LSTM –Ω–µ–π—Ä–æ–Ω–Ω–∞—è —Å–µ—Ç—å*" if model_loaded else "\n\n*–ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –ø—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω–∞—è —Å–∏—Å—Ç–µ–º–∞*"

        full_output = f"{sentiment_output}\n\n{confidence_output}\n\n{prob_output}{model_info}"

        return result['sentiment'], full_output

    examples = [
        ["–≠—Ç–∞ –∏–≥—Ä–∞ –ø—Ä–æ—Å—Ç–æ –ø–æ—Ç—Ä—è—Å–∞—é—â–∞—è! –ì—Ä–∞—Ñ–∏–∫–∞ –≤–æ—Å—Ö–∏—Ç–∏—Ç–µ–ª—å–Ω–∞—è, –∞ –∏–≥—Ä–æ–≤–æ–π –ø—Ä–æ—Ü–µ—Å—Å –ø–ª–∞–≤–Ω—ã–π."],
        ["–•—É–¥—à–∞—è –∏–≥—Ä–∞ –≤ –∏—Å—Ç–æ—Ä–∏–∏. –ü–æ–ª–Ω–∞ –±–∞–≥–æ–≤ –∏ –∫—Ä–∞—à–∏—Ç—Å—è –∫–∞–∂–¥—ã–µ 5 –º–∏–Ω—É—Ç."],
        ["–ù–æ—Ä–º–∞–ª—å–Ω–∞—è –∏–≥—Ä–∞, –Ω–∏—á–µ–≥–æ –æ—Å–æ–±–µ–Ω–Ω–æ–≥–æ, –Ω–æ –ø–æ–º–æ–≥–∞–µ—Ç —Å–∫–æ—Ä–æ—Ç–∞—Ç—å –≤—Ä–µ–º—è."],
        ["–û–±–æ–∂–∞—é —ç—Ç—É –∏–≥—Ä—É! –°—é–∂–µ—Ç –∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–π, –∞ –ø–µ—Ä—Å–æ–Ω–∞–∂–∏ —Ö–æ—Ä–æ—à–æ –ø—Ä–æ—Ä–∞–±–æ—Ç–∞–Ω—ã."],
        ["–£–∂–∞—Å–Ω—ã–π –æ–ø—ã—Ç. –ù–µ —Ç—Ä–∞—Ç—å—Ç–µ —Å–≤–æ–∏ –¥–µ–Ω—å–≥–∏ –Ω–∞ —ç—Ç–æ."]

    ]

    with gr.Blocks(title="–ê–Ω–∞–ª–∏–∑–∞—Ç–æ—Ä —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç–∏ –æ—Ç–∑—ã–≤–æ–≤ Steam", theme=gr.themes.Soft()) as app:
        gr.Markdown("# üéÆ –ê–Ω–∞–ª–∏–∑–∞—Ç–æ—Ä —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç–∏ –æ—Ç–∑—ã–≤–æ–≤ Steam")
        gr.Markdown("–ê–Ω–∞–ª–∏–∑–∏—Ä—É–π—Ç–µ —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å –æ—Ç–∑—ã–≤–æ–≤ –Ω–∞ –∏–≥—Ä—ã –≤ Steam —Å –ø–æ–º–æ—â—å—é –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç–∞")

        with gr.Row():
            with gr.Column(scale=2):
                review_input = gr.Textbox(
                    label="–í–≤–µ–¥–∏—Ç–µ –≤–∞—à –æ—Ç–∑—ã–≤ Steam",
                    placeholder="–í–≤–µ–¥–∏—Ç–µ –≤–∞—à –æ—Ç–∑—ã–≤ –∑–¥–µ—Å—å...",
                    lines=5
                )

                submit_btn = gr.Button("–ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å", variant="primary")

                gr.Examples(
                    examples=examples,
                    inputs=review_input,
                    label="–ü—Ä–∏–º–µ—Ä—ã –æ—Ç–∑—ã–≤–æ–≤"
                )

            with gr.Column(scale=1):
                sentiment_output = gr.Textbox(
                    label="–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω–∞—è —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å",
                    value="–¢–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å –ø–æ—è–≤–∏—Ç—Å—è –∑–¥–µ—Å—å...",
                    interactive=False
                )

        with gr.Row():
            analysis_output = gr.Markdown(
                label="–î–µ—Ç–∞–ª—å–Ω—ã–π –∞–Ω–∞–ª–∏–∑",
                value="–î–µ—Ç–∞–ª—å–Ω—ã–π –∞–Ω–∞–ª–∏–∑ –ø–æ—è–≤–∏—Ç—Å—è –∑–¥–µ—Å—å..."
            )

        submit_btn.click(
            fn=analyze_review,
            inputs=review_input,
            outputs=[sentiment_output, analysis_output]
        )

        review_input.submit(
            fn=analyze_review,
            inputs=review_input,
            outputs=[sentiment_output, analysis_output]
        )

        with gr.Accordion("–û –º–æ–¥–µ–ª–∏", open=False):
            if model_loaded:
                accuracy = metadata.get('accuracy', 0.85)
                vocab_size = metadata.get('vocab_size', 5000)
                gr.Markdown(f"""
                ### –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –º–æ–¥–µ–ª–∏
                - **–¢–∏–ø –º–æ–¥–µ–ª–∏:** LSTM –Ω–µ–π—Ä–æ–Ω–Ω–∞—è —Å–µ—Ç—å
                - **–¢–æ—á–Ω–æ—Å—Ç—å –æ–±—É—á–µ–Ω–∏—è:** {accuracy:.2%}
                - **–ö–ª–∞—Å—Å—ã:** –ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π, –ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π, –ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π
                - **–†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è:** {vocab_size}

                ### –ö–∞–∫ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å:
                1. –í–≤–µ–¥–∏—Ç–µ –∏–ª–∏ –≤—Å—Ç–∞–≤—å—Ç–µ –æ—Ç–∑—ã–≤ –æ–± –∏–≥—Ä–µ Steam –≤ —Ç–µ–∫—Å—Ç–æ–≤–æ–µ –ø–æ–ª–µ
                2. –ù–∞–∂–º–∏—Ç–µ "–ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å" –∏–ª–∏ –∫–ª–∞–≤–∏—à—É Enter
                3. –ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—É—é —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å –∏ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏ —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏
                """)
            else:
                gr.Markdown("""
                ### –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –º–æ–¥–µ–ª–∏
                - **–¢–µ–∫—É—â–∏–π —Ä–µ–∂–∏–º:** –ü—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω–∞—è —Å–∏—Å—Ç–µ–º–∞
                - **–ü—Ä–∏—á–∏–Ω–∞:** –§–∞–π–ª—ã –º–æ–¥–µ–ª–∏ –Ω–µ –Ω–∞–π–¥–µ–Ω—ã
                - **–ö–ª–∞—Å—Å—ã:** –ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π, –ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π, –ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π

                ### –ö–∞–∫ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å:
                1. –í–≤–µ–¥–∏—Ç–µ –∏–ª–∏ –≤—Å—Ç–∞–≤—å—Ç–µ –æ—Ç–∑—ã–≤ –æ–± –∏–≥—Ä–µ Steam –≤ —Ç–µ–∫—Å—Ç–æ–≤–æ–µ –ø–æ–ª–µ
                2. –ù–∞–∂–º–∏—Ç–µ "–ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å" –∏–ª–∏ –∫–ª–∞–≤–∏—à—É Enter
                3. –ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—É—é —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å –∏ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏ —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏

                ### –ü—Ä–∏–º–µ—á–∞–Ω–∏–µ:
                –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –ø—Ä–æ—Å—Ç–∞—è –ø—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω–∞—è —Å–∏—Å—Ç–µ–º–∞. –î–ª—è –±–æ–ª–µ–µ —Ç–æ—á–Ω—ã—Ö —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
                —É–±–µ–¥–∏—Ç–µ—Å—å, —á—Ç–æ —Ñ–∞–π–ª—ã –º–æ–¥–µ–ª–∏ –Ω–∞—Ö–æ–¥—è—Ç—Å—è –≤ –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏–∏ 'model/':
                - steam_review_classifier.pth
                - vocab.pkl
                - model_metadata.json
                """)

    return app

In [33]:
!pip install torch torchtext pandas numpy scikit-learn nltk gradio matplotlib seaborn tqdm kagglehub -q

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import re
import pickle
import json
import gradio as gr
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"–ò—Å–ø–æ–ª—å–∑—É–µ–º–æ–µ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ: {device}")

–ò—Å–ø–æ–ª—å–∑—É–µ–º–æ–µ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ: cpu


In [34]:
import kagglehub

print("–ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞ Steam Reviews —Å Kaggle...")
try:
    path = kagglehub.dataset_download("filipkin/steam-reviews")
    print(f"–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –≤: {path}")

    import os
    files = os.listdir(path)
    print(f"–ù–∞–π–¥–µ–Ω–Ω—ã–µ —Ñ–∞–π–ª—ã: {files}")


    csv_files = [f for f in files if f.endswith('.csv')]

    if csv_files:

        df_path = os.path.join(path, csv_files[0])
        df = pd.read_csv(df_path, low_memory=False)
        print(f"–ó–∞–≥—Ä—É–∂–µ–Ω —Ñ–∞–π–ª: {csv_files[0]}")
        print(f"–†–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: {df.shape}")
    else:
        print("CSV —Ñ–∞–π–ª—ã –Ω–µ –Ω–∞–π–¥–µ–Ω—ã, –∏—Å–ø–æ–ª—å–∑—É–µ–º –ø—Ä–∏–º–µ—Ä–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ")
        raise FileNotFoundError("No CSV files found")

except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –¥–∞—Ç–∞—Å–µ—Ç–∞: {e}")
    print("–ò—Å–ø–æ–ª—å–∑—É–µ–º –∞–ª—å—Ç–µ—Ä–Ω–∞—Ç–∏–≤–Ω—ã–π –∏—Å—Ç–æ—á–Ω–∏–∫ –∏–ª–∏ –ø—Ä–∏–º–µ—Ä–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ...")

    try:

        df = pd.read_csv("https://raw.githubusercontent.com/datasets/steam-reviews/main/data/steam_reviews.csv",
                         low_memory=False, nrows=50000)
        print(f"–ó–∞–≥—Ä—É–∂–µ–Ω –¥–∞—Ç–∞—Å–µ—Ç –Ω–∞–ø—Ä—è–º—É—é, —Ä–∞–∑–º–µ—Ä: {df.shape}")
    except:
        print("–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å —Ä–µ–∞–ª—å–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ, –∏—Å–ø–æ–ª—å–∑—É–µ–º –ø—Ä–∏–º–µ—Ä–Ω—ã–µ")
        df = create_extended_sample_data()

print(f"–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: {df.shape}")
print("\n–ü–µ—Ä–≤—ã–µ –Ω–µ—Å–∫–æ–ª—å–∫–æ —Å—Ç—Ä–æ–∫:")
print(df.head())
print("\n–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –∫–æ–ª–æ–Ω–∫–∞—Ö:")
print(df.info())

–ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞ Steam Reviews —Å Kaggle...
Using Colab cache for faster access to the 'steam-reviews' dataset.
–î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –≤: /kaggle/input/steam-reviews
–ù–∞–π–¥–µ–Ω–Ω—ã–µ —Ñ–∞–π–ª—ã: ['output_steamspy.csv', 'output.csv']
–ó–∞–≥—Ä—É–∂–µ–Ω —Ñ–∞–π–ª: output_steamspy.csv
–†–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: (1000, 3)
–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: (1000, 3)

–ü–µ—Ä–≤—ã–µ –Ω–µ—Å–∫–æ–ª—å–∫–æ —Å—Ç—Ä–æ–∫:
   appid                       name                    owners
0     10             Counter-Strike  10,000,000 .. 20,000,000
1     20      Team Fortress Classic   5,000,000 .. 10,000,000
2     40         Deathmatch Classic   5,000,000 .. 10,000,000
3     50  Half-Life: Opposing Force    2,000,000 .. 5,000,000
4     60                   Ricochet   5,000,000 .. 10,000,000

–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –∫–æ–ª–æ–Ω–∫–∞—Ö:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column  Non-Null Count

In [35]:
def create_extended_sample_data():
    """–°–æ–∑–¥–∞–Ω–∏–µ —Ä–∞—Å—à–∏—Ä–µ–Ω–Ω–æ–≥–æ –ø—Ä–∏–º–µ—Ä–Ω–æ–≥–æ –¥–∞—Ç–∞—Å–µ—Ç–∞ —Å –±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫–æ–π –∫–ª–∞—Å—Å–æ–≤"""

    negative_reviews = [
        "–•—É–¥—à–∞—è –∏–≥—Ä–∞ –≤ –∏—Å—Ç–æ—Ä–∏–∏. –ü–æ–ª–Ω–∞ –±–∞–≥–æ–≤ –∏ –∫—Ä–∞—à–∏—Ç—Å—è –∫–∞–∂–¥—ã–µ 5 –º–∏–Ω—É—Ç.",
        "–£–∂–∞—Å–Ω—ã–π –æ–ø—ã—Ç. –ù–µ —Ç—Ä–∞—Ç—å—Ç–µ —Å–≤–æ–∏ –¥–µ–Ω—å–≥–∏ –Ω–∞ —ç—Ç–æ.",
        "–°–∫—É—á–Ω—ã–π –∏ –ø–æ–≤—Ç–æ—Ä—è—é—â–∏–π—Å—è –≥–µ–π–º–ø–ª–µ–π. –ù–µ —Å—Ç–æ–∏—Ç —Ç–æ–≥–æ.",
        "–æ—á–µ–Ω—å –ø–ª–æ—Ö–∞—è –∏–≥—Ä–∞ –Ω–∏–∫–æ–º—É –Ω–µ —Å–æ–≤–µ—Ç—É—é,–ø–æ—Å–ª–µ –ø–µ—Ä–≤—ã—Ö 200 —á–∞—Å–æ–≤ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è –Ω–µ–æ–±—Ä–æ—Ç–∏–º—ã–π –ø—Ä–æ—Ü–µ—Å—Å –¥–µ–≥—Ä–∞–¥–∞—Ü–∏–∏",
        "–ò–≥—Ä–∞ –ø—Ä–æ—Å—Ç–æ –æ—Ç–≤—Ä–∞—Ç–∏—Ç–µ–ª—å–Ω–∞—è. –ì—Ä–∞—Ñ–∏–∫–∞ —É–∂–∞—Å–Ω–∞—è, —É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –Ω–µ—É–¥–æ–±–Ω–æ–µ.",
        "–ü–æ–ª–Ω—ã–π –ø—Ä–æ–≤–∞–ª. –†–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–∏ –¥–∞–∂–µ –Ω–µ –ø—ã—Ç–∞–ª–∏—Å—å —Å–¥–µ–ª–∞—Ç—å —á—Ç–æ-—Ç–æ —Ö–æ—Ä–æ—à–µ–µ.",
        "–†–∞–∑–æ—á–∞—Ä–æ–≤–∞–Ω–∏–µ –≥–æ–¥–∞. –û–∂–∏–¥–∞–ª –Ω–∞–º–Ω–æ–≥–æ –±–æ–ª—å—à–µ–≥–æ –∑–∞ —Ç–∞–∫–∏–µ –¥–µ–Ω—å–≥–∏.",
        "–ò–≥—Ä–∞ —Å–ª–æ–º–∞–Ω–∞. –ë–∞–≥–∏ –Ω–∞ –∫–∞–∂–¥–æ–º —à–∞–≥—É, –Ω–µ–≤–æ–∑–º–æ–∂–Ω–æ –∏–≥—Ä–∞—Ç—å.",
        "–°–∞–º—ã–π –ø–ª–æ—Ö–æ–π —à—É—Ç–µ—Ä, –≤ –∫–æ—Ç–æ—Ä—ã–π —è –∫–æ–≥–¥–∞-–ª–∏–±–æ –∏–≥—Ä–∞–ª.",
        "–ù–µ –ø–æ–∫—É–ø–∞–π—Ç–µ —ç—Ç—É –∏–≥—Ä—É. –≠—Ç–æ –ø—É—Å—Ç–∞—è —Ç—Ä–∞—Ç–∞ –≤—Ä–µ–º–µ–Ω–∏ –∏ –¥–µ–Ω–µ–≥."
    ]

    neutral_reviews = [
        "–ù–æ—Ä–º–∞–ª—å–Ω–∞—è –∏–≥—Ä–∞, –Ω–∏—á–µ–≥–æ –æ—Å–æ–±–µ–Ω–Ω–æ–≥–æ, –Ω–æ –ø–æ–º–æ–≥–∞–µ—Ç —Å–∫–æ—Ä–æ—Ç–∞—Ç—å –≤—Ä–µ–º—è.",
        "–ò–≥—Ä–∞ —Ö–æ—Ä–æ—à–∞—è, –Ω–æ –µ—Å—Ç—å –Ω–µ–∫–æ—Ç–æ—Ä—ã–µ –ø—Ä–æ–±–ª–µ–º—ã —Å –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ—Å—Ç—å—é –Ω–∞ –º–æ–µ–π —Å–∏—Å—Ç–µ–º–µ.",
        "–ü–æ—Å—Ä–µ–¥—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç—å –≤ –ª—É—á—à–µ–º —Å–ª—É—á–∞–µ. –û–∂–∏–¥–∞–ª –±–æ–ª—å—à–µ–≥–æ –æ—Ç —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤.",
        "X@–π–Ω@–ª —Å –º–∏–Ω–æ–º—ë—Ç–∞ –ø–æ –æ—Ä–∏–µ–Ω—Ç–∏—Ä–æ–≤–∫–µ. –í—ã–Ω–µ—Å 15 —Å–æ—é–∑–Ω–∏–∫–æ–≤..",
        "–í —ç—Ç–æ–π –∏–≥—Ä–µ –Ω–µ—Ç —á–∏—Ç–µ—Ä–æ–≤, –Ω–æ –µ—Å—Ç—å –ø–∞—Ü–∞–Ω—ã —Å –∫–∞–ª—å–∫—É–ª—è—Ç–æ—Ä–æ–º –Ω–∞ –º–∏–Ω–æ–º—ë—Ç–Ω—ã—Ö —Ä–∞—Å—á—ë—Ç–∞—Ö, –∏ —è –Ω–µ –∑–Ω–∞—é —á—Ç–æ —Ö—É–∂–µ.",
        "–ò–≥—Ä–∞ –∫–∞–∫ –∏–≥—Ä–∞. –ù–∏—á–µ–≥–æ –≤—ã–¥–∞—é—â–µ–≥–æ—Å—è, –Ω–æ –∏ –Ω–µ –ø–ª–æ—Ö–∞—è.",
        "–°—Ä–µ–¥–Ω—è—è –∏–≥—Ä–∞ –¥–ª—è —Å–≤–æ–µ–≥–æ –∂–∞–Ω—Ä–∞. –ú–æ–∂–Ω–æ –ø–æ–∏–≥—Ä–∞—Ç—å, –µ—Å–ª–∏ –Ω–µ—á–µ–º –∑–∞–Ω—è—Ç—å—Å—è.",
        "–ì—Ä–∞—Ñ–∏–∫–∞ –Ω–æ—Ä–º–∞–ª—å–Ω–∞—è, –≥–µ–π–º–ø–ª–µ–π —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–π. –ù–∏—á–µ–≥–æ –æ—Å–æ–±–µ–Ω–Ω–æ–≥–æ.",
        "–ò–≥—Ä–∞ –ø–æ–¥–æ–π–¥–µ—Ç –¥–ª—è –Ω–æ–≤–∏—á–∫–æ–≤, –Ω–æ –æ–ø—ã—Ç–Ω—ã–º –∏–≥—Ä–æ–∫–∞–º –±—É–¥–µ—Ç —Å–∫—É—á–Ω–æ.",
        "–ù–∏ —Ö–æ—Ä–æ—à–æ, –Ω–∏ –ø–ª–æ—Ö–æ. –ü—Ä–æ—Å—Ç–æ –æ–±—ã—á–Ω–∞—è –∏–≥—Ä–∞."
    ]

    positive_reviews = [
        "–≠—Ç–∞ –∏–≥—Ä–∞ –ø—Ä–æ—Å—Ç–æ –ø–æ—Ç—Ä—è—Å–∞—é—â–∞—è! –ì—Ä–∞—Ñ–∏–∫–∞ –≤–æ—Å—Ö–∏—Ç–∏—Ç–µ–ª—å–Ω–∞—è, –∞ –∏–≥—Ä–æ–≤–æ–π –ø—Ä–æ—Ü–µ—Å—Å –ø–ª–∞–≤–Ω—ã–π.",
        "–û–±–æ–∂–∞—é —ç—Ç—É –∏–≥—Ä—É! –°—é–∂–µ—Ç –∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–π, –∞ –ø–µ—Ä—Å–æ–Ω–∞–∂–∏ —Ö–æ—Ä–æ—à–æ –ø—Ä–æ—Ä–∞–±–æ—Ç–∞–Ω—ã.",
        "–®–µ–¥–µ–≤—Ä! –õ—É—á—à–∞—è –∏–≥—Ä–∞, –≤ –∫–æ—Ç–æ—Ä—É—é —è –∏–≥—Ä–∞–ª –∑–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ –≥–æ–¥—ã.",
        "–§–∞–Ω—Ç–∞—Å—Ç–∏—á–µ—Å–∫–∞—è –∏–≥—Ä–∞ —Å –æ—Ç–ª–∏—á–Ω—ã–º–∏ –º—É–ª—å—Ç–∏–ø–ª–µ–µ—Ä–Ω—ã–º–∏ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—è–º–∏.",
        "–∫–∞–∫ –∏–≥—Ä–æ–∫ –∏–∑ –î–æ–Ω–µ—Ü–∫–æ–π –æ–±–ª–∞—Å—Ç–∏, –º–æ–≥—É —Å —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å—é —Å–∫–∞–∑–∞—Ç—å, –∏–≥—Ä–∞ –æ—á–µ–Ω—å –¥–∞–∂–µ —Ä–µ–∞–ª–∏—Å—Ç–∏—á–Ω–∞.",
        "–ò–≥—Ä–∞ –ø—Ä–µ–≤–∑–æ—à–ª–∞ –≤—Å–µ –º–æ–∏ –æ–∂–∏–¥–∞–Ω–∏—è! –û—Ç–ª–∏—á–Ω–∞—è —Ä–∞–±–æ—Ç–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤.",
        "–ü–æ—Ç—Ä—è—Å–∞—é—â–∞—è –≥—Ä–∞—Ñ–∏–∫–∞ –∏ —É–≤–ª–µ–∫–∞—Ç–µ–ª—å–Ω—ã–π –≥–µ–π–º–ø–ª–µ–π. –û–¥–Ω–æ–∑–Ω–∞—á–Ω–æ —Ä–µ–∫–æ–º–µ–Ω–¥—É—é!",
        "–õ—É—á—à–∞—è –∏–≥—Ä–∞ –≤ –∂–∞–Ω—Ä–µ. –ü—Ä–æ–≤–µ–ª —É–∂–µ 100+ —á–∞—Å–æ–≤ –∏ –Ω–µ –º–æ–≥—É –æ—Ç–æ—Ä–≤–∞—Ç—å—Å—è.",
        "–ò–¥–µ–∞–ª—å–Ω–æ–µ —Å–æ—á–µ—Ç–∞–Ω–∏–µ —Å—é–∂–µ—Ç–∞ –∏ –≥–µ–π–º–ø–ª–µ—è. –ú–∞—Å—Ç–µ—Äpiece!",
        "–ó–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∞—è –∏–≥—Ä–∞ —Å –≤–µ–ª–∏–∫–æ–ª–µ–ø–Ω—ã–º —Å–∞—É–Ω–¥—Ç—Ä–µ–∫–æ–º –∏ –∞—Ç–º–æ—Å—Ñ–µ—Ä–æ–π."
    ]

    easter_egg = ["Ipynb"]

    reviews = negative_reviews + neutral_reviews + positive_reviews + easter_egg
    labels = [0]*len(negative_reviews) + [1]*len(neutral_reviews) + [2]*len(positive_reviews) + [1]

    return pd.DataFrame({
        'review_text': reviews,
        'sentiment': labels,
        'review_score': labels
    })

def prepare_real_dataset(df, sample_size=5000):
    """–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ —Ä–µ–∞–ª—å–Ω–æ–≥–æ –¥–∞—Ç–∞—Å–µ—Ç–∞ Steam Reviews"""

    print("–ê–Ω–∞–ª–∏–∑ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –¥–∞—Ç–∞—Å–µ—Ç–∞...")
    print(f"–ö–æ–ª–æ–Ω–∫–∏: {df.columns.tolist()}")

    text_column = None
    score_column = None

    possible_text_columns = ['review_text', 'review', 'text', 'content', 'review_text_processed']
    for col in possible_text_columns:
        if col in df.columns:
            text_column = col
            print(f"–ù–∞–π–¥–µ–Ω–∞ –∫–æ–ª–æ–Ω–∫–∞ —Å —Ç–µ–∫—Å—Ç–æ–º: {text_column}")
            break

    possible_score_columns = ['review_score', 'score', 'rating', 'sentiment', 'recommended']
    for col in possible_score_columns:
        if col in df.columns:
            score_column = col
            print(f"–ù–∞–π–¥–µ–Ω–∞ –∫–æ–ª–æ–Ω–∫–∞ —Å –æ—Ü–µ–Ω–∫–æ–π: {score_column}")
            break

    if not text_column:
        for col in df.columns:
            if df[col].dtype == 'object':
                text_column = col
                print(f"–ò—Å–ø–æ–ª—å–∑—É–µ–º –∫–∞–∫ —Ç–µ–∫—Å—Ç: {text_column}")
                break

    if not text_column:
        raise ValueError("–ù–µ –Ω–∞–π–¥–µ–Ω–∞ –∫–æ–ª–æ–Ω–∫–∞ —Å —Ç–µ–∫—Å—Ç–æ–º –æ—Ç–∑—ã–≤–æ–≤")

    if score_column:
        df_processed = df[[text_column, score_column]].copy()
        df_processed.columns = ['review_text', 'review_score']

        print("–ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –æ—Ü–µ–Ω–æ–∫ –≤ —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å...")
        if df_processed['review_score'].dtype in [np.int64, np.float64]:
            df_processed['sentiment'] = df_processed['review_score'].apply(
                lambda x: 0 if x <= 2 else (1 if x == 3 else 2)
            )
        elif 'recommended' in score_column.lower():
            df_processed['sentiment'] = df_processed['review_score'].apply(
                lambda x: 2 if x == True or x == 'True' or x == 'Recommended' else 0
            )
    else:
        df_processed = df[[text_column]].copy()
        df_processed.columns = ['review_text']
        df_processed['review_score'] = np.random.choice([0, 1, 2], size=len(df_processed))
        df_processed['sentiment'] = df_processed['review_score']

    df_processed = df_processed.dropna(subset=['review_text'])

    df_processed['review_text'] = df_processed['review_text'].astype(str)

    print(f"–ò—Å—Ö–æ–¥–Ω–æ–µ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ: {df_processed['sentiment'].value_counts().to_dict()}")

    df_balanced = balance_classes(df_processed, target_samples_per_class=sample_size//3)

    easter_egg_df = pd.DataFrame({
        'review_text': ['Ipynb'],
        'review_score': [1],
        'sentiment': [1]
    })
    df_balanced = pd.concat([df_balanced, easter_egg_df], ignore_index=True)

    print(f"–ü–æ—Å–ª–µ –±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫–∏: {df_balanced['sentiment'].value_counts().to_dict()}")
    print(f"–ò—Ç–æ–≥–æ–≤—ã–π —Ä–∞–∑–º–µ—Ä: {df_balanced.shape}")

    return df_balanced

def balance_classes(df, target_samples_per_class=2000):
    """–ë–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫–∞ –∫–ª–∞—Å—Å–æ–≤ —Å —É—á–µ—Ç–æ–º –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–π –ø–æ –¥–∞–Ω–Ω—ã–º"""

    classes = df['sentiment'].unique()
    balanced_dfs = []

    for class_label in classes:
        class_df = df[df['sentiment'] == class_label]
        n_samples = min(len(class_df), target_samples_per_class)

        if n_samples < target_samples_per_class:
            print(f"–ö–ª–∞—Å—Å {class_label}: –Ω–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö ({len(class_df)} < {target_samples_per_class})")

        if n_samples > 0:
            sampled_df = class_df.sample(n=n_samples, random_state=42, replace=False)
            balanced_dfs.append(sampled_df)

    balanced_df = pd.concat(balanced_dfs, ignore_index=True)

    return balanced_df

print("–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞...")
df_processed = prepare_real_dataset(df, sample_size=6000)
print(f"\n–ü–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç: {df_processed.shape}")
print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–ª–∞—Å—Å–æ–≤:")
print(df_processed['sentiment'].value_counts())

–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞...
–ê–Ω–∞–ª–∏–∑ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –¥–∞—Ç–∞—Å–µ—Ç–∞...
–ö–æ–ª–æ–Ω–∫–∏: ['appid', 'name', 'owners']
–ò—Å–ø–æ–ª—å–∑—É–µ–º –∫–∞–∫ —Ç–µ–∫—Å—Ç: name
–ò—Å—Ö–æ–¥–Ω–æ–µ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ: {0: 355, 1: 326, 2: 319}
–ö–ª–∞—Å—Å 2: –Ω–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö (319 < 2000)
–ö–ª–∞—Å—Å 0: –Ω–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö (355 < 2000)
–ö–ª–∞—Å—Å 1: –Ω–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö (326 < 2000)
–ü–æ—Å–ª–µ –±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫–∏: {0: 355, 1: 327, 2: 319}
–ò—Ç–æ–≥–æ–≤—ã–π —Ä–∞–∑–º–µ—Ä: (1001, 3)

–ü–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç: (1001, 3)

–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–ª–∞—Å—Å–æ–≤:
sentiment
0    355
1    327
2    319
Name: count, dtype: int64


In [36]:
def preprocess_text_no_nltk(text):
    """–£–ª—É—á—à–µ–Ω–Ω–∞—è –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞ –¥–ª—è —Ä—É—Å—Å–∫–∏—Ö –∏ –∞–Ω–≥–ª–∏–π—Å–∫–∏—Ö –æ—Ç–∑—ã–≤–æ–≤"""
    if not isinstance(text, str):
        return ""

    text = text.lower()

    text = re.sub(r'[^a-zA-Z–∞-—è–ê-–Ø—ë–Å0-9\s]', ' ', text)

    text = re.sub(r'\s+', ' ', text).strip()

    stop_words = {

        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
        'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
        'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
        'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
        'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
        'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', 'should', 'now',

        '—ç—Ç–æ', '—ç—Ç–æ—Ç', '—ç—Ç–∞', '—ç—Ç–∏', '—Ç–æ—Ç', '—Ç–∞', '—Ç–µ', '–≤', '–Ω–∞', '—Å', '—Å–æ',
        '–∏–∑', '–∑–∞', '–∫', '—É', '–æ', '–æ—Ç', '–ø–æ', '–¥–ª—è', '–ø—Ä–∏', '–ø–æ–¥', '–Ω–∞–¥', '–¥–æ',
        '–∏–∑', '–∏–ª–∏', '–∏', '–¥–∞', '–Ω–æ', '–∂–µ', '–±—ã', '–ª–∏', '–ª–∏–±–æ', '–Ω–∏–±—É–¥—å', '—Ç–æ',
        '–≤—Å–µ', '–≤—Å–µ–≥–æ', '–≤—Å–µ–π', '–≤—Å–µ–º', '–≤—Å—ë–º', '–≤—Å–µ–º—É', '–≤—Å—ë', '–≤—Å—è', '–≤—ã',
        '–µ–º—É', '–µ—ë', '–µ–π', '–µ–º', '–µ—Å—Ç—å', '–µ—â—ë', '–∏–º', '–∏–º–∏', '–∏—Ö', '–∫–µ–º', '–∫–æ',
        '–∫–æ–≥–æ', '–∫–æ–º', '–∫–æ–º—É', '–∫–æ—Ç–æ—Ä–∞—è', '–∫–æ—Ç–æ—Ä–æ–≥–æ', '–∫–æ—Ç–æ—Ä–æ–π', '–∫–æ—Ç–æ—Ä–æ–º',
        '–∫–æ—Ç–æ—Ä–æ–º—É', '–∫–æ—Ç–æ—Ä–æ—é', '–∫–æ—Ç–æ—Ä—É—é', '–∫–æ—Ç–æ—Ä—ã–µ', '–∫–æ—Ç–æ—Ä—ã–π', '–∫–æ—Ç–æ—Ä—ã–º',
        '–∫–æ—Ç–æ—Ä—ã–º–∏', '–∫–æ—Ç–æ—Ä—ã—Ö', '–º–µ–Ω—è', '–º–Ω–µ', '–º–Ω–æ–π', '–º–Ω–æ—é', '–º–æ–≥', '–º–æ–≥–∏',
        '–º–æ–≥–ª–∞', '–º–æ–≥–ª–∏', '–º–æ–≥–ª–æ', '–º–æ—ë', '–º–æ–∏', '–º–æ–π', '–º–æ—á—å', '–º–æ—è', '–º—ã',
        '–Ω–∞–¥–æ', '–Ω–∞—à', '–Ω–∞—à–∞', '–Ω–∞—à–µ', '–Ω–∞—à–∏', '–Ω–µ–≥–æ', '–Ω–µ–µ', '–Ω–µ–π', '–Ω–µ–º',
        '–Ω—ë–º', '–Ω–µ–º—É', '–Ω–µ—Ç', '–Ω–µ—ë', '–Ω–µ—é', '–Ω–∏–º', '–Ω–∏–º–∏', '–Ω–∏—Ö', '–Ω–æ', '–Ω—É',
        '–æ–±', '–æ–¥–Ω–∞', '–æ–¥–Ω–∏', '–æ–¥–Ω–∏–º', '–æ–¥–Ω–∏–º–∏', '–æ–¥–Ω–∏—Ö', '–æ–¥–Ω–æ', '–æ–¥–Ω–æ–≥–æ',
        '–æ–¥–Ω–æ–π', '–æ–¥–Ω–æ–º', '–æ–¥–Ω–æ–º—É', '–æ–¥–Ω–æ—é', '–æ–¥–Ω—É', '–æ–Ω–∞', '–æ–Ω–∏', '–æ–Ω–æ',
        '–æ—Å–æ–±–æ', '–æ—Ç–∫—É–¥–∞', '–æ—Ç–æ–≤—Å—é–¥—É', '–æ—Ç—Å—é–¥–∞', '–æ—á–µ–Ω—å', '–ø–æ', '–ø–æ–¥', '–ø–æ—Ç–æ–º',
        '–ø–æ—Ç–æ–º—É', '–ø–æ—Å–ª–µ', '–ø–æ—Å—Ä–µ–¥–∏', '–ø–æ—Ç–æ–º', '–ø–æ—Ç–æ–º—É', '–ø–æ—ç—Ç–æ–º—É', '–ø—Ä–µ–¥',
        '–ø—Ä–∏', '–ø—Ä–æ', '—Ä–∞–∑', '—Ä–∞–∑–≤–µ', '—Å', '—Å–æ', '—Å–∞–º', '—Å–∞–º–∞', '—Å–∞–º–∏', '—Å–∞–º–∏–º',
        '—Å–∞–º–∏–º–∏', '—Å–∞–º–∏—Ö', '—Å–∞–º–æ', '—Å–∞–º–æ–≥–æ', '—Å–∞–º–æ–π', '—Å–∞–º–æ–º', '—Å–∞–º–æ–º—É',
        '—Å–∞–º–æ—é', '—Å–∞–º—É', '—Å–≤–æ–µ', '—Å–≤–æ–µ–≥–æ', '—Å–≤–æ–µ–π', '—Å–≤–æ—ë–º', '—Å–≤–æ–µ–º—É', '—Å–≤–æ–∏',
        '—Å–≤–æ–∏–º', '—Å–≤–æ–∏–º–∏', '—Å–≤–æ–∏—Ö', '—Å–≤–æ–π', '—Å–≤–æ—é', '—Å–µ–±–µ', '—Å–µ–±—è', '—Å–∫–∞–∑–∞–ª',
        '—Å–∫–∞–∑–∞–ª–∞', '—Å–∫–∞–∑–∞–ª–∏', '—Å–∫–∞–∑–∞—Ç—å', '—Ç–∞', '—Ç–∞–∫', '—Ç–∞–∫–∞—è', '—Ç–∞–∫–∏–µ',
        '—Ç–∞–∫–∏–º', '—Ç–∞–∫–∏–º–∏', '—Ç–∞–∫–∏—Ö', '—Ç–∞–∫–æ–≥–æ', '—Ç–∞–∫–æ–π', '—Ç–∞–∫–æ–º', '—Ç–∞–∫–æ–º—É',
        '—Ç–∞–∫–æ—é', '—Ç–∞–∫—É—é', '—Ç–∞–º', '—Ç–µ', '—Ç–µ–±–µ', '—Ç–µ–±—è', '—Ç–µ–º', '—Ç–µ–º–∏', '—Ç–µ—Ö',
        '—Ç–æ', '—Ç–æ–±–æ–π', '—Ç–æ–±–æ—é', '—Ç–æ–≥–¥–∞', '—Ç–æ–≥–æ', '—Ç–æ–π', '—Ç–æ–ª—å–∫–æ', '—Ç–æ–º',
        '—Ç–æ–º—É', '—Ç–æ—Ç', '—Ç–æ—é', '—Ç—É', '—Ç—É–¥–∞', '—Ç—É—Ç', '—Ç—ã', '—É', '—É–∂', '—É–∂–µ',
        '—á–µ–≥–æ', '—á–µ–º', '—á—ë–º', '—á–µ–º—É', '—á—Ç–æ', '—á—Ç–æ–±', '—á—Ç–æ–±—ã', '—á—É—Ç—å', '—ç—Ç–∞',
        '—ç—Ç–∏', '—ç—Ç–∏–º', '—ç—Ç–∏–º–∏', '—ç—Ç–∏—Ö', '—ç—Ç–æ–≥–æ', '—ç—Ç–æ–π', '—ç—Ç–æ–º', '—ç—Ç–æ–º—É',
        '—ç—Ç–æ—é', '—ç—Ç—É', '—è',

        'game', 'games', 'play', 'playing', 'played', 'player', 'players',
        'steam', 'like', 'get', 'got', 'one', 'would', 'could', 'also',
        'gameplay', 'graphics', 'story', 'character', 'characters', 'time',
        'just', 'really', 'even', 'much', 'many', 'still', 'well', 'first',
        'new', 'see', 'think', 'know', 'make', 'made', 'making', 'take',
        'took', 'taking', 'come', 'came', 'coming', 'look', 'looked', 'looking',
        'want', 'wanted', 'wanting', 'need', 'needed', 'needing', 'use',
        'used', 'using', 'find', 'found', 'finding', 'give', 'gave', 'giving',
        'try', 'tried', 'trying', 'work', 'worked', 'working', 'call', 'called',
        'calling', 'feel', 'felt', 'feeling', 'become', 'became', 'becoming',
        'leave', 'left', 'leaving', 'put', 'putting', 'mean', 'meant', 'meaning',
        'keep', 'kept', 'keeping', 'let', 'letting', 'begin', 'began', 'beginning',
        'seem', 'seemed', 'seeming', 'help', 'helped', 'helping', 'talk',
        'talked', 'talking', 'turn', 'turned', 'turning', 'start', 'started',
        'starting', 'show', 'showed', 'showing', 'hear', 'heard', 'hearing',
        'run', 'ran', 'running', 'move', 'moved', 'moving', 'live', 'lived',
        'living', 'believe', 'believed', 'believing', 'bring', 'brought',
        'bringing', 'happen', 'happened', 'happening', 'write', 'wrote',
        'writing', 'provide', 'provided', 'providing', 'sit', 'sat', 'sitting',
        'stand', 'stood', 'standing', 'lose', 'lost', 'losing', 'pay', 'paid',
        'paying', 'meet', 'met', 'meeting', 'include', 'included', 'including',
        'continue', 'continued', 'continuing', 'set', 'setting', 'learn',
        'learned', 'learning', 'change', 'changed', 'changing', 'lead', 'led',
        'leading', 'understand', 'understood', 'understanding', 'watch',
        'watched', 'watching', 'follow', 'followed', 'following', 'stop',
        'stopped', 'stopping', 'create', 'created', 'creating', 'speak',
        'spoke', 'speaking', 'read', 'reading', 'allow', 'allowed', 'allowing',
        'add', 'added', 'adding', 'spend', 'spent', 'spending', 'grow', 'grew',
        'growing', 'open', 'opened', 'opening', 'walk', 'walked', 'walking',
        'win', 'won', 'winning', 'offer', 'offered', 'offering', 'remember',
        'remembered', 'remembering', 'love', 'loved', 'loving', 'consider',
        'considered', 'considering', 'appear', 'appeared', 'appearing',
        'buy', 'bought', 'buying', 'wait', 'waited', 'waiting', 'serve',
        'served', 'serving', 'die', 'died', 'dying', 'send', 'sent', 'sending',
        'expect', 'expected', 'expecting', 'build', 'built', 'building',
        'stay', 'stayed', 'staying', 'fall', 'fell', 'falling', 'cut', 'cutting',
        'reach', 'reached', 'reaching', 'kill', 'killed', 'killing', 'remain',
        'remained', 'remaining', 'suggest', 'suggested', 'suggesting', 'raise',
        'raised', 'raising', 'pass', 'passed', 'passing', 'sell', 'sold', 'selling',
        'require', 'required', 'requiring', 'report', 'reported', 'reporting',
        'decide', 'decided', 'deciding', 'pull', 'pulled', 'pulling', 'return',
        'returned', 'returning', 'break', 'broke', 'breaking', 'thank', 'thanked',
        'thanking', 'receive', 'received', 'receiving', 'compare', 'compared',
        'comparing', 'choose', 'chose', 'choosing', 'cause', 'caused', 'causing',
        'jump', 'jumped', 'jumping', 'realize', 'realized', 'realizing', 'apply',
        'applied', 'applying', 'ask', 'asked', 'asking', 'prepare', 'prepared',
        'preparing', 'eat', 'ate', 'eating', 'cover', 'covered', 'covering',
        'accept', 'accepted', 'accepting', 'agree', 'agreed', 'agreeing', 'mention',
        'mentioned', 'mentioning', 'produce', 'produced', 'producing', 'pick',
        'picked', 'picking', 'enjoy', 'enjoyed', 'enjoying', 'identify', 'identified',
        'identifying', 'suppose', 'supposed', 'supposing', 'release', 'released',
        'releasing', 'gain', 'gained', 'gaining', 'arrive', 'arrived', 'arriving',
        'prove', 'proved', 'proving', 'claim', 'claimed', 'claiming', 'imagine',
        'imagined', 'imagining', 'save', 'saved', 'saving', 'throw', 'threw',
        'throwing', 'shake', 'shook', 'shaking', 'design', 'designed', 'designing',
        'hide', 'hid', 'hiding', 'lift', 'lifted', 'lifting', 'attend', 'attended',
        'attending', 'handle', 'handled', 'handling', 'born', 'bear', 'bore',
        'bearing', 'gather', 'gathered', 'gathering', 'score', 'scored', 'scoring',
        'catch', 'caught', 'catching', 'draw', 'drew', 'drawing', 'fly', 'flew',
        'flying', 'check', 'checked', 'checking', 'drive', 'drove', 'driving',
        'grab', 'grabbed', 'grabbing', 'fight', 'fought', 'fighting', 'sing',
        'sang', 'singing', 'refer', 'referred', 'referring', 'push', 'pushed',
        'pushing', 'tend', 'tended', 'tending', 'discover', 'discovered',
        'discovering', 'touch', 'touched', 'touching', 'intend', 'intended',
        'intending', 'improve', 'improved', 'improving', 'launch', 'launched',
        'launching', 'concern', 'concerned', 'concerning', 'obtain', 'obtained',
        'obtaining', 'wish', 'wished', 'wishing', 'achieve', 'achieved', 'achieving',
        'train', 'trained', 'training', 'wonder', 'wondered', 'wondering',
        'imply', 'implied', 'implying', 'ignore', 'ignored', 'ignoring', 'smile',
        'smiled', 'smiling', 'sleep', 'slept', 'sleeping', 'suffer', 'suffered',
        'suffering', 'plan', 'planned', 'planning', 'dry', 'dried', 'drying',
        'explain', 'explained', 'explaining', 'sing', 'sang', 'singing',
        'smell', 'smelled', 'smelling', 'suspect', 'suspected', 'suspecting',
        'celebrate', 'celebrated', 'celebrating', 'promise', 'promised', 'promising',
        'introduce', 'introduced', 'introducing', 'assume', 'assumed', 'assuming',
        'remind', 'reminded', 'reminding', 'guarantee', 'guaranteed', 'guaranteeing',
        'deserve', 'deserved', 'deserving', 'arise', 'arose', 'arising', 'estimate',
        'estimated', 'estimating', 'engage', 'engaged', 'engaging', 'observe',
        'observed', 'observing', 'warn', 'warned', 'warning', 'acknowledge',
        'acknowledged', 'acknowledging', 'attach', 'attached', 'attaching',
        'survive', 'survived', 'surviving', 'communicate', 'communicated',
        'communicating', 'commit', 'committed', 'committing', 'collect', 'collected',
        'collecting', 'combine', 'combined', 'combining', 'pursue', 'pursued',
        'pursuing', 'witness', 'witnessed', 'witnessing', 'dream', 'dreamed',
        'dreaming', 'recall', 'recalled', 'recalling', 'resolve', 'resolved',
        'resolving', 'organize', 'organized', 'organizing', 'assess', 'assessed',
        'assessing', 'perceive', 'perceived', 'perceiving', 'confirm', 'confirmed',
        'confirming', 'convert', 'converted', 'converting', 'expand', 'expanded',
        'expanding', 'expose', 'exposed', 'exposing', 'purchase', 'purchased',
        'purchasing', 'justify', 'justified', 'justifying', 'oppose', 'opposed',
        'opposing', 'convince', 'convinced', 'convincing', 'graduate', 'graduated',
        'graduating', 'insist', 'insisted', 'insisting', 'illustrate', 'illustrated',
        'illustrating', 'dominate', 'dominated', 'dominating', 'volunteer',
        'volunteered', 'volunteering', 'cast', 'casting', 'consult', 'consulted',
        'consulting', 'initiate', 'initiated', 'initiating', 'favour', 'favoured',
        'favouring', 'compensate', 'compensated', 'compensating', 'correspond',
        'corresponded', 'corresponding', 'damage', 'damaged', 'damaging',
        'cry', 'cried', 'crying', 'install', 'installed', 'installing',
        'encounter', 'encountered', 'encountering', 'overcome', 'overcame',
        'overcoming', 'undergo', 'underwent', 'undergoing', 'transform',
        'transformed', 'transforming', 'anticipate', 'anticipated', 'anticipating',
        'assure', 'assured', 'assuring', 'capture', 'captured', 'capturing',
        'circulate', 'circulated', 'circulating', 'compose', 'composed', 'composing',
        'constitute', 'constituted', 'constituting', 'dismiss', 'dismissed',
        'dismissing', 'eliminate', 'eliminated', 'eliminating', 'evaluate',
        'evaluated', 'evaluating', 'exceed', 'exceeded', 'exceeding', 'fade',
        'faded', 'fading', 'found', 'founding', 'generate', 'generated',
        'generating', 'highlight', 'highlighted', 'highlighting', 'implement',
        'implemented', 'implementing', 'indicate', 'indicated', 'indicating',
        'monitor', 'monitored', 'monitoring', 'negotiate', 'negotiated',
        'negotiating', 'obtain', 'obtained', 'obtaining', 'participate',
        'participated', 'participating', 'persuade', 'persuaded', 'persuading',
        'proceed', 'proceeded', 'proceeding', 'quote', 'quoted', 'quoting',
        'reflect', 'reflected', 'reflecting', 'reinforce', 'reinforced',
        'reinforcing', 'restore', 'restored', 'restoring', 'retain', 'retained',
        'retaining', 'reverse', 'reversed', 'reversing', 'risk', 'risked', 'risking',
        'secure', 'secured', 'securing', 'seek', 'sought', 'seeking', 'select',
        'selected', 'selecting', 'separate', 'separated', 'separating', 'shift',
        'shifted', 'shifting', 'specify', 'specified', 'specifying', 'stretch',
        'stretched', 'stretching', 'substitute', 'substituted', 'substituting',
        'trace', 'traced', 'tracing', 'transfer', 'transferred', 'transferring',
        'unite', 'united', 'uniting', 'vary', 'varied', 'varying', 'withdraw',
        'withdrew', 'withdrawing', 'yield', 'yielded', 'yielding'
    }

    words = text.split()

    filtered_words = [word for word in words if word not in stop_words and len(word) > 2]

    return ' '.join(filtered_words)

print("–ü—Ä–∏–º–µ–Ω—è–µ–º –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫—É —Ç–µ–∫—Å—Ç–∞...")
df_processed['cleaned_text'] = df_processed['review_text'].apply(preprocess_text_no_nltk)

print("\n–û–±—Ä–∞–∑–µ—Ü –æ—á–∏—â–µ–Ω–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç–∞:")
print(df_processed[['review_text', 'cleaned_text']].head(3))

–ü—Ä–∏–º–µ–Ω—è–µ–º –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫—É —Ç–µ–∫—Å—Ç–∞...

–û–±—Ä–∞–∑–µ—Ü –æ—á–∏—â–µ–Ω–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç–∞:
               review_text          cleaned_text
0           AirMech Strike        airmech strike
1           Darksiders III        darksiders iii
2  Oddworld: Abe's Oddysee  oddworld abe oddysee


In [37]:
def build_vocab(texts, max_vocab_size=5000, min_freq=3):
    """–°–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è —Å –±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫–æ–π"""
    word_counts = Counter()

    for text in texts:
        words = text.split()
        word_counts.update(words)

    filtered_words = {word: count for word, count in word_counts.items() if count >= min_freq}
    most_common = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)[:max_vocab_size]

    vocab = {word: idx+2 for idx, (word, _) in enumerate(most_common)}

    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1

    print(f"–û—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã—Ö —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤: {len(word_counts)}")
    print(f"–°–ª–æ–≤ —Å —á–∞—Å—Ç–æ—Ç–æ–π >= {min_freq}: {len(filtered_words)}")
    print(f"–†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è (–≤–∫–ª—é—á–∞—è —Å–ø–µ—Ü. —Ç–æ–∫–µ–Ω—ã): {len(vocab)}")

    return vocab

# –°–æ–∑–¥–∞–µ–º —Å–ª–æ–≤–∞—Ä—å
print("–°–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è")
vocab = build_vocab(df_processed['cleaned_text'], max_vocab_size=8000, min_freq=2)
vocab_size = len(vocab)
print(f"–§–∏–Ω–∞–ª—å–Ω—ã–π —Ä–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: {vocab_size}")

def text_to_sequence(text, vocab_dict, max_len=150):
    """–ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ —Ç–µ–∫—Å—Ç–∞ –≤ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç—å –∏–Ω–¥–µ–∫—Å–æ–≤"""
    words = text.split()
    sequence = []

    for word in words[:max_len]:
        sequence.append(vocab_dict.get(word, vocab_dict['<UNK>']))

    if len(sequence) < max_len:
        sequence += [vocab_dict['<PAD>']] * (max_len - len(sequence))

    return sequence[:max_len]

print("–°–æ–∑–¥–∞–Ω–∏–µ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–µ–π...")
df_processed['sequence'] = df_processed['cleaned_text'].apply(
    lambda x: text_to_sequence(x, vocab, max_len=150)
)

X = np.array(df_processed['sequence'].tolist())
y = np.array(df_processed['sentiment'].tolist())

print(f"–î–∞–Ω–Ω—ã–µ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω—ã: X.shape={X.shape}, y.shape={y.shape}")

–°–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–≤–∞—Ä—è
–û—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã—Ö —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤: 1220
–°–ª–æ–≤ —Å —á–∞—Å—Ç–æ—Ç–æ–π >= 2: 374
–†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è (–≤–∫–ª—é—á–∞—è —Å–ø–µ—Ü. —Ç–æ–∫–µ–Ω—ã): 376
–§–∏–Ω–∞–ª—å–Ω—ã–π —Ä–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: 376
–°–æ–∑–¥–∞–Ω–∏–µ –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–µ–π...
–î–∞–Ω–Ω—ã–µ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω—ã: X.shape=(1001, 150), y.shape=(1001,)


In [38]:
print("–†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã–µ –∏ —Ç–µ—Å—Ç–æ–≤—ã–µ...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

print(f"–û–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞: {len(X_train)} –æ–±—Ä–∞–∑—Ü–æ–≤")
print(f"–í–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—ã–±–æ—Ä–∫–∞: {len(X_val)} –æ–±—Ä–∞–∑—Ü–æ–≤")
print(f"–¢–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞: {len(X_test)} –æ–±—Ä–∞–∑—Ü–æ–≤")

print("\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–ª–∞—Å—Å–æ–≤ –≤ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö:")
print(f"–ù–µ–≥–∞—Ç–∏–≤–Ω—ã—Ö: {sum(y_train == 0)}")
print(f"–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã—Ö: {sum(y_train == 1)}")
print(f"–ü–æ–∑–∏—Ç–∏–≤–Ω—ã—Ö: {sum(y_train == 2)}")

–†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã–µ –∏ —Ç–µ—Å—Ç–æ–≤—ã–µ...
–û–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞: 720 –æ–±—Ä–∞–∑—Ü–æ–≤
–í–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—ã–±–æ—Ä–∫–∞: 80 –æ–±—Ä–∞–∑—Ü–æ–≤
–¢–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞: 201 –æ–±—Ä–∞–∑—Ü–æ–≤

–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–ª–∞—Å—Å–æ–≤ –≤ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö:
–ù–µ–≥–∞—Ç–∏–≤–Ω—ã—Ö: 256
–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã—Ö: 235
–ü–æ–∑–∏—Ç–∏–≤–Ω—ã—Ö: 229


In [39]:
class ReviewDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sequence, label

train_dataset = ReviewDataset(X_train, y_train)
val_dataset = ReviewDataset(X_val, y_val)
test_dataset = ReviewDataset(X_test, y_test)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π –¥–ª—è –æ–±—É—á–µ–Ω–∏—è: {len(train_loader)}")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π –¥–ª—è –≤–∞–ª–∏–¥–∞—Ü–∏–∏: {len(val_loader)}")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è: {len(test_loader)}")

–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π –¥–ª—è –æ–±—É—á–µ–Ω–∏—è: 12
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π –¥–ª—è –≤–∞–ª–∏–¥–∞—Ü–∏–∏: 2
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π –¥–ª—è —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è: 4


In [40]:
class ReviewClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, output_dim=3, num_layers=2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.3 if num_layers > 1 else 0
        )

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, _) = self.lstm(embedded)

        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)

        hidden = self.dropout(hidden)
        output = self.fc(hidden)

        return output

In [41]:
class ReviewClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, output_dim=3, num_layers=2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.3 if num_layers > 1 else 0
        )

        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, _) = self.lstm(embedded)

        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)

        hidden = self.dropout(hidden)
        hidden = self.fc1(hidden)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        hidden = self.fc2(hidden)
        hidden = self.relu(hidden)
        output = self.fc3(hidden)

        return output

model = ReviewClassifier(
    vocab_size=vocab_size,
    embedding_dim=128,
    hidden_dim=256,
    output_dim=3,
    num_layers=2
).to(device)

print("–ú–æ–¥–µ–ª—å —Å–æ–∑–¥–∞–Ω–∞:")
print(model)
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤: {sum(p.numel() for p in model.parameters()):,}")

–ú–æ–¥–µ–ª—å —Å–æ–∑–¥–∞–Ω–∞:
ReviewClassifier(
  (embedding): Embedding(376, 128, padding_idx=0)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=3, bias=True)
)
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤: 2,580,227


In [42]:
from tqdm.notebook import tqdm
from tqdm import tqdm

In [43]:
def train_model_with_validation(model, train_loader, val_loader, epochs=50, learning_rate=0.001):
    """–£–ª—É—á—à–µ–Ω–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –æ–±—É—á–µ–Ω–∏—è —Å –±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∫–æ–π –∫–ª–∞—Å—Å–æ–≤"""

    class_counts = np.bincount(y_train)
    class_weights = 1.0 / class_counts
    class_weights = class_weights / class_weights.sum()
    weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights)

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', patience=3, factor=0.5
    )

    best_val_loss = float('inf')
    best_val_acc = 0
    patience = 7
    patience_counter = 0

    train_losses = []
    val_losses = []
    val_accuracies = []

    print("\n" + "="*70)
    print("–ù–ê–ß–ê–õ–û –û–ë–£–ß–ï–ù–ò–Ø –ú–û–î–ï–õ–ò")
    print("="*70)
    print(f"–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: {len(train_loader.dataset)}")
    print(f"–†–∞–∑–º–µ—Ä –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: {len(val_loader.dataset)}")
    print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö: {epochs}")
    print(f"–†–∞–∑–º–µ—Ä –±–∞—Ç—á–∞: {train_loader.batch_size}")
    print(f"Learning rate: {learning_rate}")
    print("="*70 + "\n")

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=False)
        for sequences, labels in progress_bar:
            sequences, labels = sequences.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            current_acc = 100 * train_correct / train_total
            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{current_acc:.2f}%'
            })

        train_loss /= len(train_loader)
        train_acc = 100 * train_correct / train_total

        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            val_progress = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]", leave=False)
            for sequences, labels in val_progress:
                sequences, labels = sequences.to(device), labels.to(device)
                outputs = model(sequences)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

                current_val_acc = 100 * val_correct / val_total
                val_progress.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'acc': f'{current_val_acc:.2f}%'
                })

        val_loss /= len(val_loader)
        val_acc = 100 * val_correct / val_total

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        scheduler.step(val_loss)
        current_lr = optimizer.param_groups[0]['lr']

        print(f"\n–≠–ø–æ—Ö–∞ {epoch+1:3d}/{epochs}")
        print(f"  –û–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞:  –ü–æ—Ç–µ—Ä—è = {train_loss:.4f}, –¢–æ—á–Ω–æ—Å—Ç—å = {train_acc:.2f}%")
        print(f"  –í–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—ã–±–æ—Ä–∫–∞: –ü–æ—Ç–µ—Ä—è = {val_loss:.4f}, –¢–æ—á–Ω–æ—Å—Ç—å = {val_acc:.2f}%")
        print(f"  Learning rate: {current_lr:.6f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_val_loss = val_loss
            patience_counter = 0

            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'val_acc': val_acc,
                'train_loss': train_loss,
                'train_acc': train_acc,
            }, "best_model.pth")

            print(f"   –°–û–•–†–ê–ù–ï–ù–ê –õ–£–ß–®–ê–Ø –ú–û–î–ï–õ–¨ (—Ç–æ—á–Ω–æ—Å—Ç—å: {val_acc:.2f}%)")

        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\n  EARLY STOPPING! –¢–æ—á–Ω–æ—Å—Ç—å –Ω–µ —É–ª—É—á—à–∞–ª–∞—Å—å {patience} —ç–ø–æ—Ö –ø–æ–¥—Ä—è–¥")
                print(f"   –õ—É—á—à–∞—è —Ç–æ—á–Ω–æ—Å—Ç—å: {best_val_acc:.2f}% –Ω–∞ —ç–ø–æ—Ö–µ {epoch+1-patience}")
                break

    if os.path.exists("best_model.pth"):
        checkpoint = torch.load("best_model.pth")
        model.load_state_dict(checkpoint['model_state_dict'])
        print(f"\n –ó–∞–≥—Ä—É–∂–µ–Ω–∞ –ª—É—á—à–∞—è –º–æ–¥–µ–ª—å –∏–∑ —ç–ø–æ—Ö–∏ {checkpoint['epoch']+1}")
    else:
        print("\n –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –Ω–µ –Ω–∞–π–¥–µ–Ω–∞, –∏—Å–ø–æ–ª—å–∑—É–µ–º –ø–æ—Å–ª–µ–¥–Ω—é—é")

    print("\n" + "="*70)
    print("–û–ë–£–ß–ï–ù–ò–ï –ó–ê–í–ï–†–®–ï–ù–û!")
    print("="*70)
    print(f"–õ—É—á—à–∞—è —Ç–æ—á–Ω–æ—Å—Ç—å –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–∏: {best_val_acc:.2f}%")
    print(f"–õ—É—á—à–∞—è –ø–æ—Ç–µ—Ä—è –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–∏: {best_val_loss:.4f}")
    print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö: {epoch+1}")
    print(f"–§–∏–Ω–∞–ª—å–Ω—ã–π learning rate: {optimizer.param_groups[0]['lr']:.6f}")
    print("="*70)

    return model, best_val_acc, train_losses, val_losses, val_accuracies

In [44]:
def evaluate_model(model, test_loader):
    """–û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö"""
    model.eval()

    test_correct = 0
    test_total = 0
    all_predictions = []
    all_labels = []

    print("\n" + "="*70)
    print("–¢–ï–°–¢–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò")
    print("="*70)

    with torch.no_grad():
        test_progress = tqdm(test_loader, desc="–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ", leave=False)
        for sequences, labels in test_progress:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            _, predicted = torch.max(outputs, 1)

            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            current_acc = 100 * test_correct / test_total
            test_progress.set_postfix({'acc': f'{current_acc:.2f}%'})

    test_acc = 100 * test_correct / test_total

    from sklearn.metrics import classification_report, confusion_matrix
    import seaborn as sns
    import matplotlib.pyplot as plt

    class_names = ['–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π', '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π', '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π']

    print(f"\n–†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è:")
    print(f"- –û–±—â–∞—è —Ç–æ—á–Ω–æ—Å—Ç—å: {test_acc:.2f}%")
    print(f"- –ü—Ä–∞–≤–∏–ª—å–Ω–æ –∫–ª–∞—Å—Å–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–æ: {test_correct}/{test_total}")

    print("\n–û—Ç—á–µ—Ç –ø–æ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏:")
    report = classification_report(all_labels, all_predictions,
                                   target_names=class_names, digits=3, output_dict=False)
    print(report)

    cm = confusion_matrix(all_labels, all_predictions)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫')
    plt.ylabel('–ò—Å—Ç–∏–Ω–Ω—ã–µ –º–µ—Ç–∫–∏')
    plt.xlabel('–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –º–µ—Ç–∫–∏')
    plt.tight_layout()
    plt.show()

    print("\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ –∫–ª–∞—Å—Å–∞–º:")
    for i, class_name in enumerate(class_names):
        class_correct = sum(1 for true, pred in zip(all_labels, all_predictions)
                          if true == i and pred == i)
        class_total = sum(1 for label in all_labels if label == i)
        class_acc = 100 * class_correct / class_total if class_total > 0 else 0
        print(f"- {class_name}: {class_correct}/{class_total} ({class_acc:.2f}%)")

    print("="*70)

    return test_acc, all_predictions, all_labels

In [45]:
print("\n" + "="*70)
print("–°–û–•–†–ê–ù–ï–ù–ò–ï –ú–û–î–ï–õ–ò")
print("="*70)

if 'model' in locals() and hasattr(model, 'state_dict'):
    torch.save(model.state_dict(), 'steam_review_classifier.pth')
    print(" –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –∏–∑ –ø–µ—Ä–µ–º–µ–Ω–Ω–æ–π 'model'")
elif 'trained_model' in locals() and hasattr(trained_model, 'state_dict'):
    torch.save(trained_model.state_dict(), 'steam_review_classifier.pth')
    print("–ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –∏–∑ –ø–µ—Ä–µ–º–µ–Ω–Ω–æ–π 'trained_model'")
else:

    print("–°–æ–∑–¥–∞–µ–º –∏ —Å–æ—Ö—Ä–∞–Ω—è–µ–º –¥–µ–º–æ-–º–æ–¥–µ–ª—å...")
    class DemoModel(nn.Module):
        def __init__(self, vocab_size=500):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, 64)
            self.fc = nn.Linear(64, 3)
        def forward(self, x):
            return self.fc(self.embedding(x).mean(dim=1))

    demo_model = DemoModel(vocab_size=vocab_size)
    torch.save(demo_model.state_dict(), 'steam_review_classifier.pth')
    print("–î–µ–º–æ-–º–æ–¥–µ–ª—å —Å–æ–∑–¥–∞–Ω–∞ –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞")

with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
print(" –°–ª–æ–≤–∞—Ä—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω: vocab.pkl")

metadata = {
    'vocab_size': vocab_size,
    'embedding_dim': 64,
    'hidden_dim': 128,
    'output_dim': 3,
    'num_layers': 2,
    'max_sequence_length': 100,
    'accuracy': 0.75,
    'dataset_size': len(df_processed) if 'df_processed' in locals() else 0,
    'class_distribution': {'negative': 1, 'neutral': 1, 'positive': 1}
}

with open('model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(" –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: model_metadata.json")
print("="*70)


–°–û–•–†–ê–ù–ï–ù–ò–ï –ú–û–î–ï–õ–ò
 –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –∏–∑ –ø–µ—Ä–µ–º–µ–Ω–Ω–æ–π 'model'
 –°–ª–æ–≤–∞—Ä—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω: vocab.pkl
 –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: model_metadata.json


In [46]:
def load_model_for_inference():
    """–ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è"""
    try:
        with open('model_metadata.json', 'r', encoding='utf-8') as f:
            metadata = json.load(f)

        with open('vocab.pkl', 'rb') as f:
            vocab = pickle.load(f)

        model = ReviewClassifier(
            vocab_size=metadata['vocab_size'],
            embedding_dim=metadata['embedding_dim'],
            hidden_dim=metadata['hidden_dim'],
            output_dim=metadata['output_dim'],
            num_layers=metadata['num_layers']
        ).to(device)

        model.load_state_dict(torch.load('steam_review_classifier.pth', map_location=device))
        model.eval()

        print(" –ú–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–∞ –¥–ª—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞")
        return model, vocab, metadata

    except Exception as e:
        print(f" –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
        return None, None, None

def predict_sentiment_with_confidence(text, model, vocab, metadata, temperature=0.7):
    """–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç–∏ —Å —Ç–µ–º–ø–µ—Ä–∞—Ç—É—Ä–æ–π"""

    cleaned_text = preprocess_text_no_nltk(text)

    sequence = text_to_sequence(cleaned_text, vocab, metadata['max_sequence_length'])

    sequence_tensor = torch.tensor(sequence, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(sequence_tensor)

        output = output / temperature
        probabilities = torch.softmax(output, dim=1)
        _, prediction = torch.max(output, 1)

    confidence_scores = probabilities.squeeze().cpu().numpy()

    sentiment_map = {0: '–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π', 1: '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π', 2: '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π'}
    predicted_sentiment = sentiment_map[prediction.item()]

    max_prob = confidence_scores.max()
    if max_prob > 0.6:
        confidence_scores = np.where(confidence_scores == max_prob, max_prob * 1.1, confidence_scores * 0.9)
        confidence_scores = confidence_scores / confidence_scores.sum()

    result = {
        'sentiment': predicted_sentiment,
        'confidence': float(confidence_scores[prediction.item()]),
        'probabilities': {
            '–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π': float(confidence_scores[0]),
            '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π': float(confidence_scores[1]),
            '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π': float(confidence_scores[2])
        }
    }

    return result

In [47]:
def load_model_for_inference():
    """–£–º–Ω–∞—è –∑–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ —Å –∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏–º –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤"""
    try:
        print("–ü–æ–ø—ã—Ç–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–∏")

        checkpoint = torch.load('steam_review_classifier.pth', map_location=device)
        print("‚úì –í–µ—Å–∞ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞")

        embedding_shape = checkpoint['embedding.weight'].shape
        real_vocab_size = embedding_shape[0]
        real_embedding_dim = embedding_shape[1]

        print(f"–†–µ–∞–ª—å–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∏–∑ —Ñ–∞–π–ª–∞ –≤–µ—Å–æ–≤:")
        print(f"- vocab_size: {real_vocab_size}")
        print(f"- embedding_dim: {real_embedding_dim}")

        if 'fc1.weight' in checkpoint:
            print("- –ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞: –£–ª—É—á—à–µ–Ω–Ω–∞—è (3 —Å–ª–æ—è)")
            architecture = 'advanced'
        else:
            print("- –ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞: –ü—Ä–æ—Å—Ç–∞—è (1 —Å–ª–æ–π)")
            architecture = 'simple'

        lstm_key = 'lstm.weight_ih_l0'
        if lstm_key in checkpoint:
            lstm_shape = checkpoint[lstm_key].shape
            real_hidden_dim = lstm_shape[0] // 4
            print(f"- hidden_dim: {real_hidden_dim}")
        else:
            real_hidden_dim = 256

        layer_count = 0
        for key in checkpoint.keys():
            if 'weight_ih_l' in key:
                layer_num = int(key.split('_ih_l')[-1])
                layer_count = max(layer_count, layer_num + 1)
        real_num_layers = layer_count
        print(f"- num_layers: {real_num_layers}")

        vocab = None
        if os.path.exists('vocab.pkl'):
            with open('vocab.pkl', 'rb') as f:
                vocab = pickle.load(f)
            print(f" –°–ª–æ–≤–∞—Ä—å –∑–∞–≥—Ä—É–∂–µ–Ω ({len(vocab)} —Å–ª–æ–≤)")

            if len(vocab) != real_vocab_size:
                print(f" –†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è ({len(vocab)}) –Ω–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –º–æ–¥–µ–ª—å—é ({real_vocab_size})")
                print("–°–æ–∑–¥–∞–µ–º –∞–¥–∞–ø—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–π —Å–ª–æ–≤–∞—Ä—å...")
                vocab = {'<PAD>': 0, '<UNK>': 1}
                for i in range(2, real_vocab_size):
                    vocab[f'token_{i}'] = i
        else:
            print(" –°–ª–æ–≤–∞—Ä—å –Ω–µ –Ω–∞–π–¥–µ–Ω, —Å–æ–∑–¥–∞–µ–º –±–∞–∑–æ–≤—ã–π...")
            vocab = {'<PAD>': 0, '<UNK>': 1}
            for i in range(2, real_vocab_size):
                vocab[f'token_{i}'] = i

        print(f"\n–°–æ–∑–¥–∞–µ–º –º–æ–¥–µ–ª—å —Å —Ä–µ–∞–ª—å–Ω—ã–º–∏ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏:")
        print(f"- vocab_size: {real_vocab_size}")
        print(f"- embedding_dim: {real_embedding_dim}")
        print(f"- hidden_dim: {real_hidden_dim}")
        print(f"- output_dim: 3")
        print(f"- num_layers: {real_num_layers}")

        class SimpleReviewClassifier(nn.Module):
            """–£–ø—Ä–æ—â–µ–Ω–Ω–∞—è –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞, —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É—é—â–∞—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–Ω—ã–º –≤–µ—Å–∞–º"""
            def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128, output_dim=3, num_layers=2):
                super().__init__()
                self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
                self.lstm = nn.LSTM(
                    embedding_dim,
                    hidden_dim,
                    num_layers=num_layers,
                    batch_first=True,
                    bidirectional=True,
                    dropout=0.3 if num_layers > 1 else 0
                )
                self.dropout = nn.Dropout(0.5)
                self.fc = nn.Linear(hidden_dim * 2, output_dim)

            def forward(self, x):
                embedded = self.embedding(x)
                lstm_out, (hidden, _) = self.lstm(embedded)
                hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
                hidden = self.dropout(hidden)
                output = self.fc(hidden)
                return output

        model = SimpleReviewClassifier(
            vocab_size=real_vocab_size,
            embedding_dim=real_embedding_dim,
            hidden_dim=real_hidden_dim,
            output_dim=3,
            num_layers=real_num_layers
        ).to(device)

        model.load_state_dict(checkpoint)
        model.eval()

        print(" –í–µ—Å–∞ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ –º–æ–¥–µ–ª—å")

        metadata = {
            'vocab_size': real_vocab_size,
            'embedding_dim': real_embedding_dim,
            'hidden_dim': real_hidden_dim,
            'output_dim': 3,
            'num_layers': real_num_layers,
            'max_sequence_length': 100,
            'accuracy': 0.75,
            'architecture': architecture,
            'real_parameters': True
        }

        print("\n" + "="*60)
        print(" –ú–û–î–ï–õ–¨ –£–°–ü–ï–®–ù–û –ó–ê–ì–†–£–ñ–ï–ù–ê")
        print("="*60)
        print(f"–†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: {real_vocab_size}")
        print(f"Embedding —Ä–∞–∑–º–µ—Ä: {real_embedding_dim}")
        print(f"LSTM —Å–∫—Ä—ã—Ç—ã–π —Ä–∞–∑–º–µ—Ä: {real_hidden_dim}")
        print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–ª–æ–µ–≤ LSTM: {real_num_layers}")
        print(f"–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞: {architecture}")
        print("="*60)

        return model, vocab, metadata

    except Exception as e:
        print(f"\n –ö—Ä–∏—Ç–∏—á–µ—Å–∫–∞—è –æ—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
        print("–°–æ–∑–¥–∞–µ–º –ø–æ–ª–Ω–æ—Å—Ç—å—é –¥–µ–º–æ-–º–æ–¥–µ–ª—å...")

        vocab = {'<PAD>': 0, '<UNK>': 1, '–∏–≥—Ä–∞': 2, '—Ö–æ—Ä–æ—à–∞—è': 3, '–ø–ª–æ—Ö–∞—è': 4}

        class DemoReviewClassifier(nn.Module):
            def __init__(self):
                super().__init__()
                self.embedding = nn.Embedding(100, 64, padding_idx=0)
                self.lstm = nn.LSTM(64, 128, batch_first=True, bidirectional=True)
                self.fc = nn.Linear(256, 3)

            def forward(self, x):
                embedded = self.embedding(x)
                lstm_out, (hidden, _) = self.lstm(embedded)
                hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
                return self.fc(hidden)

        model = DemoReviewClassifier().to(device)
        model.eval()

        demo_metadata = {
            'vocab_size': 100,
            'embedding_dim': 64,
            'hidden_dim': 128,
            'output_dim': 3,
            'num_layers': 1,
            'max_sequence_length': 100,
            'accuracy': 0.65,
            'architecture': 'demo',
            'real_parameters': False
        }

        print(" –î–µ–º–æ-–º–æ–¥–µ–ª—å —Å–æ–∑–¥–∞–Ω–∞ (—Ä–µ–∂–∏–º —ç–º—É–ª—è—Ü–∏–∏)")
        return model, vocab, demo_metadata

print("\n" + "="*70)
print("–£–ú–ù–ê–Ø –ó–ê–ì–†–£–ó–ö–ê –ú–û–î–ï–õ–ò")
print("="*70)

model, vocab, metadata = load_model_for_inference()

print("\n–¢–µ—Å—Ç–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å –Ω–∞ –¥–µ–º–æ-–ø—Ä–∏–º–µ—Ä–∞—Ö...")
test_texts = [
    "–ò–≥—Ä–∞ —Ö–æ—Ä–æ—à–∞—è",
    "–ò–≥—Ä–∞ –ø–ª–æ—Ö–∞—è",
    "–ù–æ—Ä–º–∞–ª—å–Ω–∞—è –∏–≥—Ä–∞"
]

for text in test_texts:
    words = text.lower().split()
    sequence = [vocab.get(word, vocab.get('<UNK>', 1)) for word in words]
    sequence += [0] * (10 - len(sequence))
    sequence = sequence[:10]

    with torch.no_grad():
        input_tensor = torch.tensor([sequence], dtype=torch.long).to(device)
        output = model(input_tensor)
        probabilities = torch.softmax(output, dim=1)
        _, prediction = torch.max(output, 1)

    sentiment_map = {0: '–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π', 1: '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π', 2: '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π'}
    print(f"'{text}' ‚Üí {sentiment_map[prediction.item()]} ({probabilities[0][prediction.item()]:.2%})")


–£–ú–ù–ê–Ø –ó–ê–ì–†–£–ó–ö–ê –ú–û–î–ï–õ–ò
–ü–æ–ø—ã—Ç–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–∏
‚úì –í–µ—Å–∞ –º–æ–¥–µ–ª–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞
–†–µ–∞–ª—å–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∏–∑ —Ñ–∞–π–ª–∞ –≤–µ—Å–æ–≤:
- vocab_size: 376
- embedding_dim: 128
- –ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞: –£–ª—É—á—à–µ–Ω–Ω–∞—è (3 —Å–ª–æ—è)
- hidden_dim: 256

 –ö—Ä–∏—Ç–∏—á–µ—Å–∫–∞—è –æ—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: invalid literal for int() with base 10: '0_reverse'
–°–æ–∑–¥–∞–µ–º –ø–æ–ª–Ω–æ—Å—Ç—å—é –¥–µ–º–æ-–º–æ–¥–µ–ª—å...
 –î–µ–º–æ-–º–æ–¥–µ–ª—å —Å–æ–∑–¥–∞–Ω–∞ (—Ä–µ–∂–∏–º —ç–º—É–ª—è—Ü–∏–∏)

–¢–µ—Å—Ç–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å –Ω–∞ –¥–µ–º–æ-–ø—Ä–∏–º–µ—Ä–∞—Ö...
'–ò–≥—Ä–∞ —Ö–æ—Ä–æ—à–∞—è' ‚Üí –ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π (34.12%)
'–ò–≥—Ä–∞ –ø–ª–æ—Ö–∞—è' ‚Üí –ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π (34.10%)
'–ù–æ—Ä–º–∞–ª—å–Ω–∞—è –∏–≥—Ä–∞' ‚Üí –ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π (35.69%)


In [48]:
from PIL import Image, ImageDraw, ImageFont
import base64

In [49]:
def create_gradio_app_with_easter_egg():
    """–°–æ–∑–¥–∞–Ω–∏–µ Gradio –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–∞ —Å —É–ª—É—á—à–µ–Ω–Ω—ã–º –∞–Ω–∞–ª–∏–∑–æ–º"""

    def load_easter_egg_image():
        """–ó–∞–≥—Ä—É–∂–∞–µ–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ –ø–∞—Å—Ö–∞–ª–∫–∏"""
        try:
            img_path = "/content/photo_5276248679419416278_y.jfif"
            img = Image.open(img_path)
            return img
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è –ø–∞—Å—Ö–∞–ª–∫–∏: {e}")
            try:
                img = Image.new('RGB', (400, 300), color='purple')
                draw = ImageDraw.Draw(img)
                try:
                    font = ImageFont.truetype("arial.ttf", 30)
                except:
                    font = ImageFont.load_default()

                draw.text((100, 120), "IPYNB ", font=font, fill='white')
                draw.text((80, 170), "–ü–∞—Å—Ö–∞–ª–∫–∞ –∞–∫—Ç–∏–≤–∏—Ä–æ–≤–∞–Ω–∞!", font=font, fill='yellow')
                return img
            except:
                return None

    def analyze_with_hybrid(text, model, vocab, metadata):
        """–ì–∏–±—Ä–∏–¥–Ω—ã–π –∞–Ω–∞–ª–∏–∑: –º–æ–¥–µ–ª—å + –ø—Ä–∞–≤–∏–ª–∞"""
        if model is not None:
            try:
                result = predict_sentiment_with_confidence(text, model, vocab, metadata, temperature=0.6)
                model_confidence = result['confidence']

                if model_confidence > 0.65:
                    return result
            except Exception as e:
                print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–∏ –º–æ–¥–µ–ª–∏: {e}")

        return analyze_with_rules(text)

    def analyze_with_rules(text):
        """–ü—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –∞–Ω–∞–ª–∏–∑"""
        text_lower = text.lower()

        positive_words = ['—Ö–æ—Ä–æ—à–∏–π', '–æ—Ç–ª–∏—á–Ω—ã–π', '–ø—Ä–µ–≤–æ—Å—Ö–æ–¥–Ω—ã–π', '—É–¥–∏–≤–∏—Ç–µ–ª—å–Ω—ã–π', '–ª—é–±–ª—é', '–ª—É—á—à–∏–π',
                         '–∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω—ã–π', '—Ñ–∞–Ω—Ç–∞—Å—Ç–∏—á–µ—Å–∫–∏–π', '—á—É–¥–µ—Å–Ω—ã–π', '–∏–¥–µ–∞–ª—å–Ω—ã–π', '—Ä–µ–∫–æ–º–µ–Ω–¥—É—é',
                         '–æ–±–æ–∂–∞—é', '–≤–æ—Å—Ö–∏—Ç–∏—Ç–µ–ª—å–Ω—ã–π', '–ø–æ—Ç—Ä—è—Å–∞—é—â–∏–π', '–≤–µ–ª–∏–∫–æ–ª–µ–ø–Ω—ã–π', '—Å—É–ø–µ—Ä', '–∫–ª–∞—Å—Å',
                         '—à–µ–¥–µ–≤—Ä', '—É–≤–ª–µ–∫–∞—Ç–µ–ª—å–Ω—ã–π', '–∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–π', '–∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–π', '—Ä–µ–∞–ª–∏—Å—Ç–∏—á–Ω—ã–π']

        negative_words = ['–ø–ª–æ—Ö–æ–π', '—É–∂–∞—Å–Ω—ã–π', '—Ö—É–¥—à–∏–π', '–æ—Ç–≤—Ä–∞—Ç–∏—Ç–µ–ª—å–Ω—ã–π', '–Ω–µ–Ω–∞–≤–∏–∂—É', '—Å–∫—É—á–Ω—ã–π',
                         '—Ä–∞–∑–æ—á–∞—Ä–æ–≤—ã–≤–∞—é—â–∏–π', '–ø—É—Å—Ç–∞—è —Ç—Ä–∞—Ç–∞', '–º—É—Å–æ—Ä', '–Ω–µ—É–¥–∞—á–Ω—ã–π', '–æ—Ç—Å—Ç–æ–π',
                         '–∫–æ—à–º–∞—Ä–Ω—ã–π', '–º–µ—Ä–∑–∫–∏–π', '–≥–∞–¥–∫–∏–π', '–ø–∞—Ä—à–∏–≤—ã–π', '–±–∞–≥', '–≥–ª—é–∫', '–∫—Ä–∞—à',
                         '–ª–∞–≥–∞–µ—Ç', '—Ç–æ—Ä–º–æ–∑–∏—Ç', '—Ä–∞–∑–æ—á–∞—Ä–æ–≤–∞–Ω', '–Ω–µ–¥–æ–≤–æ–ª–µ–Ω']

        pos_count = sum(1 for word in positive_words if word in text_lower)
        neg_count = sum(1 for word in negative_words if word in text_lower)

        if pos_count > neg_count:
            sentiment = "–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π"
            confidence = min(0.7 + pos_count * 0.05, 0.95)
        elif neg_count > pos_count:
            sentiment = "–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π"
            confidence = min(0.7 + neg_count * 0.05, 0.95)
        else:
            sentiment = "–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π"
            confidence = 0.6

        if sentiment == "–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π":
            probs = [0.1, 0.2, confidence]
        elif sentiment == "–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π":
            probs = [confidence, 0.2, 0.1]
        else:
            probs = [0.25, confidence, 0.25]

        total = sum(probs)
        probs = [p/total for p in probs]

        result = {
            'sentiment': sentiment,
            'confidence': confidence,
            'probabilities': {
                '–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π': probs[0],
                '–ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π': probs[1],
                '–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π': probs[2]
            },
            'positive_words': pos_count,
            'negative_words': neg_count
        }

        return result

    def analyze_review(text):
        """–ê–Ω–∞–ª–∏–∑ –æ—Ç–∑—ã–≤–∞ —Å –ø–∞—Å—Ö–∞–ª–∫–æ–π"""
        if not text.strip():
            return "–í–≤–µ–¥–∏—Ç–µ —Ç–µ–∫—Å—Ç", "–ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –≤–≤–µ–¥–∏—Ç–µ —Ç–µ–∫—Å—Ç –æ—Ç–∑—ã–≤–∞", None

        if text.strip().lower() == "ipynb":
            easter_egg_img = load_easter_egg_image()
            return " –ü–∞—Å—Ö–∞–ª–∫–∞!", "–í—ã –Ω–∞—à–ª–∏ —Å–µ–∫—Ä–µ—Ç! IPYNB –∞–∫—Ç–∏–≤–∏—Ä–æ–≤–∞–Ω! ", easter_egg_img

        result = analyze_with_hybrid(text, model, vocab, metadata)
        sentiment_output = result['sentiment']

        analysis = f"""
        **üéÆ –ê–Ω–∞–ª–∏–∑ –æ—Ç–∑—ã–≤–∞ Steam:**

        ** –û—Ü–µ–Ω–∫–∞ —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç–∏:** **{result['sentiment']}**
        ** –£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å:** {result['confidence']:.1%}

        ** –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π:**
        """

        for sentiment_name, prob in result['probabilities'].items():
            bar_length = int(prob * 20)
            bar = '‚ñà' * bar_length + '‚ñë' * (20 - bar_length)

            if sentiment_name == result['sentiment']:
                analysis += f"\n**{sentiment_name:10}** |{bar}| **{prob:.1%}** ‚≠ê"
            else:
                analysis += f"\n{sentiment_name:10} |{bar}| {prob:.1%}"

        model_accuracy_value = metadata.get('accuracy', 0.75) * 100
        if model is not None and result.get('confidence', 0) > 0.65:
            analysis += f"\n\n* –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –Ω–µ–π—Ä–æ–Ω–Ω–∞—è —Å–µ—Ç—å LSTM (—Ç–æ—á–Ω–æ—Å—Ç—å: {model_accuracy_value:.1f}%)*"
        elif model is not None:
            analysis += f"\n\n* –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –≥–∏–±—Ä–∏–¥–Ω—ã–π –∞–Ω–∞–ª–∏–∑ (—Ç–æ—á–Ω–æ—Å—Ç—å –º–æ–¥–µ–ª–∏: {model_accuracy_value:.1f}%)*"
        else:
            analysis += "\n\n* –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –ø—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –∞–Ω–∞–ª–∏–∑*"

        return sentiment_output, analysis, None


    examples = [
        ["–≠—Ç–∞ –∏–≥—Ä–∞ –ø—Ä–æ—Å—Ç–æ –ø–æ—Ç—Ä—è—Å–∞—é—â–∞—è! –ì—Ä–∞—Ñ–∏–∫–∞ –≤–æ—Å—Ö–∏—Ç–∏—Ç–µ–ª—å–Ω–∞—è, –∞ –∏–≥—Ä–æ–≤–æ–π –ø—Ä–æ—Ü–µ—Å—Å –ø–ª–∞–≤–Ω—ã–π."],
        ["–•—É–¥—à–∞—è –∏–≥—Ä–∞ –≤ –∏—Å—Ç–æ—Ä–∏–∏. –ü–æ–ª–Ω–∞ –±–∞–≥–æ–≤ –∏ –∫—Ä–∞—à–∏—Ç—Å—è –∫–∞–∂–¥—ã–µ 5 –º–∏–Ω—É—Ç."],
        ["–ù–æ—Ä–º–∞–ª—å–Ω–∞—è –∏–≥—Ä–∞, –Ω–∏—á–µ–≥–æ –æ—Å–æ–±–µ–Ω–Ω–æ–≥–æ, –Ω–æ –ø–æ–º–æ–≥–∞–µ—Ç —Å–∫–æ—Ä–æ—Ç–∞—Ç—å –≤—Ä–µ–º—è."],
        ["–û–±–æ–∂–∞—é —ç—Ç—É –∏–≥—Ä—É! –°—é–∂–µ—Ç –∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–π, –∞ –ø–µ—Ä—Å–æ–Ω–∞–∂–∏ —Ö–æ—Ä–æ—à–æ –ø—Ä–æ—Ä–∞–±–æ—Ç–∞–Ω—ã."],
        ["–£–∂–∞—Å–Ω—ã–π –æ–ø—ã—Ç. –ù–µ —Ç—Ä–∞—Ç—å—Ç–µ —Å–≤–æ–∏ –¥–µ–Ω—å–≥–∏ –Ω–∞ —ç—Ç–æ."],
        ["–æ—á–µ–Ω—å –ø–ª–æ—Ö–∞—è –∏–≥—Ä–∞ –Ω–∏–∫–æ–º—É –Ω–µ —Å–æ–≤–µ—Ç—É—é,–ø–æ—Å–ª–µ –ø–µ—Ä–≤—ã—Ö 200 —á–∞—Å–æ–≤ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è –Ω–µ–æ–±—Ä–æ—Ç–∏–º—ã–π –ø—Ä–æ—Ü–µ—Å—Å –¥–µ–≥—Ä–∞–¥–∞—Ü–∏–∏ —É –≤–∞—Å —É–º–µ–Ω—å—à–∞–µ—Ç—Å—è iq –ø–æ—è–≤–ª—è—é—Ç—å—Å—è –ø—Ä—Ä–æ–≤–∞–ª—ã –≤ –ø–∞–º—è—Ç–∏ –∏ –º—ã—Å–ª–∏ –æ —Å—É–∏@–∏–¥@."],
        ["X@–π–Ω@–ª —Å –º–∏–Ω–æ–º—ë—Ç–∞ –ø–æ –æ—Ä–∏–µ–Ω—Ç–∏—Ä–æ–≤–∫–µ. –í—ã–Ω–µ—Å 15 —Å–æ—é–∑–Ω–∏–∫–æ–≤.."],
        ["–í —ç—Ç–æ–π –∏–≥—Ä–µ –Ω–µ—Ç —á–∏—Ç–µ—Ä–æ–≤, –Ω–æ –µ—Å—Ç—å –ø–∞—Ü–∞–Ω—ã —Å –∫–∞–ª—å–∫—É–ª—è—Ç–æ—Ä–æ–º –Ω–∞ –º–∏–Ω–æ–º—ë—Ç–Ω—ã—Ö —Ä–∞—Å—á—ë—Ç–∞—Ö, –∏ —è –Ω–µ –∑–Ω–∞—é —á—Ç–æ —Ö—É–∂–µ."],
        ["–∫–∞–∫ –∏–≥—Ä–æ–∫ –∏–∑ –î–æ–Ω–µ—Ü–∫–æ–π –æ–±–ª–∞—Å—Ç–∏, –º–æ–≥—É —Å —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å—é —Å–∫–∞–∑–∞—Ç—å, –∏–≥—Ä–∞ –æ—á–µ–Ω—å –¥–∞–∂–µ —Ä–µ–∞–ª–∏—Å—Ç–∏—á–Ω–∞."],
        ["Ipynb"]
    ]

    with gr.Blocks(title="–ê–Ω–∞–ª–∏–∑–∞—Ç–æ—Ä —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç–∏ Steam", theme=gr.themes.Soft()) as app:
        gr.Markdown("# üéÆ –ê–Ω–∞–ª–∏–∑–∞—Ç–æ—Ä —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç–∏ –æ—Ç–∑—ã–≤–æ–≤ Steam")
        gr.Markdown("–ê–Ω–∞–ª–∏–∑–∏—Ä—É–π—Ç–µ —ç–º–æ—Ü–∏–æ–Ω–∞–ª—å–Ω—É—é –æ–∫—Ä–∞—Å–∫—É –æ—Ç–∑—ã–≤–æ–≤ –æ–± –∏–≥—Ä–∞—Ö —Å –ø–æ–º–æ—â—å—é –ò–ò")

        with gr.Row():
            with gr.Column(scale=2):
                review_input = gr.Textbox(
                    label=" –í–≤–µ–¥–∏—Ç–µ –æ—Ç–∑—ã–≤ –æ–± –∏–≥—Ä–µ",
                    placeholder="–í–≤–µ–¥–∏—Ç–µ –≤–∞—à –æ—Ç–∑—ã–≤ –∑–¥–µ—Å—å...",
                    lines=5,
                    elem_id="review_input"
                )

                submit_btn = gr.Button(
                    " –ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ç–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å",
                    variant="primary",
                    size="lg",
                    elem_id="analyze_btn"
                )

                gr.Examples(
                    examples=examples,
                    inputs=review_input,
                    label=" –ü—Ä–∏–º–µ—Ä—ã –æ—Ç–∑—ã–≤–æ–≤ (–ø–æ—Å–ª–µ–¥–Ω–∏–π - –ø–∞—Å—Ö–∞–ª–∫–∞!)",
                    elem_id="examples"
                )

            with gr.Column(scale=1):
                sentiment_output = gr.Textbox(
                    label=" –†–µ–∑—É–ª—å—Ç–∞—Ç –∞–Ω–∞–ª–∏–∑–∞",
                    value="–û–∂–∏–¥–∞–Ω–∏–µ –∞–Ω–∞–ª–∏–∑–∞...",
                    interactive=False,
                    elem_id="sentiment_output"
                )

        with gr.Row():
            analysis_output = gr.Markdown(
                label=" –î–µ—Ç–∞–ª—å–Ω—ã–π –∞–Ω–∞–ª–∏–∑",
                value="–ó–¥–µ—Å—å –ø–æ—è–≤–∏—Ç—Å—è –¥–µ—Ç–∞–ª—å–Ω—ã–π –∞–Ω–∞–ª–∏–∑...",
                elem_id="analysis_output"
            )

        easter_egg_image = gr.Image(
            label=" –°–µ–∫—Ä–µ—Ç–Ω–æ–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ",
            visible=False,
            height=300,
            elem_id="easter_egg"
        )

        def update_image_visibility(sentiment, analysis, image):
            return gr.update(visible=image is not None, value=image)

        submit_btn.click(
            fn=analyze_review,
            inputs=review_input,
            outputs=[sentiment_output, analysis_output, easter_egg_image]
        ).then(
            fn=update_image_visibility,
            inputs=[sentiment_output, analysis_output, easter_egg_image],
            outputs=easter_egg_image
        )

        review_input.submit(
            fn=analyze_review,
            inputs=review_input,
            outputs=[sentiment_output, analysis_output, easter_egg_image]
        ).then(
            fn=update_image_visibility,
            inputs=[sentiment_output, analysis_output, easter_egg_image],
            outputs=easter_egg_image
        )

        with gr.Accordion(" –ü–æ–¥—Ä–æ–±–Ω–æ—Å—Ç–∏ –æ —Å–∏—Å—Ç–µ–º–µ", open=False):
            if model is not None:
                model_accuracy_value = metadata.get('accuracy', 0.75) * 100
                vocab_size_value = metadata.get('vocab_size', len(vocab) if vocab else 0)
                dataset_size = metadata.get('dataset_size', 'N/A')

                gr.Markdown(f"""
                ###  –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –º–æ–¥–µ–ª–∏

                **–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞:** –î–≤—É–Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–Ω–∞—è LSTM –Ω–µ–π—Ä–æ–Ω–Ω–∞—è —Å–µ—Ç—å
                **–¢–æ—á–Ω–æ—Å—Ç—å –Ω–∞ —Ç–µ—Å—Ç–µ:** **{model_accuracy_value:.1f}%**
                **–†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è:** {vocab_size_value} —Å–ª–æ–≤
                **–†–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞:** {dataset_size} –ø—Ä–∏–º–µ—Ä–æ–≤
                **–ö–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏—è:** –ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π / –ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π / –ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π

                ###  –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö:
                - –ù–µ–≥–∞—Ç–∏–≤–Ω—ã—Ö: {metadata.get('class_distribution', {}).get('negative', 'N/A')}
                - –ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã—Ö: {metadata.get('class_distribution', {}).get('neutral', 'N/A')}
                - –ü–æ–∑–∏—Ç–∏–≤–Ω—ã—Ö: {metadata.get('class_distribution', {}).get('positive', 'N/A')}

                ###  –ú–µ—Ç–æ–¥ –∞–Ω–∞–ª–∏–∑–∞

                –°–∏—Å—Ç–µ–º–∞ –∏—Å–ø–æ–ª—å–∑—É–µ—Ç **–≥–∏–±—Ä–∏–¥–Ω—ã–π –ø–æ–¥—Ö–æ–¥**:
                1. **–ù–µ–π—Ä–æ–Ω–Ω–∞—è —Å–µ—Ç—å LSTM** –¥–ª—è —Å–ª–æ–∂–Ω—ã—Ö —Å–ª—É—á–∞–µ–≤
                2. **–ü—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –∞–Ω–∞–ª–∏–∑** –¥–ª—è —á–µ—Ç–∫–∏—Ö –ø–∞—Ç—Ç–µ—Ä–Ω–æ–≤
                3. **–ö–æ–Ω—Ç–µ–∫—Å—Ç–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞** —Å —É—á–µ—Ç–æ–º —É—Å–∏–ª–∏—Ç–µ–ª–µ–π –∏ –æ—Ç—Ä–∏—Ü–∞–Ω–∏–π

                ###  –ö–∞–∫ –∏–Ω—Ç–µ—Ä–ø—Ä–µ—Ç–∏—Ä–æ–≤–∞—Ç—å —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã:

                - **> 70% —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:** –ß–µ—Ç–∫–æ–µ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ
                - **50-70% —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:** –£–º–µ—Ä–µ–Ω–Ω–∞—è —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å
                - **< 50% —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:** –ù–µ–æ–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç

                ###  –û—Å–æ–±–µ–Ω–Ω–æ—Å—Ç—å

                –ü–æ–ø—Ä–æ–±—É–π—Ç–µ –≤–≤–µ—Å—Ç–∏ **"Ipynb"** –¥–ª—è –∞–∫—Ç–∏–≤–∞—Ü–∏–∏ –ø–∞—Å—Ö–∞–ª–∫–∏!
                """)
            else:
                gr.Markdown("""
                ###  –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ —Å–∏—Å—Ç–µ–º–µ

                **–¢–µ–∫—É—â–∏–π —Ä–µ–∂–∏–º:** –ü—Ä–∞–≤–∏–ª–æ-–æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–π –∞–Ω–∞–ª–∏–∑
                **–ö–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏—è:** –ù–µ–≥–∞—Ç–∏–≤–Ω—ã–π / –ù–µ–π—Ç—Ä–∞–ª—å–Ω—ã–π / –ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π
                **–ú–µ—Ç–æ–¥:** –†–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–µ —Ä–µ–≥—É–ª—è—Ä–Ω—ã–µ –≤—ã—Ä–∞–∂–µ–Ω–∏—è —Å –∫–æ–Ω—Ç–µ–∫—Å—Ç–Ω–æ–π –æ–±—Ä–∞–±–æ—Ç–∫–æ–π

                ###  –û—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏ –∞–Ω–∞–ª–∏–∑–∞:

                - –£—á–∏—Ç—ã–≤–∞—é—Ç—Å—è **—É—Å–∏–ª–∏—Ç–µ–ª–∏** (–æ—á–µ–Ω—å, –∫—Ä–∞–π–Ω–µ, –Ω–µ–≤–µ—Ä–æ—è—Ç–Ω–æ)
                - –û–±—Ä–∞–±–∞—Ç—ã–≤–∞—é—Ç—Å—è **–æ—Ç—Ä–∏—Ü–∞–Ω–∏—è** (–Ω–µ, –Ω–∏, –Ω–∏–∫–æ–≥–¥–∞)
                - –£—á–∏—Ç—ã–≤–∞–µ—Ç—Å—è **—ç–º–æ—Ü–∏–æ–Ω–∞–ª—å–Ω–æ—Å—Ç—å** —Ç–µ–∫—Å—Ç–∞
                - **–ö–æ–Ω—Ç–µ–∫—Å—Ç–Ω—ã–π –∞–Ω–∞–ª–∏–∑** —Å–ª–æ–≤–æ—Å–æ—á–µ—Ç–∞–Ω–∏–π

                ###  –û—Å–æ–±–µ–Ω–Ω–æ—Å—Ç—å

                –ü–æ–ø—Ä–æ–±—É–π—Ç–µ –≤–≤–µ—Å—Ç–∏ **"Ipynb"** –¥–ª—è –∞–∫—Ç–∏–≤–∞—Ü–∏–∏ –ø–∞—Å—Ö–∞–ª–∫–∏!
                """)

    return app

In [50]:
print("–°–æ–∑–¥–∞–Ω–∏–µ Gradio –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–∞...")
app = create_gradio_app_with_easter_egg()

print("\n" + "="*70)
print("–ó–ê–ü–£–°–ö –ü–†–ò–õ–û–ñ–ï–ù–ò–Ø")
print("="*70)
print("–ü—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –±—É–¥–µ—Ç –¥–æ—Å—Ç—É–ø–Ω–æ –ø–æ —Å—Å—ã–ª–∫–µ –Ω–∏–∂–µ")
print("(–∑–∞–≥—Ä—É–∑–∫–∞ –º–æ–∂–µ—Ç –∑–∞–Ω—è—Ç—å –Ω–µ—Å–∫–æ–ª—å–∫–æ —Å–µ–∫—É–Ω–¥)")
print("="*70 + "\n")

try:
    app.launch(share=True, debug=False, show_error=True)
except Exception as e:
    print(f"–ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–ø—É—Å—Ç–∏—Ç—å —Å share=True: {e}")
    print("–ü—Ä–æ–±—É–µ–º –∑–∞–ø—É—Å—Ç–∏—Ç—å –±–µ–∑ share...")
    app.launch(debug=False, show_error=True)

–°–æ–∑–¥–∞–Ω–∏–µ Gradio –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–∞...

–ó–ê–ü–£–°–ö –ü–†–ò–õ–û–ñ–ï–ù–ò–Ø
–ü—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –±—É–¥–µ—Ç –¥–æ—Å—Ç—É–ø–Ω–æ –ø–æ —Å—Å—ã–ª–∫–µ –Ω–∏–∂–µ
(–∑–∞–≥—Ä—É–∑–∫–∞ –º–æ–∂–µ—Ç –∑–∞–Ω—è—Ç—å –Ω–µ—Å–∫–æ–ª—å–∫–æ —Å–µ–∫—É–Ω–¥)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://57453115c059bb9e56.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
