# This is an ML review rating model, so let's go!

In [None]:
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### Set up data reader

In [None]:
# Root folder of the dataset that is passed to the folder loader.
DATA_DIR = Path('../DataSets/dataset-LONG-films')

# Class folder name -> integer class id (ids follow the order listed here).
labels_map = {
    name: class_id
    for class_id, name in enumerate(('negative', 'neutral', 'positive'))
}


def load_data_from_folders(base_path: Path, mapping: dict) -> pd.DataFrame:
    """Read every ``*.txt`` file of each class folder into one DataFrame.

    Args:
        base_path: Directory that contains one sub-folder per class.
        mapping: Maps a sub-folder name to the integer label assigned to
            every file found directly inside that sub-folder.

    Returns:
        A DataFrame with the columns ``text`` (file contents) and ``label``
        (value taken from ``mapping``). Both columns exist even when no file
        was read, so callers can index them unconditionally.
    """
    data = []

    for folder_name, label_id in mapping.items():
        folder_path = base_path / folder_name

        # Sort so the row order does not depend on the file system's listing
        # order; seeded steps that run later then stay reproducible.
        files = sorted(folder_path.glob('*.txt'))

        print(f"Loading {folder_name}: found {len(files)} files")

        for file_path in tqdm(files, desc=folder_name):
            try:
                # errors='ignore' drops undecodable bytes instead of raising,
                # so only I/O failures are expected here.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
            except OSError as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Ошибка чтения файла {file_path}: {e}")
                continue
            data.append({'text': text, 'label': label_id})

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(data, columns=['text', 'label'])

##### Start it:

In [None]:
# Build the DataFrame from the class folders and report how many rows it has.
df_of_data_in_folders = load_data_from_folders(DATA_DIR, labels_map)
print(f"\nLoaded rows: {df_of_data_in_folders.shape[0]}")

##### Save it:

In [None]:
# Cache the loaded data on disk. index=False keeps the positional index out of
# the file; otherwise it is read back later as a spurious "Unnamed: 0" column.
df_of_data_in_folders.to_csv('data_in_folders.csv', index=False)

##### Load data:

In [None]:
# Reload the cached CSV and report its row count.
df_of_data_in_folders = pd.read_csv('data_in_folders.csv')
print(f"\nLoaded rows: {df_of_data_in_folders.shape[0]}")

##### Remove too-long data

In [None]:
# Word count per review: cast to str, split on whitespace, count the pieces.
df_of_data_in_folders['word_count'] = (
    df_of_data_in_folders['text'].astype(str).str.split().str.len()
)

# Compute both masks once: rows to keep (<= 256 words) and rows to drop.
keep_mask = df_of_data_in_folders['word_count'] <= 256
too_long_mask = df_of_data_in_folders['word_count'] > 256

initial_count = len(df_of_data_in_folders)
df_filtered = df_of_data_in_folders[keep_mask].copy()
removed_count = initial_count - len(df_filtered)

# Report what the filter did.
print(f"Before was rows: {initial_count}")
print(f"Deleted rows (>256 words): {removed_count}")
print(f"Now rows: {len(df_filtered)}")

# Show a few of the dropped rows, if there are any.
removed = df_of_data_in_folders[too_long_mask]
if len(removed) > 0:
    print("\nExamples of deleted rows:")
    print(removed[['text', 'word_count', 'label']].head())

# The word count was only needed for filtering; drop it before saving.
df_filtered = df_filtered.drop(columns='word_count')

# Persist the filtered dataset without the index column.
df_filtered.to_csv('filtered_dataset.csv', index=False)



##### Load filtered data

In [None]:
# Reload the dataset written by the length filter; from this point on the name
# refers to the filtered rows read from 'filtered_dataset.csv'.
df_of_data_in_folders = pd.read_csv('filtered_dataset.csv')


##### Split data:

In [None]:
# Stratified 80/20 split with a fixed seed: the label column drives the
# stratification and random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df_of_data_in_folders,
    test_size=0.2,
    stratify=df_of_data_in_folders['label'],
    random_state=42,
)

print(f"Train data: {train_df.shape[0]}")
print(f"test data: {test_df.shape[0]}")

##### Add dataloader + built-in tokenizer 

In [None]:
class BertTextDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        dataframe: Table with a ``text`` column and a ``label`` column.
        tokenizer: Callable tokenizer (Hugging Face style). It is called with
            the text plus padding/truncation options and must return a
            mapping whose ``'input_ids'`` entry is a tensor.
        max_len: Value passed to the tokenizer as ``max_length``.

    Each item is a dict with:
        ``input_ids``: the tokenizer's ``input_ids`` flattened to 1-D.
        ``label``: 0-D ``torch.long`` tensor holding the class id.
    """

    def __init__(self, dataframe, tokenizer, max_len=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        raw_text = self.data['text'].iloc[idx]
        # An empty review comes back from CSV as NaN; tokenize it as an empty
        # string instead of letting str() turn it into the literal word "nan".
        text = '' if pd.isna(raw_text) else str(raw_text)
        label = int(self.data['label'].iloc[idx])

        # Call the tokenizer directly: encode_plus is deprecated in favour of
        # __call__. The attention mask is not requested because it is not
        # part of the returned item.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,  # add [CLS] / [SEP]
            max_length=self.max_len,
            padding='max_length',
            truncation=True,          # cut inputs longer than max_length
            return_attention_mask=False,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension; flatten drops it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }


# Multilingual tokenizer. Only AutoTokenizer appears in the visible imports
# (BertTokenizer is not imported, so referencing it raised NameError);
# AutoTokenizer resolves the tokenizer class that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

# Wrap both splits so each item is tokenized on access.
train_dataset = BertTextDataset(train_df, tokenizer)
test_dataset = BertTextDataset(test_df, tokenizer)

# Batches of 32; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Setup of model

In [None]:
class TextCNN(nn.Module):
    """Convolutional text classifier with several parallel kernel widths.

    Token ids are embedded, every ``Conv1d`` in ``self.convs`` runs over the
    sequence with its own kernel size, each feature map is ReLU-activated and
    max-pooled over its full length, and the concatenated pooled features go
    through dropout and a linear layer that yields ``output_dim`` scores.
    """

    def __init__(self, vocab_size, embed_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        # Embedding layer: token index -> dense vector of size embed_dim.
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # One convolution per kernel width in filter_sizes (e.g. [2, 3, 4]),
        # each producing n_filters output channels.
        branches = []
        for kernel_width in filter_sizes:
            branches.append(
                nn.Conv1d(
                    in_channels=embed_dim,
                    out_channels=n_filters,
                    kernel_size=kernel_width,
                )
            )
        self.convs = nn.ModuleList(branches)

        # Classifier input: n_filters pooled features from every branch.
        pooled_width = n_filters * len(filter_sizes)
        self.fc = nn.Linear(pooled_width, output_dim)

        # Dropout for regularization.
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: [batch size, sent len]
        vectors = self.embedding(text)
        # vectors: [batch size, sent len, emb dim]

        # Conv1d expects the channel dimension second.
        vectors = vectors.transpose(1, 2)
        # vectors: [batch size, emb dim, sent len]

        pooled = []
        for conv in self.convs:
            # feature_map: [batch size, n_filters, sent len - kernel size + 1]
            feature_map = F.relu(conv(vectors))
            # Max over the whole remaining length, then drop that axis.
            strongest = F.max_pool1d(feature_map, feature_map.size(2))
            pooled.append(strongest.squeeze(2))
            # pooled entry: [batch size, n_filters]

        # features: [batch size, n_filters * len(filter_sizes)]
        features = torch.cat(pooled, dim=1)

        # Dropout, then the final class scores.
        return self.fc(self.dropout(features))

# Hyperparameters
INPUT_DIM = tokenizer.vocab_size  # embedding rows: one per vocabulary entry
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]  # a list of kernel widths, one conv branch each
OUTPUT_DIM = 3
DROPOUT = 0.5

# Build the model and move it to the selected device.
model = TextCNN(
    vocab_size=INPUT_DIM,
    embed_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
).to(device)

# Adam over all model parameters; cross-entropy loss for classification.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss().to(device)

# Print the total number of parameters.
print(f"Параметры: {sum(map(torch.numel, model.parameters())):,}")

In [None]:
# Show the vocabulary size of the current tokenizer (the value used as INPUT_DIM).
print(tokenizer.vocab_size)

In [None]:
# Switch to the DistilBERT (uncased) tokenizer, presumably to get a smaller
# vocabulary and therefore a smaller embedding table.
# NOTE(review): this checkpoint is English and lower-cased; confirm it suits
# the language of the reviews.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
print(f"Vocab: {tokenizer.vocab_size:,}")  # 30,522 for this checkpoint

# Token ids must index the new embedding table, so re-tokenize both splits
# with the new tokenizer instead of reusing loaders built with the old one
# (ids from a larger vocabulary would fall outside the embedding).
train_dataset = BertTextDataset(train_df, tokenizer)
test_dataset = BertTextDataset(test_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

INPUT_DIM = tokenizer.vocab_size
# The replacement model has to live on the same device as the batches.
model = TextCNN(INPUT_DIM, 100, 100, [2, 3, 4], 3, 0.5).to(device)

# An optimizer keeps references to the parameters it was built with, so it
# must be rebuilt for the new model; otherwise it would update the old one.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# About 3.1M parameters with a 30,522-token vocabulary (mostly the embedding).
print(f"Параметры: {sum(p.numel() for p in model.parameters()):,} ")
