# This is an ML review-rating model, so let's go!

In [1]:
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from transformers import BertTokenizer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

  from .autonotebook import tqdm as notebook_tqdm


### Set up data reader

In [None]:
# Root directory that holds one sub-folder per sentiment class.
DATA_DIR = Path('../DataSets/dataset-LONG-films')

# Folder name -> integer class id, numbered in the order the names are listed.
labels_map = {
    folder_name: class_id
    for class_id, folder_name in enumerate(('negative', 'neutral', 'positive'))
}


def load_data_from_folders(base_path, mapping):
    """Read every ``*.txt`` file of the labelled sub-folders into a DataFrame.

    Args:
        base_path: directory that contains one sub-folder per class
            (``pathlib.Path`` or ``str``).
        mapping: dict of ``{folder_name: label_id}``; folders are visited in
            the dict's iteration order.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``label``, one row per
        file that could be read. The two columns exist even when no file was
        found, so downstream column access does not fail on an empty result.
    """
    base_path = Path(base_path)
    records = []

    for folder_name, label_id in mapping.items():
        folder_path = base_path / folder_name

        # glob() yields files in filesystem order, which differs between
        # machines; sorting keeps the row order (and therefore any seeded
        # split of the resulting frame) reproducible. A missing folder is
        # treated as an empty one.
        files = sorted(folder_path.glob('*.txt')) if folder_path.is_dir() else []

        print(f"Loading {folder_name}: found {len(files)} files")

        for file_path in tqdm(files, desc=folder_name):
            try:
                # errors='ignore' drops undecodable bytes instead of raising,
                # so only OS-level failures (permissions, I/O) can occur here.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
            except OSError as e:
                # Best effort: report the file and carry on with the rest.
                print(f"Ошибка чтения файла {file_path}: {e}")
                continue
            records.append({'text': text, 'label': label_id})

    return pd.DataFrame(records, columns=['text', 'label'])

##### Start it:

In [None]:
# Walk every class folder under DATA_DIR and gather the labelled texts.
df_of_data_in_folders = load_data_from_folders(base_path=DATA_DIR, mapping=labels_map)
print(f"\nLoaded rows: {len(df_of_data_in_folders)}")

##### Save it:

In [None]:
# Cache the raw frame. index=False keeps the positional index out of the file;
# otherwise it is read back as a spurious 'Unnamed: 0' column.
df_of_data_in_folders.to_csv('data_in_folders.csv', index=False)

##### Load data:

In [2]:
# Read only the two columns the pipeline uses: the cache may have been written
# together with the DataFrame index, which would otherwise come back as an
# extra 'Unnamed: 0' column and be carried into every later CSV.
df_of_data_in_folders = pd.read_csv('data_in_folders.csv', usecols=['text', 'label'])
print(f"\nLoaded rows: {len(df_of_data_in_folders)}")


Loaded rows: 131669


##### Remove too-long data

In [None]:
# Whitespace-separated word count of every text (helper column)
df_of_data_in_folders['word_count'] = (
    df_of_data_in_folders['text']
    .astype(str)
    .str.split()
    .str.len()
)

# Split the rows into "too long" (> 256 words) and "kept" (<= 256 words)
initial_count = len(df_of_data_in_folders)
is_too_long = df_of_data_in_folders['word_count'] > 256
df_filtered = df_of_data_in_folders[~is_too_long].copy()
removed = df_of_data_in_folders[is_too_long]

# Report how many rows were dropped
removed_count = len(removed)
print(f"Before was rows: {initial_count}")
print(f"Deleted rows (>256 words): {removed_count}")
print(f"Now rows: {len(df_filtered)}")

# Preview a few of the dropped rows, if there are any
if removed_count > 0:
    print("\nExamples of deleted rows:")
    print(removed.loc[:, ['text', 'word_count', 'label']].head())

# The helper column is not part of the saved dataset
df_filtered = df_filtered.drop(columns='word_count')

# Persist the filtered rows without the index
df_filtered.to_csv('filtered_dataset.csv', index=False)



Исходно строк: 131669
Удалено строк (>256 слов): 75902
Осталось строк: 55767

Примеры удаленных строк:
                                                text  word_count  label
0  Что же написать о фильме? По прошествии 2 лет ...         373      0
1  Очень не хотелось писать рецензию по сериалам,...         290      0
3  Вам что-то говорит имя Александр Уитт? Мне нет...         463      0
4  После первых двух частей чего-то от третьей я ...         270      0
5  Тамай Сиина – обычная двенадцатилетняя девочка...         924      0

Фильтрация завершена. df_filtered содержит отфильтрованные данные.


##### Load filtered data

In [5]:
# From here on the working frame is the length-filtered dataset read from disk.
df_of_data_in_folders = pd.read_csv(filepath_or_buffer='filtered_dataset.csv')


##### Split data:

In [7]:
# 80/20 split with a fixed seed; stratifying on the label keeps the class
# proportions the same in both parts.
train_df, test_df = train_test_split(
    df_of_data_in_folders,
    stratify=df_of_data_in_folders['label'],
    test_size=0.2,
    random_state=42,
)

print(f"Train data: {len(train_df)}")
print(f"test data: {len(test_df)}")

Train data: 44613
test data: 11154


##### Add dataloader + built-in tokenizer 

In [8]:
class BertTextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        dataframe: table with a ``text`` column and an integer ``label``
            column. Rows are addressed by position (``iloc``), so the frame's
            index does not need to be contiguous.
        tokenizer: callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: every sequence is truncated or padded to exactly this many
            tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th row.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` and a
            scalar ``label`` tensor of dtype ``torch.long``.
        """
        row = self.data.iloc[idx]
        text = str(row['text'])
        label = int(row['label'])

        # Calling the tokenizer directly replaces the legacy encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,  # Add [CLS], [SEP]
            max_length=self.max_len,
            padding='max_length',
            truncation=True,          # Cut if longer than max_length
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading dimension; flatten() removes it so
        # the DataLoader can stack items into a batch.
        # The attention mask is returned as well: every item is padded to
        # max_len, and the mask is what tells real tokens apart from padding.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }


# Pretrained multilingual, case-preserving BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Wrap both splits in the tokenizing dataset (default max_len)
train_dataset, test_dataset = (
    BertTextDataset(split_df, tokenizer) for split_df in (train_df, test_df)
)

# Batches of 32: training data is reshuffled, test data keeps its order
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)