In [8]:
import os, re, json
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict, Tuple, Sequence
from tqdm.auto import tqdm

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DATA_DIR = 'data'
RAW1 = os.path.join(DATA_DIR, 'raw_dataset.csv')
RAW2 = os.path.join(DATA_DIR, 'training.1600000.processed.noemoticon.csv')
PROCESSED = os.path.join(DATA_DIR, 'dataset_processed.csv')
TRAIN = os.path.join(DATA_DIR, 'train.csv')
VAL = os.path.join(DATA_DIR, 'val.csv')
TEST = os.path.join(DATA_DIR, 'test.csv')
VAL_RATIO = 0.1
TEST_RATIO = 0.1
MAX_VOCAB = 30000
SEQ_LEN = 16
BATCH_SIZE = 512
LR = 1e-3
EPOCHS = 1
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SPECIAL_TOKENS = ['<pad>', '<unk>', '<eos>']


In [None]:
# Загрузка/чистка и сплиты
os.makedirs(DATA_DIR, exist_ok=True)

# Поиск исходного CSV, если стандартные пути отсутствуют
def find_source_csv() -> str | None:
    candidates = []
    for root, _, files in os.walk(DATA_DIR):
        for fn in files:
            if not fn.lower().endswith('.csv'):
                continue
            full = os.path.join(root, fn)
            if os.path.basename(full) in {'dataset_processed.csv','train.csv','val.csv','test.csv'}:
                continue
            candidates.append((full, os.path.getsize(full)))
    if not candidates:
        return None
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates[0][0]

def nonempty_rows(path: str) -> int:
    try:
        dfh = pd.read_csv(path, nrows=5)
        return int(dfh.shape[0])
    except Exception:
        return 0

src_path = None
# Предпочитаем Kaggle-файл, если он есть
if os.path.exists(RAW2):
    src_path = RAW2
elif os.path.exists(RAW1) and nonempty_rows(RAW1) > 0:
    src_path = RAW1
else:
    src_path = find_source_csv()

if src_path is None:
    raise FileNotFoundError('Положите CSV с колонкой text в data/raw_dataset.csv или Kaggle CSV в data/training.1600000.processed.noemoticon.csv')

print('Source CSV:', src_path)

# Чтение исходного CSV
if os.path.basename(src_path).lower().startswith('training.1600000.processed.noemoticon'):
    df = pd.read_csv(src_path, encoding='latin-1', header=None)
    df = df[[5]].rename(columns={5:'text'})
else:
    try:
        df = pd.read_csv(src_path)
        if 'text' not in df.columns:
            tmp = pd.read_csv(src_path, header=None, encoding='latin-1')
            df = tmp[[5]].rename(columns={5:'text'})
    except Exception:
        tmp = pd.read_csv(src_path, header=None, encoding='latin-1')
        df = tmp[[5]].rename(columns={5:'text'})


def clean_text(t: str) -> str:
    t = str(t).lower()
    t = re.sub(r'https?://\\S+|www\\.\\S+', ' ', t)
    t = re.sub(r'@[\\w_]+', ' ', t)
    t = re.sub(r'\\s+', ' ', t).strip()
    return t

df['text'] = df['text'].map(clean_text)
df = df.dropna().drop_duplicates().reset_index(drop=True)
mask = df['text'].astype(str).str.len() > 0
df = df[mask].reset_index(drop=True)


MAX_ROWS = 1000
if MAX_ROWS:
    df = df.iloc[:MAX_ROWS]


df.to_csv(PROCESSED, index=False, encoding='utf-8')
print('Processed rows:', len(df))

rng = np.random.default_rng(SEED)
idx = rng.permutation(len(df))
n = len(df)
n_test = int(n * TEST_RATIO)
n_val = int(n * VAL_RATIO)
n_train = max(0, n - n_val - n_test)
train_idx = idx[:n_train]
val_idx = idx[n_train:n_train+n_val]
test_idx = idx[n_train+n_val:]

pd.DataFrame({'text': df.iloc[train_idx]['text']}).to_csv(TRAIN, index=False, encoding='utf-8')
pd.DataFrame({'text': df.iloc[val_idx]['text']}).to_csv(VAL, index=False, encoding='utf-8')
pd.DataFrame({'text': df.iloc[test_idx]['text']}).to_csv(TEST, index=False, encoding='utf-8')

print('Splits ->', 'train:', len(train_idx), 'val:', len(val_idx), 'test:', len(test_idx))
len(df), len(train_idx), len(val_idx), len(test_idx)


Source CSV: data\training.1600000.processed.noemoticon.csv
Processed rows: 1000
Splits -> train: 800 val: 100 test: 100


(1000, 800, 100, 100)

In [10]:
from collections import Counter

def tokenize(s: str) -> List[str]:
    return s.split()


texts = pd.read_csv(TRAIN)['text'].astype(str).tolist()
cnt = Counter()
for t in tqdm(texts, desc="vocab"):
    cnt.update(tokenize(t))

vocab = SPECIAL_TOKENS + [w for w,_ in cnt.most_common(MAX_VOCAB) if w not in SPECIAL_TOKENS]
vocab = vocab[:MAX_VOCAB]
stoi: Dict[str,int] = {w:i for i,w in enumerate(vocab)}
itos: Dict[int,str] = {i:w for w,i in stoi.items()}
PAD, UNK, EOS = stoi['<pad>'], stoi['<unk>'], stoi['<eos>']

def encode(s: str, add_eos: bool=True) -> List[int]:
    ids = [stoi.get(t, UNK) for t in tokenize(s)]
    if add_eos:
        ids.append(EOS)
    return ids

len(vocab), list(stoi)[:10]


vocab:   0%|          | 0/800 [00:00<?, ?it/s]

vocab: 100%|██████████| 800/800 [00:00<00:00, 177096.28it/s]


(3655, ['<pad>', '<unk>', '<eos>', 'i', 'to', 'the', 'my', 'a', 'and', 'is'])

In [11]:
class NextTokenDataset(Dataset):
    def __init__(self, sequences: Sequence[List[int]], seq_len: int):
        self.samples: List[Tuple[List[int], List[int]]] = []
        for s in sequences:
            if len(s) < 2: continue
            for st in range(0, max(0, len(s)-1)):
                ed = st + seq_len
                x = s[st:ed]
                y = s[st+1:ed+1]
                if len(x) < seq_len or len(y) < seq_len: break
                self.samples.append((x,y))
    def __len__(self): return len(self.samples)
    def __getitem__(self, idx):
        x,y = self.samples[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

def collate_batch(batch):
    xs, ys = zip(*batch)
    return torch.stack(xs,0), torch.stack(ys,0)

train_ids = [encode(s) for s in pd.read_csv(TRAIN)['text'].astype(str).tolist()]
val_ids = [encode(s) for s in pd.read_csv(VAL)['text'].astype(str).tolist()]

train_ds = NextTokenDataset(train_ids, SEQ_LEN)
val_ds = NextTokenDataset(val_ids, SEQ_LEN)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

len(train_ds), len(val_ds)


(1856, 236)

In [12]:
class LSTMLM(nn.Module):
    def __init__(self, vocab_size: int, emb: int=128, hid: int=64, layers: int=1, dropout: float=0.0):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb, padding_idx=PAD)
        self.rnn = nn.LSTM(input_size=emb, hidden_size=hid, num_layers=layers, dropout=dropout if layers>1 else 0.0, batch_first=True)
        self.proj = nn.Linear(hid, vocab_size)
    def forward(self, x):
        e = self.emb(x)
        o,_ = self.rnn(e)
        return self.proj(o)
    @torch.no_grad()
    def generate(self, prefix_ids: List[int], max_new: int=20, eos_id: int=EOS):
        self.eval()
        x = torch.tensor([prefix_ids], dtype=torch.long, device=DEVICE)
        out = x
        for _ in range(max_new):
            logits = self.forward(out)
            nxt = torch.argmax(logits[:,-1,:], dim=-1, keepdim=True)
            out = torch.cat([out, nxt], dim=1)
            if int(nxt.item()) == eos_id:
                break
        return out[0].tolist()

model = LSTMLM(len(vocab)).to(DEVICE)
opt = torch.optim.AdamW(model.parameters(), lr=LR)
crit = nn.CrossEntropyLoss(ignore_index=PAD)
model


LSTMLM(
  (emb): Embedding(3655, 128, padding_idx=0)
  (rnn): LSTM(128, 64, batch_first=True)
  (proj): Linear(in_features=64, out_features=3655, bias=True)
)

In [13]:
from typing import Tuple

def build_fixed_window_samples(seqs: Sequence[List[int]], seq_len: int) -> List[Tuple[List[int], List[int]]]:
    samples: List[Tuple[List[int], List[int]]] = []
    for s in tqdm(seqs, desc="windows"):
        if not s:
            continue
        x = s[:seq_len]
        y = s[1:seq_len+1]

        if len(x) < seq_len:
            x = x + [PAD] * (seq_len - len(x))

        if len(y) < seq_len:
            y = y + [EOS] + [PAD] * max(0, seq_len - len(y) - 1)
        samples.append((x, y))
    return samples

class FixedWindowDataset(Dataset):
    def __init__(self, samples: Sequence[Tuple[List[int], List[int]]]):
        self.samples = list(samples)
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

train_ids = [encode(s) for s in pd.read_csv(TRAIN)['text'].astype(str).tolist()]
val_ids = [encode(s) for s in pd.read_csv(VAL)['text'].astype(str).tolist()]

train_samples = build_fixed_window_samples(train_ids, SEQ_LEN)
val_samples = build_fixed_window_samples(val_ids, SEQ_LEN)

train_ds = FixedWindowDataset(train_samples)
val_ds = FixedWindowDataset(val_samples)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)

print('train samples:', len(train_ds), 'val samples:', len(val_ds))


windows: 100%|██████████| 800/800 [00:00<00:00, 400363.11it/s]
windows: 100%|██████████| 100/100 [00:00<00:00, 100007.25it/s]

train samples: 800 val samples: 100





In [14]:
import torch
import os, json
from math import inf

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(DEVICE);
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())

os.makedirs('models', exist_ok=True)

model = model.to(DEVICE)

def train_epoch(dl):
    model.train()
    total, n = 0.0, 0
    for xb, yb in tqdm(dl, total=len(dl), desc="train", leave=False):
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = crit(logits.view(-1, logits.size(-1)), yb.view(-1))
        loss.backward()
        opt.step()
        total += float(loss.item()); n += 1
    return total / max(1, n)

@torch.no_grad()
def val_loss(dl):
    model.eval()
    total, n = 0.0, 0
    for xb, yb in tqdm(dl, total=len(dl), desc="val", leave=False):
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        logits = model(xb)
        loss = crit(logits.view(-1, logits.size(-1)), yb.view(-1))
        total += float(loss.item()); n += 1
    return total / max(1, n)

best = inf
for epoch in range(1, EPOCHS+1):
    tr = train_epoch(train_dl)
    vl = val_loss(val_dl)
    print(f"epoch {epoch}/{EPOCHS} - train: {tr:.4f}  val: {vl:.4f}")
    if vl < best:
        best = vl
        torch.save(model.state_dict(), 'models/lstm_model.pt')
        with open('models/vocab.json','w',encoding='utf-8') as f:
            json.dump(stoi, f, ensure_ascii=False)

print("OK")

cuda
2.6.0+cu124
12.4
True


train:   0%|          | 0/2 [00:00<?, ?it/s]

                                                    

epoch 1/1 - train: 8.2025  val: 8.1779
OK




In [15]:
!pip -q install evaluate rouge_score
import evaluate

rouge = evaluate.load('rouge')

def ids_to_text(ids: List[int]) -> str:
    toks = []
    for i in ids:
        if i == EOS: break
        toks.append(itos.get(i,'<unk>'))
    return ' '.join(toks)

preds, refs = [], []
val_texts = pd.read_csv(VAL)['text'].astype(str).tolist()
for t in tqdm(val_texts[:1000], desc="eval-lstm"):
    ids = encode(t)
    if len(ids) < 4: continue
    cut = int(len(ids)*0.75)
    prefix, tail = ids[:cut], ids[cut:]
    gen = model.generate(prefix, max_new=20, eos_id=EOS)[len(prefix):]
    preds.append(ids_to_text(gen))
    refs.append(ids_to_text(tail))

scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
scores



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
eval-lstm: 100%|██████████| 100/100 [00:01<00:00, 63.84it/s]


{'rouge1': np.float64(0.003945197294950989),
 'rouge2': np.float64(0.0),
 'rougeL': np.float64(0.003916125837308103),
 'rougeLsum': np.float64(0.00394519729495099)}

In [16]:
!pip -q install transformers
from transformers import pipeline

gen = pipeline('text-generation', model='distilgpt2')

preds_t, refs_t = [], []
for t in tqdm(val_texts[:200], desc="eval-gpt2"):
    ws = t.split()
    if len(ws) < 4: continue
    cut = int(len(ws)*0.75)
    prefix = ' '.join(ws[:cut])
    ref = ' '.join(ws[cut:])
    out = gen(prefix, max_length=cut+20, do_sample=True, top_k=50, top_p=0.95)
    completion = out[0]['generated_text'][len(prefix):].strip()
    preds_t.append(completion)
    refs_t.append(ref)

scores_t = rouge.compute(predictions=preds_t, references=refs_t, use_stemmer=True)
scores_t



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
Device set to use cuda:0
eval-gpt2:   0%|          | 0/100 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=35) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
eval-gpt2:   1%|          | 1/100 [00:03<06:21,  3.85s/it]Setting `pad_token_id` to `eos_token_id`:50256 f

{'rouge1': np.float64(0.02971537197311072),
 'rouge2': np.float64(0.0017211784170904241),
 'rougeL': np.float64(0.028309593153499332),
 'rougeLsum': np.float64(0.028711034495299652)}

In [None]:
print("Samples LSTM:\n")
shown = 0
for t in val_texts[:10]:
    ws = t.split()
    if len(ws) < 4: continue
    cut = int(len(ws)*0.75)
    prefix = ' '.join(ws[:cut])
    ref = ' '.join(ws[cut:])
    ids = encode(prefix)
    gen_tail = model.generate(ids, max_new=20, eos_id=EOS)[len(ids):]
    pred = ' '.join([itos.get(i,'<unk>') for i in gen_tail if i!=EOS]).strip()
    print(f"prefix: {prefix}\nref:    {ref}\npred:   {pred}\n")
    shown += 1
    if shown >= 5: break

print("\nSamples DistilGPT2:\n")
from transformers import pipeline
pl = pipeline('text-generation', model='distilgpt2')
shown = 0
for t in val_texts[:10]:
    ws = t.split()
    if len(ws) < 4: continue
    cut = int(len(ws)*0.75)
    prefix = ' '.join(ws[:cut])
    ref = ' '.join(ws[cut:])
    out = pl(prefix, max_length=cut+20, do_sample=True, top_k=50, top_p=0.95)
    pred = out[0]['generated_text'][len(prefix):].strip()
    print(f"prefix: {prefix}\nref:    {ref}\npred:   {pred}\n")
    shown += 1
    if shown >= 5: break


На валидации простая LSTM на ограниченном сабсэмпле (≈1000 строк) ожидаемо показала низкие ROUGE. При этом предобученная DistilGPT2 стабильно выдаёт более осмысленные продолжения и выигрывает по метрикам. Для итогового использования в задаче автодополнения я выбираю DistilGPT2: качество выше «из коробки», а время инференса на GPU приемлемое.

Отдельно поясню выбор размера выборки у меня RTX 3050, и на полном датасете обучение и инференс идут очень медленно; ВМ, которую должны предоставлять, мне не допступна, пытался раз 6-7. Поэтому для проверки пайплайна и сравнения моделей я использовал выборку из 1000 строк этого достаточно для отладки, хотя и мало для высоких метрик.
