In [None]:
import pandas as pd

# Load the training table; later cells read its `input_text` and `labels` columns.
data = pd.read_csv('/kaggle/input/train-data/train_data.csv')

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Display the last rows of the frame as a quick sanity check.
data.tail()

In [None]:
import torch
import ast
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Normalise the text column in a single chain: cast to str, trim both ends,
# turn non-breaking spaces into plain spaces, then delete every whitespace run
# so each row becomes one unbroken character sequence.
# NOTE(review): astype(str) turns missing values into the literal 'nan' - confirm
# the column has no NaNs.
data['input_text'] = (
    data['input_text']
    .astype(str)
    .str.strip()
    .str.replace('\xa0', ' ')
    .str.replace(r'\s+', '', regex=True)
)

# Character vocabulary: ids 0 and 1 are reserved for padding and unknown
# characters; every distinct character seen in the data gets the next ids in
# sorted order.
all_chars = sorted(set("".join(data['input_text'].tolist())))
char2idx = {"<PAD>": 0, "<UNK>": 1}
char2idx.update({ch: idx for idx, ch in enumerate(all_chars, start=2)})

# Ids are contiguous from 0, so the embedding table needs one row per entry.
vocab_size = len(char2idx)

In [None]:
def _to_int_list(value):
    """Coerce a number or a list/tuple of int-like items to a list of ints, else None."""
    if isinstance(value, (int, float)):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return None
    try:
        return [int(item) for item in value]
    except (TypeError, ValueError, OverflowError):
        # Nested containers, non-numeric strings, NaN and infinities all end up here.
        return None


def parse_labels(x):
    """Parse one raw ``labels`` cell into a flat list of integer labels.

    Accepted inputs:
      * a list - returned unchanged;
      * a tuple - converted to a list of ints;
      * a single int/float - wrapped in a one-element list;
      * a string holding a Python literal, e.g. ``"[0, 1, 0]"``, ``"0,1,0"`` or ``"1"``;
      * a string of ``0``/``1`` tokens separated by commas and/or whitespace
        (an empty string yields an empty list).

    Returns:
        list | None: the labels, or ``None`` for missing values (None, NaN) and
        for anything that cannot be read as a sequence of integers. The result
        is never a bare scalar, tuple, dict or string, so it can always be
        turned into a 1-D tensor.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return _to_int_list(x)

    if isinstance(x, str):
        stripped = x.strip()
        # Second candidate: the same text with embedded line breaks removed.
        compact = stripped.replace('\n', '').replace('\r', '').strip()
        for candidate in (stripped, compact):
            try:
                parsed = ast.literal_eval(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # These are the errors literal_eval raises for malformed input.
                continue
            labels = _to_int_list(parsed)
            if labels is not None:
                return labels

        # Last resort: bare 0/1 tokens separated by commas and/or whitespace.
        tokens = compact.replace(',', ' ').split()
        if all(tok in {'0', '1'} for tok in tokens):
            return [int(tok) for tok in tokens]
        return None

    if isinstance(x, (int, float)):
        # NaN and infinities cannot be converted to int and therefore map to None.
        return _to_int_list(x)

    return None

# Parse every raw `labels` cell; cells that parse_labels cannot interpret become None.
data['labels_parsed'] = data['labels'].apply(parse_labels)

In [None]:
class TextDataset(Dataset):
    """Character-level dataset yielding ``(token_ids, labels)`` tensor pairs.

    Args:
        df: frame with an ``input_text`` column (strings) and a
            ``labels_parsed`` column (one label sequence per row).
        char2idx: mapping from character to integer id; must contain ``'<UNK>'``,
            which is used for characters missing from the mapping.

    Rows whose ``labels_parsed`` value is missing (``None`` or NaN) cannot be
    converted to a tensor, so they are skipped at construction time and a
    warning reports how many were dropped. ``len(dataset)`` counts only the
    rows that were kept.
    """

    def __init__(self, df, char2idx):
        import warnings

        kept, dropped = [], 0
        for text, labels in zip(df['input_text'].tolist(), df['labels_parsed'].tolist()):
            # `labels != labels` is true only for NaN.
            if labels is None or (isinstance(labels, float) and labels != labels):
                dropped += 1
            else:
                kept.append((text, labels))
        if dropped:
            warnings.warn(f"TextDataset: skipped {dropped} row(s) with missing labels")

        self.rows = kept
        self.char2idx = char2idx

    def __len__(self):
        """Number of usable (text, labels) rows."""
        return len(self.rows)

    def __getitem__(self, idx):
        """Return ``(x, y)``: int64 character ids and int64 labels for row ``idx``."""
        text, labels = self.rows[idx]
        unk_id = self.char2idx['<UNK>']
        x = torch.tensor([self.char2idx.get(ch, unk_id) for ch in text], dtype=torch.long)
        y = torch.tensor(labels, dtype=torch.long)
        return x, y

def collate_fn(batch, pad_idx=None, label_pad=-100):
    """Pad a list of ``(x, y)`` 1-D int64 tensor pairs into two stacked tensors.

    Every sequence is right-padded to the length of the longest ``x`` in the
    batch.

    Args:
        batch: iterable of ``(x, y)`` pairs as produced by the dataset.
        pad_idx: id used to pad ``x``. When omitted, the notebook-level
            ``char2idx["<PAD>"]`` is used, which is what a DataLoader gets when
            it calls ``collate_fn(batch)``.
        label_pad: value used to pad ``y``; the default -100 matches the
            ``ignore_index`` the loss in this notebook is built with.

    Returns:
        tuple: ``(xs, ys)``, both of shape ``(len(batch), max_len)``.
    """
    if pad_idx is None:
        pad_idx = char2idx["<PAD>"]

    xs, ys = zip(*batch)
    max_len = max(len(x) for x in xs)

    def _right_pad(seq, fill):
        # Append `fill` until the sequence reaches the batch-wide length.
        filler = torch.full((max_len - len(seq),), fill, dtype=torch.long)
        return torch.cat([seq, filler])

    xs_pad = torch.stack([_right_pad(x, pad_idx) for x in xs])
    ys_pad = torch.stack([_right_pad(y, label_pad) for y in ys])
    return xs_pad, ys_pad

In [None]:
# Fixed seed so the same rows land in the validation set on every
# Restart & Run All; without it validation scores are not comparable across runs.
SPLIT_SEED = 42
BATCH_SIZE = 16

# 90% of the rows are used for training, the remainder for validation.
train_size = int(0.9 * len(data))
val_size = len(data) - train_size

# random_split only needs len(data); the returned Subset objects carry the
# sampled row positions in `.indices`.
train_df, val_df = random_split(
    data,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Materialise each subset as a DataFrame slice for TextDataset.
train_dataset = TextDataset(data.iloc[train_df.indices], char2idx)
val_dataset = TextDataset(data.iloc[val_df.indices], char2idx)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM tagger: one ``num_classes``-way logit vector per position.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size.
        hidden_dim: LSTM hidden size per direction.
        num_classes: number of output classes per position.
        pad_idx: id of the padding token. Positions equal to ``pad_idx`` are
            assumed to appear only as right-padding.

    Padded batches are packed before the LSTM, so the backward direction starts
    at each sequence's last real token instead of first consuming PAD steps.
    The logits for a sequence are therefore the same whether it is run alone
    or inside a padded batch.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_classes=2, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        """Map ids of shape ``(batch, seq)`` to logits of shape ``(batch, seq, num_classes)``."""
        emb = self.embedding(x)

        if self.pad_idx is None or x.size(1) == 0:
            # No padding id to measure lengths with (or nothing to pack):
            # run the LSTM on the raw batch.
            out, _ = self.lstm(emb)
            return self.fc(out)

        # True length of every sequence. Packing requires lengths >= 1 on the
        # CPU, so an all-PAD row is treated as length 1.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        # total_length keeps the output aligned with the padded label tensor.
        out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=x.size(1)
        )
        logits = self.fc(out)
        return logits

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Character tagger with two output classes per position; the padding id is
# taken from the vocabulary so the embedding layer and the batches agree.
model = BiLSTM(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=256,
    num_classes=2,
    pad_idx=char2idx["<PAD>"],
)
model = model.to(device)

# Label positions equal to -100 are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

In [None]:
def compute_accuracy(preds, labels):
    """Fraction of non-ignored positions whose arg-max prediction equals the label.

    Args:
        preds: logits whose last dimension indexes the classes.
        labels: integer labels with the same leading shape as ``preds`` minus
            the class dimension; positions equal to -100 are ignored.

    Returns:
        float: accuracy over the non-ignored positions, or 0.0 when every
        position is ignored (instead of raising ZeroDivisionError).
    """
    mask = labels != -100
    n_valid = mask.sum().item()
    if n_valid == 0:
        return 0.0
    n_correct = ((preds.argmax(-1) == labels) & mask).sum().item()
    return n_correct / n_valid

NUM_EPOCHS = 10


def _run_epoch(loader, train):
    """Run one pass over ``loader`` and return token-level ``(mean_loss, accuracy)``.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. When ``train`` is true the model is put in training mode and
    the weights are updated; otherwise it is put in eval mode and gradients
    are disabled.

    Both metrics are accumulated per non-ignored label position (label != -100),
    so batches with more characters weigh proportionally more. Weighting each
    batch's per-token mean by its number of sequences would bias the result.
    """
    model.train(train)
    loss_sum, n_correct, n_tokens = 0.0, 0, 0

    with torch.set_grad_enabled(train):
        for x_batch, y_batch in loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            valid = y_batch != -100
            batch_tokens = valid.sum().item()
            if batch_tokens == 0:
                # A mean loss over zero positions is NaN; back-propagating it
                # would corrupt the weights, so skip the batch.
                continue

            if train:
                optimizer.zero_grad()
            logits = model(x_batch)
            # Flatten to (positions, classes) without hard-coding the class count.
            loss = criterion(logits.reshape(-1, logits.size(-1)), y_batch.reshape(-1))
            if train:
                loss.backward()
                optimizer.step()

            # The criterion averages over non-ignored positions; undo the mean
            # so that the epoch total is a sum over tokens.
            loss_sum += loss.item() * batch_tokens
            n_correct += ((logits.argmax(-1) == y_batch) & valid).sum().item()
            n_tokens += batch_tokens

    if n_tokens == 0:
        return float("nan"), float("nan")
    return loss_sum / n_tokens, n_correct / n_tokens


for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = _run_epoch(train_loader, train=True)
    print(f"Epoch {epoch+1} | Train loss: {train_loss:.4f}, acc: {train_acc:.4f}")

    val_loss, val_acc = _run_epoch(val_loader, train=False)
    print(f"Val loss: {val_loss:.4f}, acc: {val_acc:.4f}")


In [None]:
def predict_spaces(model, text, char2idx):
    """Return the character indices of ``text`` that the model labels as class 1.

    Args:
        model: module mapping ids of shape ``(1, len(text))`` to logits of
            shape ``(1, len(text), num_classes)``; it is switched to eval mode.
        text: input string; characters missing from ``char2idx`` use ``'<UNK>'``.
        char2idx: character-to-id mapping containing ``'<UNK>'``.

    Returns:
        list[int]: positions whose arg-max class is 1. Empty for empty text,
        which is never sent to the model because an LSTM rejects zero-length
        sequences.
    """
    model.eval()
    if not text:
        return []

    # Run on whatever device the model's weights live on rather than relying
    # on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    unk_id = char2idx['<UNK>']
    ids = [char2idx.get(ch, unk_id) for ch in text]

    with torch.no_grad():
        x = torch.tensor(ids, dtype=torch.long, device=target_device).unsqueeze(0)
        logits = model(x)
        preds = logits.argmax(-1).squeeze(0).cpu().tolist()

    positions = [i for i, v in enumerate(preds) if v == 1]
    return positions

# Read the test file by hand: each data row is "<id>,<text>", and only the
# FIRST comma separates the two fields, so the text may itself contain commas.
lines = []
with open('/kaggle/input/avito-ds-internship-2025/dataset_1937770_3.txt', 'r', encoding='utf-8') as f:
    next(f)  # discard the first line (presumably the header row)
    for raw in f:
        id_, sep, text_no_spaces = raw.strip().partition(',')
        if not sep:
            # Blank line or a line without any comma: nothing to parse.
            continue
        lines.append((int(id_), text_no_spaces))

test = pd.DataFrame(lines, columns=['id', 'text_no_spaces'])

# One list of predicted positions per test sentence, in row order.
all_positions = [
    predict_spaces(model, sentence, char2idx)
    for sentence in test['text_no_spaces']
]

test['predicted_positions'] = all_positions
test.head()

In [None]:
# Display the last 15 rows of the prediction table.
test.tail(15)