In [None]:
# Install the CRF dependency into the environment of the running kernel.
# `%pip` (rather than a bare `pip`) does not depend on automagic being enabled.
%pip install -q sklearn_crfsuite

In [None]:
import pandas as pd
# from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split



In [None]:
# Location of the training CSV; the file has no header row, so pandas numbers
# the columns 0, 1, ...
TRAIN_CSV_PATH = "/content/drive/MyDrive/Datasets and study/data_For_camp/morpheme_train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH, header=None)


In [None]:
# Column 0 becomes `words`, column 1 becomes `labels`; both are turned into
# plain Python lists.
words, labels = (train_data[column].tolist() for column in (0, 1))
def word_to_features(word):
    """Build one CRF feature dict per character of ``word``.

    Each character is described by a constant bias, the character itself and
    its left/right neighbours (``"<BOS>"`` / ``"<EOS>"`` at the word edges).
    String-valued features are used so that every (name, value) pair becomes
    its own indicator feature instead of the character position being used as
    a feature weight.

    Args:
        word: the word as a string (or any sequence of characters).

    Returns:
        A list with ``len(word)`` dicts, in character order. Empty input
        yields an empty list.
    """
    last = len(word) - 1
    features = []
    for i, char in enumerate(word):
        features.append({
            "bias": 1.0,
            "char": char,
            "prev": word[i - 1] if i > 0 else "<BOS>",
            "next": word[i + 1] if i < last else "<EOS>",
        })
    return features

# X: every word as a list of its characters.
X = [list(word) for word in words]

# y: every markup string split into its "/"-separated segments.
y = [label.split("/") for label in labels]

# Y: one tag per character. Each segment is split into the text before the
# colon and the tag after it, and the tag is repeated once per character of
# that text.
# NOTE(review): this assumes segments look like "<morpheme>:<TAG>" - confirm
# against the training file.
Y = []
for segments in y:
    char_tags = []
    for segment in segments:
        # rsplit keeps a colon that is part of the morpheme text attached to it.
        morpheme, tag = segment.rsplit(":", 1)
        char_tags.extend([tag] * len(morpheme))
    Y.append(char_tags)


In [None]:
# The import in the first code cell is commented out, so CRF is imported here;
# otherwise this cell fails with a NameError on a fresh kernel.
from sklearn_crfsuite import CRF

# Train on the same feature representation that is used at prediction time
# (word_to_features), not on the raw character lists in X.
X_crf = [word_to_features(word) for word in words]

crf = CRF(algorithm='l2sgd', max_iterations=100000)
crf.fit(X_crf, Y)


In [None]:
def convert_morpheme_format(word, labels):
    """Render per-character tags as one submission line.

    Consecutive characters that share a tag are merged into one segment, and
    the segments are written as ``chars:TAG`` joined by ``/``.

    Args:
        word: the word as a string.
        labels: one tag per character of ``word``. Extra trailing labels are
            ignored.

    Returns:
        ``"<word>,<seg>:<TAG>/<seg>:<TAG>..."``. For an empty word the part
        after the comma is empty.

    Raises:
        ValueError: if there are fewer labels than characters, because the
            resulting segments would not cover the whole word.
    """
    labels = list(labels)
    if len(labels) < len(word):
        raise ValueError(
            f"expected at least {len(word)} labels for {word!r}, got {len(labels)}"
        )

    # Each entry is [characters collected so far, tag of the segment].
    segments = []
    for char, label in zip(word, labels):
        if segments and segments[-1][1] == label:
            segments[-1][0] += char
        else:
            segments.append([char, label])

    morphemes = "/".join(f"{chars}:{label}" for chars, label in segments)
    return f"{word},{morphemes}"

In [None]:

# Load the test words (first column of a header-less CSV) and tag them with
# the fitted CRF, using the same feature extractor as for training input.
test_data = pd.read_csv("/content/drive/MyDrive/Datasets and study/data_For_camp/morpheme_test.csv", header = None)
test_words = test_data[0].tolist()
X_test = [word_to_features(word) for word in test_words]
predicted = crf.predict(X_test)

# The words are Cyrillic, so the encoding is fixed explicitly instead of
# depending on the platform default.
with open("submission.csv", "w", encoding="utf-8") as f:
    for word, morphemes in zip(test_words, predicted):
        f.write(convert_morpheme_format(word, morphemes))
        f.write("\n")

# Второй вариант

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# The tag list is spelled out here because `tag2idx` is only defined in a
# later cell; referencing it at this point fails on a fresh kernel.
# NOTE(review): the BiLSTM section below uses seven tags - confirm that four
# is intended for this variant.
bert_tags = ["PREF", "ROOT", "SUFF", "END"]

# NOTE(review): no fine-tuning step is visible in this notebook, so the
# classification head is presumably untrained - confirm before relying on
# its predictions.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(bert_tags),
)


In [None]:
# Character-level input is achieved at call time by passing list(word) together
# with is_split_into_words=True. The former `do_word_tokenize` /
# `tokenize_chars` keyword arguments are not options of the BERT tokenizer and
# were dropped because they suggested behaviour that does not exist.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [None]:

# Tag -> index mapping for the four-tag setup, plus its inverse derived from it
# so the two tables cannot drift apart.
tag2idx = dict(zip(("PREF", "ROOT", "SUFF", "END"), range(4)))
idx2tag = {index: tag for tag, index in tag2idx.items()}

In [None]:
# Tag a single example word, feeding it to the tokenizer as a list of characters.
word = "отражение"
inputs = tokenizer(list(word), return_tensors="pt", is_split_into_words=True)
outputs = model(**inputs)
predicted_ids = outputs.logits.argmax(-1)[0]

# The model emits one prediction per *token*, including the special tokens the
# tokenizer adds. word_ids() gives, for every token, the index of the input
# character it belongs to (None for special tokens); only the first token of
# each character is kept so that predicted_tags lines up with `word`.
predicted_tags = []
seen_chars = set()
for token_pos, char_pos in enumerate(inputs.word_ids(batch_index=0)):
    if char_pos is None or char_pos in seen_chars:
        continue
    seen_chars.add(char_pos)
    predicted_tags.append(idx2tag[predicted_ids[token_pos].item()])


# Третий вариант

In [None]:
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn


In [None]:
class BiLSTMTagger(nn.Module):
    """Character tagger: embedding -> bidirectional LSTM -> linear tag scores.

    Args:
        vocab_size: number of rows in the embedding table; index 0 is the
            padding index.
        embed_dim: size of each character embedding.
        hidden_dim: total width of the LSTM output; each direction gets
            ``hidden_dim // 2`` units.
        num_tags: number of output scores per position.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_tags):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(embed_dim, per_direction,
                            bidirectional=True, batch_first=True)
        # The LSTM emits 2 * per_direction features, which is hidden_dim - 1
        # for an odd hidden_dim; size the projection from the real width.
        self.fc = nn.Linear(2 * per_direction, num_tags)

    def forward(self, x):
        """Return tag scores for a batch-first tensor of character indices.

        The result has one score vector of length ``num_tags`` per input
        position.
        """
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [None]:

# Character inventory of the training words. `words` is used instead of
# `train_data[0]` because `train_data` is rebound to the train split further
# down, which makes this cell fail when it is re-run afterwards.
chars = sorted({char for word in words for char in word})
tags = ['PREF', 'ROOT', 'SUFF', 'END', 'POST', 'LINK', 'HYPH']

# char2idx and tag2idx lookup tables; character indices start at 1 so that
# index 0 stays free for padding.
char2idx = {char: idx + 1 for idx, char in enumerate(chars)}
tag2idx = {tag: idx for idx, tag in enumerate(tags)}

# Inverse lookup tables
idx2char = {idx: char for char, idx in char2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [None]:
class MorphemeDataset(Dataset):
    """Dataset of (character indices, tag indices) tensor pairs.

    Args:
        words: per-sample character sequences; a pandas Series or any
            positionally indexable sequence (e.g. a list).
        labels: per-sample tag sequences, aligned with ``words``.
        char2idx: mapping from character to integer index.
        tag2idx: mapping from tag to integer index.
    """

    def __init__(self, words, labels, char2idx, tag2idx):
        self.words = words
        self.labels = labels
        self.char2idx = char2idx
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.words)

    @staticmethod
    def _item_at(sequence, idx):
        """Return the idx-th element by position for a Series or a plain sequence."""
        # A Series left over from a shuffled split has a non-contiguous index,
        # so it must be read by position via .iloc.
        positional = getattr(sequence, "iloc", None)
        return sequence[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return ``(x, y)`` as ``torch.long`` tensors for the sample at ``idx``.

        Raises:
            KeyError: if a character or tag is missing from the lookup tables.
        """
        word = self._item_at(self.words, idx)
        labels = self._item_at(self.labels, idx)

        x = [self.char2idx[char] for char in word]
        y = [self.tag2idx[tag] for tag in labels]

        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

def collate_fn(batch, label_pad=-1):
    """Pad a list of ``(x, y)`` tensor pairs to a common length and stack them.

    Inputs are padded with 0. Labels are padded with ``label_pad``, which
    defaults to -1: the value the notebook's loss is configured to ignore
    (``ignore_index=-1``). Padding labels with 0 would make every padded
    position count as tag index 0 in the loss.

    Args:
        batch: list of ``(x, y)`` pairs of 1-D tensors.
        label_pad: fill value for the padded label positions.

    Returns:
        ``(x_padded, y_padded)``, each stacked along a new first dimension
        with the length of the longest ``x`` in the batch.
    """
    max_len = max(len(x) for x, _ in batch)

    x_padded = []
    y_padded = []
    for x, y in batch:
        pad_size = max_len - len(x)
        x_padded.append(torch.cat([x, x.new_zeros(pad_size)]))
        y_padded.append(torch.cat([y, y.new_full((pad_size,), label_pad)]))

    return torch.stack(x_padded), torch.stack(y_padded)


# One row per word: its characters and the matching per-character tags.
df = pd.DataFrame({
    'chars': X,
    'labels': Y
}).reset_index(drop=True)

# The splits get their own names so that the raw `train_data` frame loaded at
# the top of the notebook is not overwritten and earlier cells stay re-runnable.
train_split, val_split = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = MorphemeDataset(train_split["chars"], train_split["labels"], char2idx, tag2idx)
val_dataset = MorphemeDataset(val_split["chars"], val_split["labels"], char2idx, tag2idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
# Words differ in length, so validation batches need the padding collate
# function as well; the default collation cannot stack tensors of unequal size.
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)

In [None]:
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Arguments in signature order: vocab_size, embed_dim, hidden_dim, num_tags.
# The vocabulary is one larger than char2idx to leave room for index 0.
model = BiLSTMTagger(len(char2idx) + 1, 64, 128, len(tag2idx)).to(device)

# Label positions equal to -1 are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a global ``device`` variable.

    Args:
        model: module mapping an input batch to per-position tag scores.
        dataloader: iterable of ``(x, y)`` batches.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking flattened scores and flattened labels.

    Returns:
        The mean of the per-batch losses as a float, or 0.0 when the
        dataloader yields no batches.
    """
    model.train()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    num_batches = 0
    for x, y in dataloader:
        x, y = x.to(model_device), y.to(model_device)

        optimizer.zero_grad()
        outputs = model(x)
        # Flatten to (positions, num_tags) vs (positions,) for the loss.
        loss = criterion(outputs.reshape(-1, outputs.shape[-1]), y.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches here also covers iterables without __len__.
    return total_loss / num_batches if num_batches else 0.0



In [None]:
def evaluate(model, dataloader, criterion):
    """Compute the mean batch loss and per-position accuracy on ``dataloader``.

    Positions whose label equals ``criterion.ignore_index`` (the padding
    marker) are left out of the accuracy, mirroring what the loss does.
    Batches are moved to the device of the model's parameters.

    Args:
        model: module mapping an input batch to per-position tag scores.
        dataloader: iterable of ``(x, y)`` batches.
        criterion: loss taking flattened scores and flattened labels.

    Returns:
        ``(avg_loss, accuracy)``; both are 0.0 when there is nothing to score.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    ignore_index = getattr(criterion, "ignore_index", None)

    total_loss = 0.0
    num_batches = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(model_device), y.to(model_device)

            outputs = model(x)
            loss = criterion(outputs.reshape(-1, outputs.shape[-1]), y.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            preds = outputs.argmax(-1)
            # Only real (non-ignored) positions count towards accuracy.
            if ignore_index is None:
                valid = torch.ones_like(y, dtype=torch.bool)
            else:
                valid = y != ignore_index
            correct += ((preds == y) & valid).sum().item()
            total += valid.sum().item()

    accuracy = correct / total if total else 0.0
    avg_loss = total_loss / num_batches if num_batches else 0.0
    return avg_loss, accuracy

In [None]:
def predict(model, word, char2idx, idx2tag):
    """Predict one tag per character of ``word``.

    Characters missing from ``char2idx`` are mapped to index 0. The input is
    placed on the device of the model's parameters, so no global ``device``
    variable is required.

    Args:
        model: module mapping a batch of character indices to tag scores.
        word: a string or a list of characters.
        char2idx: mapping from character to integer index.
        idx2tag: mapping from predicted integer index to tag name.

    Returns:
        A list of tag names with one entry per character; an empty list for
        an empty word.
    """
    chars = list(word)
    if not chars:
        # An empty sequence cannot be fed through the model.
        return []

    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    x = [char2idx.get(char, 0) for char in chars]
    x = torch.tensor([x], dtype=torch.long).to(model_device)

    with torch.no_grad():
        outputs = model(x)
        # tolist() yields plain Python ints for the dictionary lookup below.
        pred_ids = outputs.argmax(-1)[0].tolist()

    tags = [idx2tag[idx] for idx in pred_ids]
    return tags

In [None]:

# Train for a fixed number of passes over train_loader and report the value
# returned by train_epoch after each pass.
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_epoch(
        model,
        train_loader,
        optimizer,
        criterion,
    )
    progress_line = f"Epoch {epoch + 1}, Loss: {train_loss:.4f}"
    print(progress_line)

Epoch 1, Loss: 0.2208
Epoch 2, Loss: 0.1153
Epoch 3, Loss: 0.0988
Epoch 4, Loss: 0.0884
Epoch 5, Loss: 0.0803
Epoch 6, Loss: 0.0733
Epoch 7, Loss: 0.0676
Epoch 8, Loss: 0.0624
Epoch 9, Loss: 0.0578
Epoch 10, Loss: 0.0534


In [None]:
# Score the model on the validation loader; evaluate returns (loss, accuracy).
validation_metrics = evaluate(model, val_loader, criterion)
val_loss, val_acc = validation_metrics
# The printed label is Russian for "Validation accuracy".
print(f"Валидационная точность: {val_acc:.2%}")

In [None]:
# Read the header-less test CSV; its first column holds the words to tag.
test_data = pd.read_csv("/content/drive/MyDrive/Datasets and study/data_For_camp/morpheme_test.csv", header = None)
test_words = test_data[0].tolist()

# Each word as a list of characters, then one list of predicted tags per word.
X_test = [list(test_word) for test_word in test_words]
predicted = [
    predict(model, char_list, char2idx, idx2tag)
    for char_list in X_test
]



In [None]:
# Show only the first few predictions; displaying the whole list floods the
# notebook output.
predicted[:5]

[['ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'END',
  'END'],
 ['ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'END'],
 ['ROOT', 'ROOT', 'ROOT', 'ROOT', 'SUFF', 'END'],
 ['ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'END'],
 ['ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'END',
  'END'],
 ['ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT'],
 ['ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'SUFF',
  'END'],
 ['ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'ROOT',
  'SUFF',
  'END',
  'END',
  'POST',
  'POST'],
 ['ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'SUFF', 'SUFF', 'SUFF', 'END'],
 ['ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT'],
 ['PREF', 'PREF', 'ROOT', 'ROOT', 'ROOT',

In [None]:

# Write one "<word>,<morphemes>" line per test word. The words are Cyrillic,
# so the encoding is fixed explicitly instead of depending on the platform
# default.
with open("submission.csv", "w", encoding="utf-8") as f:
    for word, morphemes in zip(test_words, predicted):
        f.write(convert_morpheme_format(word, morphemes))
        f.write("\n")