In [21]:
!pip install razdel



In [22]:
!pip install wordfreq



In [27]:
import pandas as pd
from razdel import tokenize
from wordfreq import zipf_frequency
import math
from functools import lru_cache
import string


**Считаем файл**

In [28]:
def read_data(data_dir: str):
    """Read a two-column, comma-separated text file into a DataFrame.

    The first non-blank line is the header. Every line is split on the FIRST
    comma only, so commas inside the second (text) column are preserved.
    All values are kept as strings.

    Args:
        data_dir: Path to the UTF-8 encoded input file (despite the name this
            is a file path, not a directory).

    Returns:
        pandas.DataFrame whose column names come from the header line.

    Raises:
        ValueError: If the file contains no header line.
    """
    with open(data_dir, encoding="utf-8") as f:
        lines = [line.rstrip("\n") for line in f]

    # Blank lines (typically a trailing newline at EOF) carry no record; kept,
    # they would become rows with a missing text value and break later steps.
    lines = [line for line in lines if line.strip()]
    if not lines:
        raise ValueError(f"{data_dir!r} is empty: expected a header line")

    header = lines[0].split(",", 1)
    rows = [line.split(",", 1) for line in lines[1:]]

    return pd.DataFrame(rows, columns=header)

In [29]:
# Load the dataset and preview its first rows.
# NOTE(review): the absolute "/content/..." path looks Colab-specific — adjust when running elsewhere.
df = read_data("/content/dataset_1937770_3.txt")
df.head()


Unnamed: 0,id,text_no_spaces
0,0,куплюайфон14про
1,1,ищудомвПодмосковье
2,2,сдаюквартирусмебельюитехникой
3,3,новыйдивандоставканедорого
4,4,отдамдаромкошку


Воспользуемся набором эвристик для первоначальной обработки.

In [30]:
# Initial split with razdel: lower-case the text and keep only each token's string.
df["razdel_array"] = df["text_no_spaces"].apply(
    lambda text: [token.text for token in tokenize(text.lower())]
)

In [31]:
def word_score(word, lang):
    """Score a candidate word for the segmentation search.

    Args:
        word: Candidate substring.
        lang: Language code passed straight to ``zipf_frequency``.

    Returns:
        -10 when the Zipf frequency is not positive (presumably: the word is
        unknown to wordfreq — see its docs); otherwise the frequency minus a
        length penalty of ``8.8 - 0.8 * len(word)``, clamped at zero, so
        short words need a higher frequency to be worth splitting off.
    """
    frequency = zipf_frequency(word, lang)
    if frequency <= 0:
        return -10

    # The penalty shrinks as the word gets longer and is never negative.
    length_penalty = max(0.0, 8.8 - 0.8 * len(word))
    return frequency - length_penalty

# Brand names that are always kept as one segment, in their canonical (output)
# spelling. Built once at import time instead of on every call.
_BRANDS = (
    'amd', 'acer', 'adidas', 'aliexpress', 'amazon', 'apple', 'asus', 'atlant', 'avito',
    'benq', 'bosch', 'brother', 'burger king', 'canon', 'casio', 'coca-cola', 'dell', 'dyson',
    'epson', 'facebook', 'fender', 'fitbit', 'ford', 'garmin', 'gibson', 'google', 'h&m', 'hp',
    'honor', 'huawei', 'ibm', 'ikea', 'indesit', 'instagram', 'intel', 'jbl', 'jvc', 'kfc', 'lg',
    'lenovo', 'levi’s', 'linkedin', 'logitech', 'msi', 'mac', 'mastercard', "mcdonald's", 'merida',
    'microsoft', 'nestle', 'netflix', 'nike', 'nikon', 'nvidia', 'oppo', 'oracle', 'panasonic', 'paypal',
    'pepsi', 'philips', 'pixel', 'playstation', 'puma', 'python', 'redmond', 'reebok', 'samsung', 'sberbank',
    'seagate', 'sharp', 'skype', 'sony', 'spotify', 'stels', 'tcl', 'tp-link', 'telegram', 'tesla', 'toshiba',
    'toyota', 'twitter', 'uniqlo', 'visa', 'western digital', 'whatsapp', 'windows', 'xbox', 'xiaomi', 'yamaha',
    'youtube', 'zara', 'zoom', 'ebay',
)

# Maps the space-free spelling found in the input to the canonical spelling.
# Without this, entries such as 'burger king' could never match, because the
# string being segmented contains no spaces.
_BRAND_LOOKUP = {brand.replace(' ', ''): brand for brand in _BRANDS}

# Fixed score given to a brand match (much larger than the penalty of -10
# that word_score gives an unknown word).
_BRAND_SCORE = 100


def segment_viterbi(s, lang, max_word_len=50):
    """Split a space-free string into words with a Viterbi-style search.

    For every prefix of ``s`` the best-scoring segmentation is computed by
    dynamic programming. A piece that is a known brand gets a fixed score of
    100; any other piece is scored by ``word_score(piece, lang)``. The total
    score of a segmentation is the sum of its piece scores.

    Args:
        s: The string to segment (expected to contain no spaces).
        lang: Language code forwarded to ``word_score``.
        max_word_len: Longest piece considered, in characters (default 50,
            the value that used to be hard-coded).

    Returns:
        The pieces joined with single spaces. Multi-word brands are emitted in
        their canonical spelling (e.g. ``'burgerking'`` -> ``'burger king'``).
        ``s`` is returned unchanged when no segmentation reaches its end, and
        an empty string yields an empty string.
    """
    n = len(s)
    unreachable = float("-inf")

    # best[i]: best score for the prefix s[:i]; back[i]: start of its last piece.
    best = [unreachable] * (n + 1)
    back = [-1] * (n + 1)
    best[0] = 0.0

    for start in range(n):
        if best[start] == unreachable:
            continue

        for length in range(1, min(max_word_len, n - start) + 1):
            end = start + length
            piece = s[start:end]
            if piece in _BRAND_LOOKUP:
                score = _BRAND_SCORE
            else:
                score = word_score(piece, lang)

            candidate = best[start] + score
            # Strict '>' keeps the first segmentation found among equal scores.
            if candidate > best[end]:
                best[end] = candidate
                back[end] = start

    if best[n] == unreachable:
        return s  # fallback: return the input as is

    # Walk the back-pointers from the end of the string to its start.
    parts = []
    cur = n
    while cur > 0:
        prev = back[cur]
        if prev == -1:
            return s  # defensive: broken chain, cannot segment
        piece = s[prev:cur]
        parts.append(_BRAND_LOOKUP.get(piece, piece))
        cur = prev
    parts.reverse()
    return " ".join(parts)

In [32]:
def segment_no_spaces_words(arr):
    """Segment every token of a token list, choosing the language per token.

    The language is decided by the token's first character only: a lower-case
    Cyrillic letter selects ``"ru"``, a lower-case ASCII letter selects
    ``"en"``. Tokens starting with anything else (digits, punctuation, ...)
    are passed through unchanged, and empty tokens are dropped.

    Args:
        arr: Iterable of token strings.

    Returns:
        List with one entry per non-empty token: either the result of
        ``segment_viterbi`` or the token itself.
    """
    cyrillic_lower = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
    segmented = []
    for token in arr:
        if not token:
            continue

        lead = token[0]
        if lead in cyrillic_lower:
            lang = "ru"
        elif lead in string.ascii_lowercase:
            lang = "en"
        else:
            lang = None

        segmented.append(token if lang is None else segment_viterbi(token, lang))
    return segmented

In [33]:

# Segment every razdel token into words. The column is overwritten in place,
# so re-running this cell would segment the already segmented strings again.
df['razdel_array'] = df['razdel_array'].apply(segment_no_spaces_words)

Преобразуем массив в простую строку

In [34]:
def word_list_to_string(word_list) -> str:
    """Join segments into one string, inserting spaces between them.

    A single space is placed before every segment except the first one and
    except segments whose first character is ASCII punctuation or a space.
    Empty segments are skipped; an empty or missing list gives ``""``.

    Args:
        word_list: Sequence of segment strings (may be empty or None).

    Returns:
        The joined string.
    """
    no_space_before = set(string.punctuation) | {' '}

    pieces = []
    for segment in filter(None, word_list or ()):
        # `pieces` is non-empty exactly when a segment was already emitted.
        glue = ' ' if pieces and segment[0] not in no_space_before else ''
        pieces.append(glue + segment)

    return ''.join(pieces)


In [36]:
# Join each row's segments back into a single spaced string.
df['text_with_spaces'] = df['razdel_array'].apply(word_list_to_string)

In [37]:
# Preview the first restored strings.
df['text_with_spaces'].head()

Unnamed: 0,text_with_spaces
0,куплю айфон 14 про
1,ищу дом в подмосковье
2,сдаю квартиру с мебелью и техникой
3,новый диван доставка недорого
4,отдам даром кошку


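Костыль (не бейте): индексы пробелов в строке с пробелами пересчитываем в позиции исходной строки без пробелов — из индекса k-го пробела (считая с нуля) вычитаем k.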

In [38]:
# For every restored string, collect the indices at which a space character sits.
viterbi_unigram_pred = [
    [pos for pos, ch in enumerate(text) if ch == ' ']
    for text in df["text_with_spaces"]
]

In [39]:
# Convert space indices in the spaced string into positions in the original
# space-free text: the k-th space (0-based) is preceded by k inserted spaces,
# so its position shrinks by k.
# Recomputed from df["text_with_spaces"] instead of mutating the lists in
# place, so re-running this cell no longer shifts the positions a second time.
viterbi_unigram_pred = [
    [pos - k for k, pos in enumerate(i for i, ch in enumerate(text) if ch == ' ')]
    for text in df["text_with_spaces"]
]

Submission

In [40]:

# Build the submission table: one row per text, indexed by a running id.
# NOTE(review): ids are generated as 0..n-1 rather than read from df["id"];
# this assumes the input rows are numbered sequentially in file order — confirm.
submission_df = pd.DataFrame(
    {
        'id': range(len(viterbi_unigram_pred)),
        'predicted_positions': viterbi_unigram_pred,
    }
).set_index('id')

In [41]:
# Write the submission next to the notebook; the 'id' index is written as the first CSV column.
submission_df.to_csv('finale.csv')

In [42]:
def calculate_f1(predictions, truths):
    """Compute the mean F1-score of predicted space positions.

    Each prediction/truth pair is compared as a set of positions, so
    duplicates and order do not matter. Pairs are matched with ``zip``;
    if the two inputs differ in length the extra items are ignored.

    Args:
        predictions: list of lists with the predicted space indices
        truths: list of lists with the true space indices

    Returns:
        The F1-score averaged over all texts (0.0 when there are no texts).
    """

    def _pair_f1(pred, true):
        predicted, expected = set(pred), set(true)

        # Nothing predicted and nothing expected counts as a perfect match.
        if not predicted and not expected:
            return 1.0

        hits = len(predicted & expected)
        # Precision = TP / (TP + FP), Recall = TP / (TP + FN).
        precision = hits / len(predicted) if predicted else 0.0
        recall = hits / len(expected) if expected else 0.0

        total = precision + recall
        return 2 * (precision * recall) / total if total > 0 else 0.0

    scores = [_pair_f1(pred, true) for pred, true in zip(predictions, truths)]

    # Mean F1 over all texts.
    return sum(scores) / len(scores) if scores else 0.0


Спасибо за проверку ❤️