In [1]:
import pandas as pd
import re

In [2]:
# Load the raw Santa letters dataset and print a summary of its columns and dtypes.
df = pd.read_csv('data/santa_letters_canada.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      5000 non-null   object
 1   province  5000 non-null   object
 2   city      5000 non-null   object
 3   date      5000 non-null   object
 4   letter    5000 non-null   object
 5   gifts     5000 non-null   object
dtypes: object(6)
memory usage: 234.5+ KB


In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,name,province,city,date,letter,gifts
0,Lucas,Ontario,Brampton,2024-11-17,"Yo Santa!\n\nLucas checking in from Brampton, ...","electric scooter, virtual reality headset"
1,Donna,Nova Scotia,Sydney,2024-12-03,"Hi Santa!\n\nDonna here, waving from Sydney, N...","Our Generation Siberian Husky Plush, video game"
2,Tammy,Nunavut,Cambridge Bay,2024-11-03,"Hi Santa,\n\nI'm Tammy from Cambridge Bay, Nun...","lego set, art supplies"
3,Lindsey,Alberta,Red Deer,2024-12-23,"Dear Santa,\n\nI hope this letter finds you we...","Furby Furblets, skateboard"
4,Kevin,Alberta,Calgary,2024-11-28,"Santa, my dude!\n\nKevin here, repping Calgary...","solar-powered phone charger, magnetic levitati..."


# Preprocessing

In [4]:
# Only the letter text and the gift list are needed from here on;
# the sender metadata columns are removed from df in place.
df.drop(columns=['name', 'province', 'city', 'date'], inplace=True)
df.head()

Unnamed: 0,letter,gifts
0,"Yo Santa!\n\nLucas checking in from Brampton, ...","electric scooter, virtual reality headset"
1,"Hi Santa!\n\nDonna here, waving from Sydney, N...","Our Generation Siberian Husky Plush, video game"
2,"Hi Santa,\n\nI'm Tammy from Cambridge Bay, Nun...","lego set, art supplies"
3,"Dear Santa,\n\nI hope this letter finds you we...","Furby Furblets, skateboard"
4,"Santa, my dude!\n\nKevin here, repping Calgary...","solar-powered phone charger, magnetic levitati..."


In [5]:
def _normalize_text(series, punctuation_pattern):
    """Lowercase a string Series, blank out ``punctuation_pattern`` matches, collapse whitespace and trim."""
    lowered = series.str.lower()
    without_punct = lowered.str.replace(punctuation_pattern, ' ', regex=True)  # punctuation -> space
    collapsed = without_punct.str.replace(r'\s+', ' ', regex=True)            # collapse whitespace runs
    return collapsed.str.strip()


# Bring the letter text to a normalized form: every character that is neither
# a word character nor whitespace becomes a space.
df['letter_clean'] = _normalize_text(df['letter'], r'[^\w\s]')

# Same normalization for the gift list, except that commas are kept.
df['target'] = _normalize_text(df['gifts'], r'[^\w\s,]')

In [6]:
# The raw columns are superseded by letter_clean / target; remove them from df in place.
df.drop(columns=['letter', 'gifts'], inplace=True)
df.head()

Unnamed: 0,letter_clean,target
0,yo santa lucas checking in from brampton ontar...,"electric scooter, virtual reality headset"
1,hi santa donna here waving from sydney nova sc...,"our generation siberian husky plush, video game"
2,hi santa i m tammy from cambridge bay nunavut ...,"lego set, art supplies"
3,dear santa i hope this letter finds you well m...,"furby furblets, skateboard"
4,santa my dude kevin here repping calgary alber...,"solar powered phone charger, magnetic levitati..."


In [9]:
# Persist the cleaned frame to CSV without writing the index column.
df.to_csv('./data/clear_data.csv', index=False)

In [8]:
def create_training_data(text, gifts_string, whole_words=True):
    """Build one NER training example by locating every gift inside ``text``.

    Args:
        text: Letter text to search in.
        gifts_string: Comma-separated gift names.
        whole_words: When True (default), a gift matches only if it is not
            directly preceded or followed by a word character, so "car" is
            not found inside "card". Pass False for plain substring matching.

    Returns:
        A tuple ``(text, {"entities": [(start, end, "GIFT"), ...]})`` where
        ``start``/``end`` are character offsets into ``text``. Entities are
        listed gift by gift (in the order the gifts appear in
        ``gifts_string``), each gift's occurrences from left to right.
    """
    # Skip empty fragments (empty string, trailing comma): an empty pattern
    # would match at every position and emit zero-length entities.
    # dict.fromkeys removes repeated gifts while keeping their first-seen order.
    stripped = (part.strip() for part in gifts_string.split(','))
    gifts = list(dict.fromkeys(name for name in stripped if name))
    entities = []

    for gift in gifts:
        pattern = re.escape(gift)
        if whole_words:
            # Lookarounds instead of \b so gifts that start or end with a
            # non-word character are still handled correctly.
            pattern = rf'(?<!\w){pattern}(?!\w)'

        # Find every mention of the gift in the text (case-insensitive)
        for match in re.finditer(pattern, text, re.IGNORECASE):
            entities.append((match.start(), match.end(), "GIFT"))

    return (text, {"entities": entities})

In [10]:
# One (text, annotations) example per row, in the same order as the rows of df.
training_data = [
    create_training_data(letter, gifts)
    for letter, gifts in zip(df['letter_clean'], df['target'])
]

### Проверка правильности разметки

In [11]:
def validate_annotation_row(text, target, i):
    """Compare the annotated spans of example ``i`` with the gifts listed in ``target``.

    Args:
        text: Text the character offsets refer to.
        target: Comma-separated gift names expected to be annotated.
        i: Position of the example in the module-level ``training_data`` list.

    Returns:
        Dict with the sets ``expected``, ``extracted``, ``missing``
        (expected but not extracted), ``extra`` (extracted but not expected)
        and the boolean ``is_ok`` (both sets are equal).
    """
    annotated = training_data[i][1]['entities']

    # what the annotation actually cut out of the text
    extracted = {text[lo:hi].strip() for lo, hi, _label in annotated}

    # what should have been cut out, according to target (blank fragments ignored)
    expected = {piece.strip() for piece in target.split(',') if piece.strip()}

    return {
        "expected": expected,
        "extracted": extracted,
        "missing": expected - extracted,
        "extra": extracted - expected,
        "is_ok": expected == extracted,
    }


In [12]:
errors = []
# validate_annotation_row addresses training_data by position, so pass the
# running position from enumerate rather than the index label.
for pos, (idx, row) in enumerate(df.iterrows()):
    result = validate_annotation_row(
        text=row['letter_clean'],
        target=row['target'],
        i=pos
    )

    if not result["is_ok"]:
        errors.append({
            "row": idx,
            # df has no 'text' column at this point (only letter_clean/target);
            # reading row['text'] would raise KeyError on the first failing row.
            "text": row['letter_clean'],
            "expected": result["expected"],
            "extracted": result["extracted"],
            "missing": result["missing"],
            "extra": result["extra"]
        })


In [13]:
print(errors)

[]


In [14]:
import spacy
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the annotated examples as the dev set; the fixed
# random_state keeps the split reproducible between runs.
train_data, dev_data = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42
)

In [16]:
# Blank English pipeline; make_docbin uses it to turn texts into Doc objects via nlp.make_doc.
nlp = spacy.blank("en")

In [17]:
def filter_overlapping_spans(spans):
    """Greedily keep non-overlapping spans, preferring longer ones.

    Spans are considered from longest to shortest (``end - start``); among
    spans of equal length the one that starts earlier is considered first.
    A span is kept only if none of its positions is already covered by a
    previously kept span.

    Args:
        spans: Iterable of objects with integer ``start`` and ``end``
            attributes (``end`` exclusive), e.g. spaCy ``Span`` objects.

    Returns:
        List of the kept spans, in the order they were selected
        (longest first, then by start).
    """
    # Sort by length (descending), then by start. The start tie-break makes
    # the winner among equal-length overlapping spans independent of the
    # order in which the spans were passed in.
    ordered = sorted(spans, key=lambda s: (s.start - s.end, s.start))

    result = []
    used_tokens = set()

    for span in ordered:
        token_ids = range(span.start, span.end)
        if used_tokens.isdisjoint(token_ids):
            result.append(span)
            used_tokens.update(token_ids)

    return result


In [18]:
def make_docbin(data, output_path):
    """Convert annotated examples into spaCy docs and write them to ``output_path`` as a DocBin.

    Args:
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            pairs with character offsets into ``text``.
        output_path: Destination passed to ``DocBin.to_disk``.

    Side effects:
        Prints a "DUPLICATE:" line for texts where two entities resolve to the
        same token span and an "OVERLAP:" line for every overlapping pair of
        spans; overlapping spans are then reduced via filter_overlapping_spans.
    """
    doc_bin = DocBin()

    for text, ann in data:
        doc = nlp.make_doc(text)

        # char_span may return None when no token-aligned span can be built
        # for the given character range; such entities are skipped.
        candidates = (
            doc.char_span(start, end, label=label, alignment_mode="contract")
            for start, end, label in ann["entities"]
        )
        spans = [candidate for candidate in candidates if candidate is not None]

        # Diagnostics only: report identical token spans ...
        distinct_bounds = {(span.start, span.end) for span in spans}
        if len(spans) != len(distinct_bounds):
            print("DUPLICATE:", text)

        # ... and every pair of spans that share at least one token.
        for pos, first in enumerate(spans):
            for second in spans[pos + 1:]:
                if first.start < second.end and second.start < first.end:
                    print("OVERLAP:", first.text, "|", second.text)

        doc.ents = filter_overlapping_spans(spans)
        doc_bin.add(doc)

    doc_bin.to_disk(output_path)

Была проблема с созданием спанов: когда название одного подарка входит в название другого (например, «robot» внутри «virtual pet robot»), их индексы накладываются друг на друга. В качестве решения из пересекающихся спанов оставляется самый длинный.

In [23]:
# Serialize both splits to .spacy files (relative paths, so they land in the current working directory).
make_docbin(train_data, "train.spacy")
make_docbin(dev_data, "dev.spacy")

OVERLAP: robot | virtual pet robot
OVERLAP: lego sets | lego
