In [1]:
import numpy as np
import pandas as pd
import nltk
import os

In [2]:
# nltk.download('wordnet')
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

In [4]:
import re

In [5]:
train_data = pd.read_parquet("../data/train.parquet")

In [6]:
test_data = pd.read_parquet("../data/test.parquet")

In [59]:
train_data.shape

(120000, 2)

In [7]:
def split_into_sentences(text):
    # (?<!\w\.\w.) - проверяет, что нет слова, за которым следует точка и еще одно слово.
    # (?<![A-Z][a-z]\.) - проверяет, что перед текущей позицией нет заглавной буквы, за которой следует строчная буква и точка.
    # (?<=\.|\?|!) - проверяет, что перед текущей позицией находится точка, вопросительный знак или восклицательный знак.
    # \s - пробельный символ 
    sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s')
    
    # Находим все предложения в тексте
    sentences = sentence_pattern.split(text)
    
    # Удаляем пустые строки, если они есть
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    
    return sentences

In [8]:
split_into_sentences("Who are you? How it's going?")

['Who are you?', "How it's going?"]

In [9]:
def find_emails(sentence):
    # [A-Za-z0-9._%+-]+ - часть почты до @
    # [A-Za-z0-9.-]+ - доменная часть - .com(или другого)
    # [A-Z|a-z]{2,} - доменный уровень (например, .com, .org)
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    
    # Находим все email-адреса в тексте
    emails = email_pattern.findall(sentence)
    
    return emails

In [10]:
text = 'Please, send your feedback at this adress: {mail}'
emails = ["user.name@example.com","another-email@domain.org", "test123@sub.domain.co.uk"]
for em in emails:
    print(find_emails(text.format(mail=em)))

['user.name@example.com']
['another-email@domain.org']
['test123@sub.domain.co.uk']


In [11]:
def find_phone_number(sentence):
    # \+? - символ + есть ноль или 1 раз
    # [- (]? и [- )]? - разделитель в  виде -, () и пробел
    # \d{3} - три любых цифры
    # 
    number_pattern = re.compile(r'\+?7?[- (]?\d{3}[- )]?\d{3}[- ]?\d{2}[- ]?\d{2}')
    
    # Находим все номера телефонов в тексте
    phone_numbers = number_pattern.findall(sentence)
    
    return phone_numbers

In [12]:
text_phone = 'Please, call us back: {number}'
phones = ["+79261234567","89261234567", "79261234567","+7 926 123 45 67", "8(926)123-45-67"]
for phone in phones:
    print(find_phone_number(text_phone.format(number=phone)))

['+79261234567']
[' 8926123456']
[' 7926123456']
['+7 926 123 45 67']
['(926)123-45-67']


In [13]:
text_phones = """Please, call us, to get more information:
Additional phone number: +79261234567
Email: mail@yandex.com"""
find_phone_number(text_phones)

['+79261234567']

In [14]:
def find_dates(sentence):
    # Регулярное выражение для поиска дат
    date_pattern = re.compile(r'\b(\d{1,2})([./-]?)(\d{1,2})\2(\d{2,4})\b')
    
    # Находим все даты в тексте
    dates = date_pattern.findall(sentence)

    # Преобразуем найденные даты в строки
    formatted_dates = [f"{day}{separator}{month}{separator}{year}" for day, separator, month, year in dates]
    
    return formatted_dates

In [15]:
text_date = 'Date to meet: {dat}'
dates = ['19.06.2024', '19-06-2024', '10/12/24']
for date in dates:
    print(find_dates(text_date.format(dat=date)))

['19.06.2024']
['19-06-2024']
['10/12/24']


In [16]:
def tokenize_sentence(sentence):
    # Регулярное выражение для выделения отдельных токенов
    pattern = r"\+?\b[\w@.]+(?:'\w+)?\b|[:;,?.!]"
    return re.findall(pattern, sentence)

In [17]:
tokens = tokenize_sentence(text_phones)

In [18]:
print(text_phones)

Please, call us, to get more information:
Additional phone number: +79261234567
Email: mail@yandex.com


In [19]:
print(tokens)

['Please', ',', 'call', 'us', ',', 'to', 'get', 'more', 'information', ':', 'Additional', 'phone', 'number', ':', '+79261234567', 'Email', ':', 'mail@yandex.com']


In [20]:
stemmer.stem('refridgerator')

'refridger'

In [21]:
stemmer.stem('generations')

'generat'

In [22]:
stemmer.stem('optimization')

'optim'

In [23]:
lemmatizer.lemmatize('refridgerator')

'refridgerator'

In [24]:
lemmatizer.lemmatize('generation')

'generation'

In [48]:
def tokenize_text(text):
    sentences = split_into_sentences(text)
    tokenized = []  # Список для хранения результатов по каждому предложению
    
    for sentence in sentences:
        sentence_data = {
            'tokens': [],  # Оригинальные токены
            'stems': [],   # Стемы
            'lemmas': [],  # Леммы
            'entities': []  # Специальные сущности (email, phone, date)
        }
        
        word_tokens = tokenize_sentence(sentence)
        
        for token in word_tokens:
            # Поиск специальных сущностей
            emails = find_emails(token)
            if emails:
                sentence_data['entities'].append({'type': 'email', 'value': emails[0]})
                continue
            
            phones = find_phone_number(token)
            if phones:
                sentence_data['entities'].append({'type': 'phone', 'value': phones[0]})
                continue
            
            dates = find_dates(token)
            if dates:
                sentence_data['entities'].append({'type': 'date', 'value': dates[0]})
                continue
            
            # Обычная обработка токена
            stem = stemmer.stem(token)
            lemma = lemmatizer.lemmatize(token)
            
            sentence_data['tokens'].append(token)
            sentence_data['stems'].append(stem)
            sentence_data['lemmas'].append(lemma)
        
        tokenized.append(sentence_data)  # Добавляем результат предложения в общий список
    
    return tokenized

In [49]:
def format_to_tsv(tokenized_data):
    tsv_lines = []
    
    for sentence_data in tokenized_data:
        # Обрабатываем обычные токены
        for token, stem, lemma in zip(sentence_data['tokens'], sentence_data['stems'], sentence_data['lemmas']):
            tsv_lines.append(f"{token}\t{stem}\t{lemma}")
        
        # Обрабатываем специальные сущности
        for entity in sentence_data['entities']:
            tsv_lines.append(f"{entity['type']}\t{entity['value']}")
        
        # Добавляем разделитель предложений
        tsv_lines.append('')
    
    # Убираем последний лишний разделитель
    return '\n'.join(tsv_lines[:-1])

In [50]:
train_data['text'][55]

"Hip Hop's Online Shop Celebrity fashion is booming. These webpreneurs are bringing it to main street"

In [51]:
tokenized_data = tokenize_text(train_data['text'][55])

In [55]:
all_tokens = [token for sentence_data in tokenized_data for token in sentence_data['tokens']]

In [57]:
tsv_output = format_to_tsv(tokenized_data)
print(tsv_output)

Hip	hip	Hip
Hop's	hop	Hop's
Online	onlin	Online
Shop	shop	Shop
Celebrity	celebr	Celebrity
fashion	fashion	fashion
is	is	is
booming	boom	booming
.	.	.

These	these	These
webpreneurs	webpreneur	webpreneurs
are	are	are
bringing	bring	bringing
it	it	it
to	to	to
main	main	main
street	street	street


In [26]:
print(train_data['text'][5])

Stocks End Up, But Near Year Lows (Reuters) Reuters - Stocks ended slightly higher on Friday\but stayed near lows for the year as oil prices surged past  #36;46\a barrel, offsetting a positive outlook from computer maker\Dell Inc. (DELL.O)


In [29]:
print(tokenizer(train_data['text'][5]))

Stocks	stock	Stocks
End	end	End
Up	up	Up
,	,	,
But	but	But
Near	near	Near
Year	year	Year
Lows	low	Lows
Reuters	reuter	Reuters
Reuters	reuter	Reuters
Stocks	stock	Stocks
ended	end	ended
slightly	slight	slightly
higher	higher	higher
on	on	on
Friday	friday	Friday
but	but	but
stayed	stay	stayed
near	near	near
lows	low	low
for	for	for
the	the	the
year	year	year
as	as	a
oil	oil	oil
prices	price	price
surged	surg	surged
past	past	past
36	36	36
;	;	;
46	46	46
a	a	a
barrel	barrel	barrel
,	,	,
offsetting	offset	offsetting
a	a	a
positive	posit	positive
outlook	outlook	outlook
from	from	from
computer	comput	computer
maker	maker	maker
Dell	dell	Dell
Inc	inc	Inc
.	.	.

DELL.O	dell.o	DELL.O




In [30]:
def generate_corpus(train_data, test_data):
    path_to_save = '../assets/annotated-corpus/'
    classes = sorted(train_data['label'].unique())
    for type, data in [('train', train_data), ('test',test_data)]:
        for cl in classes:
            folder_name = path_to_save + type + '/' + str(cl)
            annotated_file = ''
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
                
            for text_item in data.loc[data['label'] == cl]['text'].to_list():
                annotated_file += tokenizer(text_item)
                
            file_path = folder_name + '/' + str(cl) + '.tsv'
            with open(file_path, 'w', encoding='utf-8') as tsv_file:
                tsv_file.write(annotated_file)
                

In [31]:
import os

def generate_corpus(train_data, test_data):
    path_to_save = '../assets/annotated-corpus/'
    # Получаем все уникальные классы из train и test
    train_classes = set(train_data['label'].unique())
    test_classes = set(test_data['label'].unique())
    all_classes = sorted(list(train_classes.union(test_classes)))
    
    for data_type, data in [('train', train_data), ('test', test_data)]:
        # Создаем копию данных, чтобы не изменять оригинальные
        data = data.copy()
        # Проверяем наличие doc_id, генерируем при необходимости
        if 'doc_id' not in data.columns:
            data.reset_index(drop=True, inplace=True)
            data['doc_id'] = data.index.astype(str)
        
        # Проходим по всем классам
        for cl in all_classes:
            # Фильтруем данные по текущему классу
            class_data = data[data['label'] == cl]
            if class_data.empty:
                continue  # Пропускаем, если нет документов
            
            # Создаем папку для класса
            class_folder = os.path.join(path_to_save, data_type, str(cl))
            os.makedirs(class_folder, exist_ok=True)
            
            # Сохраняем каждый документ отдельным файлом
            for _, row in class_data.iterrows():
                doc_id = row['doc_id']
                text_content = tokenizer(row['text'])
                file_path = os.path.join(class_folder, f"{doc_id}.tsv")
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(text_content)

In [32]:
generate_corpus(train_data, test_data)