In [16]:
import numpy as np
import pandas as pd
import nltk
import os

In [17]:
# nltk.download('wordnet')
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [18]:
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

In [19]:
import re

In [20]:
train_data = pd.read_parquet("../data/train.parquet")

In [21]:
test_data = pd.read_parquet("../data/test.parquet")

In [22]:
def split_into_sentences(text):
    # (?<!\w\.\w.) - проверяет, что нет слова, за которым следует точка и еще одно слово.
    # (?<![A-Z][a-z]\.) - проверяет, что перед текущей позицией нет заглавной буквы, за которой следует строчная буква и точка.
    # (?<=\.|\?|!) - проверяет, что перед текущей позицией находится точка, вопросительный знак или восклицательный знак.
    # \s - пробельный символ 
    sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s')
    
    # Находим все предложения в тексте
    sentences = sentence_pattern.split(text)
    
    # Удаляем пустые строки, если они есть
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    
    return sentences

In [23]:
split_into_sentences("Who are you? How it's going?")

['Who are you?', "How it's going?"]

In [24]:
def find_emails(sentence):
    # [A-Za-z0-9._%+-]+ - часть почты до @
    # [A-Za-z0-9.-]+ - доменная часть - .com(или другого)
    # [A-Z|a-z]{2,} - доменный уровень (например, .com, .org)
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    
    # Находим все email-адреса в тексте
    emails = email_pattern.findall(sentence)
    
    return emails

In [25]:
text = 'Please, send your feedback at this adress: {mail}'
emails = ["user.name@example.com","another-email@domain.org", "test123@sub.domain.co.uk"]
for em in emails:
    print(find_emails(text.format(mail=em)))

['user.name@example.com']
['another-email@domain.org']
['test123@sub.domain.co.uk']


In [26]:
def find_phone_number(sentence):
    # \+? - символ + есть ноль или 1 раз
    # [- (]? и [- )]? - разделитель в  виде -, () и пробел
    # \d{3} - три любых цифры
    # 
    number_pattern = re.compile(r'\+?7?[- (]?\d{3}[- )]?\d{3}[- ]?\d{2}[- ]?\d{2}')
    
    # Находим все номера телефонов в тексте
    phone_numbers = number_pattern.findall(sentence)
    
    return phone_numbers

In [27]:
text_phone = 'Please, call us back: {number}'
phones = ["+79261234567","89261234567", "79261234567","+7 926 123 45 67", "8(926)123-45-67"]
for phone in phones:
    print(find_phone_number(text_phone.format(number=phone)))

['+79261234567']
[' 8926123456']
[' 7926123456']
['+7 926 123 45 67']
['(926)123-45-67']


In [29]:
text_phones = """Please, call us, to get more information:
Additional phone number: +79261234567
Email: mail@yandex.com"""
find_phone_number(text_phones)

['+79261234567']

In [30]:
def find_dates(sentence):
    # Регулярное выражение для поиска дат
    date_pattern = re.compile(r'\b(\d{1,2})([./-]?)(\d{1,2})\2(\d{2,4})\b')
    
    # Находим все даты в тексте
    dates = date_pattern.findall(sentence)

    # Преобразуем найденные даты в строки
    formatted_dates = [f"{day}{separator}{month}{separator}{year}" for day, separator, month, year in dates]
    
    return formatted_dates

In [31]:
text_date = 'Date to meet: {dat}'
dates = ['19.06.2024', '19-06-2024', '10/12/24']
for date in dates:
    print(find_dates(text_date.format(dat=date)))

['19.06.2024']
['19-06-2024']
['10/12/24']


In [32]:
def tokenize_sentence(sentence):
    # Регулярное выражение для выделения отдельных токенов
    pattern = r"\+?\b[\w@.]+(?:'\w+)?\b|[:;,?.!]"
    return re.findall(pattern, sentence)

In [33]:
tokens = tokenize_sentence(text_phones)

In [34]:
print(text_phones)

Please, call us, to get more information:
Additional phone number: +79261234567
Email: mail@yandex.com


In [35]:
print(tokens)

['Please', ',', 'call', 'us', ',', 'to', 'get', 'more', 'information', ':', 'Additional', 'phone', 'number', ':', '+79261234567', 'Email', ':', 'mail@yandex.com']


In [36]:
stemmer.stem('refridgerator')

'refridger'

In [37]:
stemmer.stem('generations')

'generat'

In [38]:
stemmer.stem('optimization')

'optim'

In [39]:
lemmatizer.lemmatize('refridgerator')

'refridgerator'

In [40]:
lemmatizer.lemmatize('generation')

'generation'

In [41]:
def tokenizer(text):
    sentences = split_into_sentences(text)
    annotated_text = ''
    for sentence in sentences:
        annotated_sentences = ''
        word_tokens = tokenize_sentence(sentence)

        for token in word_tokens:
            emails = find_emails(token)
            phones = find_phone_number(token)
            dates = find_dates(token)
            if emails:
                annotated_sentences += email[0] + '\n'
            elif phones:
                annotated_sentences += phone[0] + '\n'
            elif dates:
                annotated_sentences += dates[0] + '\n'
            else:
                stem = stemmer.stem(token)
                lemm = lemmatizer.lemmatize(token)
                annotated_sentences += '\t'.join([token, stem, lemm]) + '\n'
        
        annotated_text += annotated_sentences + '\n'

    return annotated_text

In [42]:
print(train_data['text'][5])

Stocks End Up, But Near Year Lows (Reuters) Reuters - Stocks ended slightly higher on Friday\but stayed near lows for the year as oil prices surged past  #36;46\a barrel, offsetting a positive outlook from computer maker\Dell Inc. (DELL.O)


In [43]:
print(tokenizer(train_data['text'][5]))

Stocks	stock	Stocks
End	end	End
Up	up	Up
,	,	,
But	but	But
Near	near	Near
Year	year	Year
Lows	low	Lows
Reuters	reuter	Reuters
Reuters	reuter	Reuters
Stocks	stock	Stocks
ended	end	ended
slightly	slight	slightly
higher	higher	higher
on	on	on
Friday	friday	Friday
but	but	but
stayed	stay	stayed
near	near	near
lows	low	low
for	for	for
the	the	the
year	year	year
as	as	a
oil	oil	oil
prices	price	price
surged	surg	surged
past	past	past
36	36	36
;	;	;
46	46	46
a	a	a
barrel	barrel	barrel
,	,	,
offsetting	offset	offsetting
a	a	a
positive	posit	positive
outlook	outlook	outlook
from	from	from
computer	comput	computer
maker	maker	maker
Dell	dell	Dell
Inc	inc	Inc
.	.	.

DELL.O	dell.o	DELL.O




In [44]:
def generate_corpus(train_data, test_data):
    path_to_save = '../assets/annotated-corpus/'
    classes = sorted(train_data['label'].unique())
    for type, data in [('train', train_data), ('test',test_data)]:
        for cl in classes:
            folder_name = path_to_save + type + '/' + str(cl)
            annotated_file = ''
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
                
            for text_item in data.loc[data['label'] == cl]['text'].to_list():
                annotated_file += tokenizer(text_item)
                
            file_path = folder_name + '/' + str(cl) + '.tsv'
            with open(file_path, 'w', encoding='utf-8') as tsv_file:
                tsv_file.write(annotated_file)
                

In [45]:
generate_corpus(train_data, test_data)