In [None]:
# Fetch the 20 Newsgroups "bydate" archive and unpack it into the current working directory.
!wget http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz
!tar -xvzf ./20news-bydate.tar.gz

In [3]:
import os

def read_files_from_folders(root_folder, encoding='utf-8'):
    """Read every file below ``root_folder`` and return the flattened contents.

    Directories and files are visited in sorted order so the returned list is
    the same on every filesystem and every run.

    Args:
        root_folder: Directory that is walked recursively.
        encoding: Text encoding used to decode the files. Bytes that cannot be
            decoded are dropped (``errors='ignore'``).

    Returns:
        A list with one string per readable file, in traversal order, where
        every newline and tab has been replaced by a single space. A missing
        ``root_folder`` yields an empty list.
    """
    all_files_content = []

    for foldername, subfolders, filenames in os.walk(root_folder):
        # os.walk honours in-place changes to the directory list, so sorting it
        # here fixes the order in which sub-folders are descended into.
        subfolders.sort()
        for filename in sorted(filenames):
            file_path = os.path.join(foldername, filename)
            try:
                with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
                    content = file.read()
            except (OSError, UnicodeError) as e:
                # Best effort: report the unreadable file and keep going.
                # (Message is Russian for "Error while reading file ...".)
                print(f"Ошибка при чтении файла {file_path}: {e}")
                continue
            all_files_content.append(content.replace('\n', ' ').replace('\t', ' '))

    return all_files_content


root_directory = '20news-bydate-test'
# Glue every document into one space-separated string; this is the text that
# gets split into sentences further down.
all_contents = ' '.join(read_files_from_folders(root_directory))

In [4]:
import re

In [None]:
import re

# A sentence ends at '.', '!' or '?' when that character is followed by
# whitespace or the end of the text. The terminator is captured so that
# re.split keeps it in the result.
sentence_pattern = r'([.!?])(?=\s|$)'

def split_sentences_with_end_symbols(text):
    """Split ``text`` into sentences, keeping each sentence's end symbol.

    Args:
        text: The string to split.

    Returns:
        A list of sentences. Each sentence keeps its terminating '.', '!' or
        '?' and any leading whitespace it had in ``text``. Text that follows
        the last terminator is returned as a final, unterminated sentence
        unless it is empty or whitespace only.
    """
    # With one capturing group re.split yields
    # [body, end, body, end, ..., remainder] -- always an odd number of items.
    parts = re.split(sentence_pattern, text)

    sentences = [body + end for body, end in zip(parts[0::2], parts[1::2])]

    # Keep the trailing fragment: without this, text lacking a final
    # terminator (or any terminator at all) would be silently discarded.
    remainder = parts[-1]
    if remainder.strip():
        sentences.append(remainder)

    return sentences

text = all_contents

sentences = split_sentences_with_end_symbols(text)

# Show a bounded preview only: printing every sentence of the corpus floods
# the cell output and bloats the saved notebook.
SENTENCE_PREVIEW_COUNT = 20
for sentence in sentences[:SENTENCE_PREVIEW_COUNT]:
    print(sentence)
print(f"... {len(sentences)} sentences in total")

In [14]:
import re

word_pattern = r'\b[\w\'-]+@[\w.]+\b|\b[\w\'-]+\b|\d+|[^\w\s]'
# Domain labels are matched one at a time ("label(.label)+") so that a
# sentence-final period is not swallowed into the address.
email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+'
# The country-specific patterns carry no leading "^": the tokenizer calls
# Pattern.match(sentence, index), and "^" only matches at the real start of
# the string, which would disable these alternatives everywhere else.
phone_ru_pattern = r"(?:\+7|8)(?:(?:-\d{3}-|\(\d{3}\))\d{3}-\d{2}-\d{2}|\d{10})"
phone_usa_pattern = r"(?:\+1)(?:(?:-\d{3}-|\(\d{3}\))\d{3}-\d{4}|\d{7})"
phone_china_pattern = r"(?:\+86)(?:(?:-\d{3}-|\(\d{3}\))\d{4}-\d{4}|\d{11})"
phone_pattern = r'(\+\d{1,3}\s?)?(?:\(\d{3}\)|\d{1,3})\s?-?\s?\d{3,4}\s?-?\s?\d{3,4}'
# Numerals such as "4th" or "2's". The trailing \b keeps "4thing" in one piece.
numeral_pattern = r"[0-9]+(?:th|'s)\b"
# Numeric dates separated by '.' or '/'.
dates_pattern = r'\d{1,3}[./]\d{1,4}[./]\d{1,4}'
times_pattern = r'\d{1,2}:\d{2}'

word_re = re.compile(word_pattern)
email_re = re.compile(email_pattern)
phone_re = re.compile(f"({phone_pattern})|({phone_ru_pattern})|({phone_usa_pattern})|({phone_china_pattern})")
numeral_re = re.compile(numeral_pattern)
dates_re = re.compile(dates_pattern)
times_re = re.compile(times_pattern)

# Matchers for "special" tokens, tried in this order before the generic word pattern.
_SPECIAL_TOKEN_RES = (email_re, phone_re, numeral_re, dates_re, times_re)
# A run of two or more identical dots or underscores.
_REPEATED_SEPARATOR_RE = re.compile(r'([._])\1+')


def tokenize_and_include_emails(sentence):
    """Split ``sentence`` into tokens, keeping special tokens in one piece.

    At every position the matchers are tried in a fixed order: e-mail address,
    phone number, numeral ("4th", "2's"), date, time, and finally the generic
    word/punctuation pattern. The first matcher that succeeds produces the
    token and the scan resumes right after it. Characters that nothing matches
    (whitespace) are skipped.

    Args:
        sentence: The string to tokenize.

    Returns:
        A list of token strings in order of appearance. Punctuation characters
        are returned as single-character tokens. Inside a word token, runs of
        repeated '.' or '_' are collapsed to one character; a single '.' or
        '_' is left untouched.
    """
    tokens = []
    index = 0
    length = len(sentence)
    while index < length:
        for special_re in _SPECIAL_TOKEN_RES:
            match = special_re.match(sentence, index)
            if match:
                tokens.append(match.group())
                index = match.end()
                break
        else:
            # No special token starts here: fall back to a word or a
            # punctuation character.
            word_match = word_re.match(sentence, index)
            if word_match:
                tokens.append(_REPEATED_SEPARATOR_RE.sub(r'\1', word_match.group()))
                index = word_match.end()
            else:
                index += 1

    return tokens


In [None]:


all_sentences_tokens_and_emails = [
    tokenize_and_include_emails(sentence) for sentence in sentences
]

# Show a bounded preview only: printing the tokens of every sentence floods
# the cell output and bloats the saved notebook.
TOKEN_PREVIEW_COUNT = 10
for i, tokens_and_emails in enumerate(all_sentences_tokens_and_emails[:TOKEN_PREVIEW_COUNT]):
    print(f"Sentence {i+1}:")
    print(tokens_and_emails)
    print()
print(f"... {len(all_sentences_tokens_and_emails)} tokenized sentences in total")



In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources requested by this notebook, then build the shared
# stemmer and lemmatizer instances used by the annotation helpers.
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def get_stub_stem(token):
    """Return the stem of ``token`` as produced by the module-level ``stemmer``."""
    return stemmer.stem(token)

def get_stub_lemma(token):
    """Return the lemma of ``token`` as produced by the module-level ``lemmatizer``."""
    return lemmatizer.lemmatize(token)

def create_annotation(sentences):
    """Build tab-separated annotation rows for every token of every sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of strings. Each token contributes one row of the form
        ``<sentence no>_<token no>\\t<token>\\t<stem>\\t<lemma>`` (both numbers
        start at 1), and an empty string follows the rows of each sentence.
    """
    rows = []
    for sentence_number, sentence in enumerate(sentences, start=1):
        sentence_tokens = tokenize_and_include_emails(sentence)
        for token_number, token in enumerate(sentence_tokens, start=1):
            stem = get_stub_stem(token)
            lemma = get_stub_lemma(token)
            rows.append(f"{sentence_number}_{token_number}\t{token}\t{stem}\t{lemma}")
        # Blank row marks the end of the sentence.
        rows.append("")
    return rows

annotations = create_annotation(sentences)

# Write with an explicit encoding: the platform default may be unable to
# encode characters present in the tokens and would then raise
# UnicodeEncodeError.
with open('annotations.tsv', 'w', encoding='utf-8') as file:
    file.write("\n".join(annotations))

# (Message is Russian for "Annotation saved to 'annotations.tsv'".)
print("Аннотация сохранена в 'annotations.tsv'")

In [17]:
# Remove a `TestTokenizeFunction` global if one exists -- presumably a test
# class left over from an earlier version of this notebook.
try:
    del globals()['TestTokenizeFunction']
except KeyError:
    pass

In [None]:
import unittest

class TestTokenize(unittest.TestCase):
    """Checks for tokenize_and_include_emails.

    The tokenizer returns every punctuation character as its own token, so the
    expected lists below include commas, colons and periods.
    """

    def test_basic_sentence(self):
        # E-mail address and phone number each stay in one piece.
        sentence = "Hello, my email is test@example.com and my phone is +79533668096."
        expected_tokens = ["Hello", ",", "my", "email", "is", "test@example.com", "and", "my", "phone", "is", "+79533668096", "."]
        self.assertEqual(tokenize_and_include_emails(sentence), expected_tokens)

    def test_dates_and_times(self):
        sentence = "The meeting is on 12/12/2023 at 10:30."
        expected_tokens = ["The", "meeting", "is", "on", "12/12/2023", "at", "10:30", "."]
        self.assertEqual(tokenize_and_include_emails(sentence), expected_tokens)

    def test_numerals(self):
        sentence = "I have 2's apples and 3 oranges."
        expected_tokens = ["I", "have", "2's", "apples", "and", "3", "oranges", "."]
        self.assertEqual(tokenize_and_include_emails(sentence), expected_tokens)

    def test_underscore_and_dot_removal(self):
        # A single '.' or '_' is kept as a token; only a repeated run inside a
        # word ("snake__case") is collapsed to one character.
        sentence = "This is  . an _ snake__case example."
        expected_tokens = ["This", "is", ".", "an", "_", "snake_case", "example", "."]
        self.assertEqual(tokenize_and_include_emails(sentence), expected_tokens)

    def test_combined(self):
        sentence = "Email: user@example.com, phone: +86 138 0013 8000, date: 01/01/2024, time: 14:30."
        expected_tokens = [
            "Email", ":", "user@example.com", ",",
            "phone", ":", "+86 138 0013 8000", ",",
            "date", ":", "01/01/2024", ",",
            "time", ":", "14:30", ".",
        ]
        self.assertEqual(tokenize_and_include_emails(sentence), expected_tokens)


# Run the test cases defined in this notebook. An explicit argv is passed so
# unittest does not try to parse the kernel's own command-line arguments (the
# first element stands in for the program name and is ignored), and
# exit=False stops unittest from calling sys.exit() when the run finishes.
if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)