### Import

In [15]:
import os
import re
import subprocess

In [16]:
import inflect

In [17]:
from tqdm import tqdm

### Функції

In [18]:
def remove_paragraph(text_corpus):
    """Flatten a text onto a single line by turning line breaks into spaces.

    Every newline character is replaced with one space. Simply deleting the
    newline would glue the last word of a line to the first word of the next
    line, producing a bogus merged token.

    Args:
        text_corpus: The text to flatten, as one string.

    Returns:
        A new string in which each newline character has become a space.
    """
    # A space (not an empty string) keeps words on adjacent lines separate.
    new_text_corpus = text_corpus.replace('\n', ' ')
    return new_text_corpus

In [19]:
def to_lowercase(text_corpus):
    """Lowercase a text, processing it one space-separated piece at a time.

    The string is cut on single space characters, each piece is lowercased,
    and the pieces are joined back with single spaces, so the original
    spacing is kept as it was.

    Args:
        text_corpus: The text to convert, as one string.

    Returns:
        The lowercased text.
    """
    return ' '.join(piece.lower() for piece in text_corpus.split(' '))

In [20]:
def remove_punctuation(text_corpus):
    """Strip punctuation from a text and drop pieces that become empty.

    The string is cut on single space characters. In every piece, each
    character that is neither a word character nor whitespace is deleted.
    Pieces left empty are discarded and the rest are joined with single
    spaces.

    Args:
        text_corpus: The text to clean, as one string.

    Returns:
        The cleaned text.
    """
    stripped_pieces = (
        re.sub(r'[^\w\s]', '', piece) for piece in text_corpus.split(' ')
    )
    return ' '.join(piece for piece in stripped_pieces if piece != '')

In [21]:
def replace_numbers(text_corpus):
    """Replace standalone integers in a text with their spelled-out form.

    The string is cut on single space characters. A piece made up only of
    the ASCII digits 0-9 is passed to inflect's ``number_to_words`` and
    replaced by the result; every other piece is kept unchanged. The pieces
    are then joined back with single spaces.

    Args:
        text_corpus: The text to process, as one string.

    Returns:
        The text with digit-only pieces replaced by words.
    """
    words = text_corpus.split(' ')
    p = inflect.engine()
    new_words = []
    for word in words:
        # str.isdigit() is also true for characters such as superscript
        # digits, which are not plain integers; match ASCII digits explicitly
        # so only ordinary numbers reach the converter.
        if re.fullmatch(r'[0-9]+', word):
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return ' '.join(new_words)

In [22]:
# Directory that contains the nlp_uk checkout, and the lemmatizer script's
# path relative to that directory. Both are machine-specific defaults that
# callers can override through the keyword arguments of lemmatization().
SYSTEM_PATH = "/Users/oleksandrkosovan/Documents/GitHub/"
LEM_PATH = "nlp_uk/src/main/groovy/org/nlp_uk/tools/LemmatizeText.groovy"

def lemmatization(
    in_data_path, 
    in_file_name, 
    out_data_path, 
    out_file_name, 
    system_path=SYSTEM_PATH, 
    lem_path=LEM_PATH
):
    """Run the Groovy lemmatizer script on one file.

    Invokes ``groovy <system_path>/<lem_path> -i <input> -o <output>``.

    Args:
        in_data_path: Directory of the file handed to the script via ``-i``.
        in_file_name: Name of that file.
        out_data_path: Directory of the file handed to the script via ``-o``.
        out_file_name: Name of that file.
        system_path: Directory joined in front of ``lem_path``.
        lem_path: Path of the Groovy script relative to ``system_path``.

    Raises:
        FileNotFoundError: If the ``groovy`` executable cannot be found.
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    in_path = os.path.join(in_data_path, in_file_name)
    out_path = os.path.join(out_data_path, out_file_name)
    script_path = os.path.join(system_path, lem_path)

    # Pass the arguments as a list instead of a concatenated shell string so
    # that paths containing spaces or shell metacharacters are handed over
    # verbatim, and fail loudly when the lemmatizer reports an error.
    subprocess.run(
        ["groovy", script_path, "-i", in_path, "-o", out_path],
        check=True,
    )

In [23]:
def text_preparation(
    in_data_path,
    in_file_name,
    out_data_path,
    out_file_name,
):
    """Clean one text file and lemmatize the cleaned text.

    The input file is read and passed through remove_paragraph, to_lowercase,
    remove_punctuation and replace_numbers, in that order. The cleaned text
    is written to a staging file in the output directory, the lemmatizer is
    run on that staging file, and the staging file is removed afterwards.

    Args:
        in_data_path: Directory of the input file.
        in_file_name: Name of the input file.
        out_data_path: Directory for the lemmatized result.
        out_file_name: Name of the lemmatized result file.
    """
    # Explicit UTF-8 so reading and writing do not depend on the platform's
    # default encoding.
    with open(os.path.join(in_data_path, in_file_name), 'r', encoding='utf-8') as file:
        text = file.read()
    new_text = remove_paragraph(text)
    new_text = to_lowercase(new_text)
    new_text = remove_punctuation(new_text)
    new_text = replace_numbers(new_text)

    # The lemmatizer must consume the cleaned text. Feeding it the original
    # input and letting it overwrite the output file would throw away every
    # cleaning step above. A separate staging file is used so the lemmatizer
    # never reads and writes the same path.
    staged_file_name = out_file_name + '.prepared'
    staged_path = os.path.join(out_data_path, staged_file_name)
    with open(staged_path, 'w', encoding='utf-8') as out_put:
        out_put.write(new_text)

    try:
        lemmatization(
            out_data_path,
            staged_file_name,
            out_data_path,
            out_file_name
        )
    finally:
        # Never leave the staging file behind, even if lemmatization fails.
        os.remove(staged_path)

In [24]:
# Smoke test: process text.txt from the current working directory and
# put the result into file.txt in the same directory.
text_preparation('', 'text.txt', '', 'file.txt')

### Підготовка тексту

In [25]:
IN_DATA_PATH = 'reviews-data/negative/'
OUT_DATA_PATH = 'lem-data/negative/'

# Names already present in the output directory. An input file with the same
# name is skipped, so re-running this cell only processes what is missing.
checking_file_kist = os.listdir(OUT_DATA_PATH)

files_list = os.listdir(IN_DATA_PATH)
for file in tqdm(files_list):
    if file not in checking_file_kist:
        try:
            text_preparation(
                in_data_path=IN_DATA_PATH,
                in_file_name=file,
                out_data_path=OUT_DATA_PATH,
                out_file_name=file,
            )
        except Exception as error:
            # Catch Exception rather than everything, so an interrupt still
            # stops the loop, and say which file failed and why.
            print('some error', file, repr(error))

100%|██████████| 1923/1923 [00:00<00:00, 37075.42it/s]

some error





In [26]:
IN_DATA_PATH = 'reviews-data/positive/'
OUT_DATA_PATH = 'lem-data/positive/'

# Names already present in the output directory. An input file with the same
# name is skipped, so re-running this cell only processes what is missing.
checking_file_kist = os.listdir(OUT_DATA_PATH)

files_list = os.listdir(IN_DATA_PATH)
for file in tqdm(files_list):
    if file not in checking_file_kist:
        try:
            text_preparation(
                in_data_path=IN_DATA_PATH,
                in_file_name=file,
                out_data_path=OUT_DATA_PATH,
                out_file_name=file,
            )
        except Exception as error:
            # Catch Exception rather than everything, so an interrupt still
            # stops the loop, and say which file failed and why.
            print('some error', file, repr(error))

100%|██████████| 1113/1113 [00:00<00:00, 31153.72it/s]

some error



