# Русские статьи (картинки)

# Фаза 0 - Установка констант

## • Инициализация переменных

In [None]:
# Master switch: set to True to quickly turn off Tesseract and Hunspell
# (the three optional settings below then become None).
USE_DEFAULT_SETTINGS = False

if USE_DEFAULT_SETTINGS:
    TESERACT_PATH = None
    IMAGE_PATH = None
    HUNSPELL_PATH = None
else:
    # Tesseract executable path, or '<linux>' if Tesseract is installed as a
    # package (i.e. the `tesseract` command is available).
    # Windows example: D:/Software/Tesseract-OCR/tesseract.exe
    # Linux example:   /usr/bin/tesseract-ocr
    TESERACT_PATH = r'/usr/bin/tesseract-ocr'
    # Directory for images extracted from PDF and Word documents.
    IMAGE_PATH = r'extracted_images/'
    # Hunspell .dic and .aff files. Dictionaries can be downloaded from
    # https://github.com/LibreOffice/dictionaries/tree/master/ru_RU
    HUNSPELL_PATH = [
        "/usr/share/hunspell/ru_RU.dic",
        "/usr/share/hunspell/ru_RU.aff",
    ]

# --- Text directories ---
# Raw .docx and .pdf source files.
ARTICLES_PATH = './Articles/'
# Raw article paragraphs after conversion and splitting (.txt).
RAW_PARAGS = './txt_articles/Параграфы/'
# Fragments folder.
FRAGMENTS_PATH = './txt_articles/Фрагменты/'
# Cleaned and preprocessed paragraph texts.
CLEAN_FRAGMENTS = './txt_articles/Очистка_текста/'
# Tokenization output for the articles.
TOKENIZE_PATH = './txt_articles/Токенизация/'
# Lemmatization output for the articles.
LEMMATIZE_PATH = './txt_articles/Лемматизация/'

# --- Regex description files ---
# Text file with the regexes describing content to clear.
CLEAR_REGEX_PATH = './models/regxs_to_clear.txt'
# Text file with the regexes handed to the fragment-filtering step
# (presumably the patterns used to pick fragments; verify against that module).
FRAG_PICK_REGEX_PATH = './models/regs_to_search.txt'

# --- Models and derived artifacts ---
# Morphology and navec model archives.
MORPH_NAVEC = {
    "morph": 'models/slovnet_morph_news_v1.tar',
    "navec": 'models/navec_news_v1_1B_250K_300d_100q.tar',
}
# Word2Vec model file.
WORD_TO_VEC_MODEL_PATH = "./models/word2vec.model"
# Temporary combined-text file used during word2vec processing.
COMBINED_TEXT_PATH = "./models/train_text.txt"
# Functional triplets storage.
FUNC_TRIPLETS_PATH = "./models/functional_triplets.json"
# Hierarchical triplets storage.
HIER_TRIPLETS_PATH = "./models/hier_triplets.json"

import importlib

# Фаза 1 - Предобработка исходных файлов

## • Чтение файлов, разбиение на параграфы, OCR

In [None]:
# Phase 1, step 1 (per the notebook heading: read files, split into
# paragraphs, OCR).
import a_paragraph_processing

# Reload so that edits made to the module since it was first imported take
# effect without restarting the kernel.
importlib.reload(a_paragraph_processing)

a_paragraph_processing.mod_paragraph_processing(
    ARTICLES_PATH,   # raw .docx / .pdf files
    RAW_PARAGS,      # where the split paragraphs (.txt) go
    TESERACT_PATH,   # Tesseract location, or None with default settings
    IMAGE_PATH,      # extracted images folder, or None with default settings
)

## • Выделение фрагментов

In [None]:
# Phase 1, step 2 (per the notebook heading: fragment extraction).
import b_filter_fragments

# Reload so that edits made to the module since it was first imported take
# effect without restarting the kernel.
importlib.reload(b_filter_fragments)

b_filter_fragments.mod_filter_fragments(
    RAW_PARAGS,            # raw paragraphs folder
    FRAGMENTS_PATH,        # fragments folder
    FRAG_PICK_REGEX_PATH,  # regex description file for this step
)

## • Предобработка, spell-check для сканов

In [None]:
# Phase 1, step 3 (per the notebook heading: preprocessing and spell-check
# for scans).
import c_text_preprocessing

# Reload so that edits made to the module since it was first imported take
# effect without restarting the kernel.
importlib.reload(c_text_preprocessing)

c_text_preprocessing.mod_text_preprocessing(
    FRAGMENTS_PATH,    # fragments folder
    CLEAN_FRAGMENTS,   # cleaned / preprocessed texts folder
    CLEAR_REGEX_PATH,  # regex description file with content to clear
    HUNSPELL_PATH,     # [.dic, .aff] pair, or None with default settings
)

## • Токенизация, лемматизация

In [None]:
# Phase 1, step 4 (per the notebook heading: tokenization and lemmatization).
import d_models_creation

# Reload so that edits made to the module since it was first imported take
# effect without restarting the kernel.
importlib.reload(d_models_creation)

# NOTE: the first argument used to be CLEAN_PARAGS, a name that is never
# defined in this notebook (NameError at run time). The cleaned-text directory
# is CLEAN_FRAGMENTS, the constant the preprocessing step is given and the
# word2vec step is called with.
d_models_creation.mod_model_creation(
    CLEAN_FRAGMENTS,  # cleaned / preprocessed texts folder
    TOKENIZE_PATH,    # tokenization folder
    LEMMATIZE_PATH,   # lemmatization folder
    MORPH_NAVEC,      # morph + navec model archive paths
)

# Фаза 2 - Создание векторной модели

In [None]:
# Phase 2 (per the notebook heading: building the vector model).
import e_build_word2vec

# Reload so that edits made to the module since it was first imported take
# effect without restarting the kernel.
importlib.reload(e_build_word2vec)

e_build_word2vec.mod_build_word2vec(
    CLEAN_FRAGMENTS,         # cleaned / preprocessed texts folder
    WORD_TO_VEC_MODEL_PATH,  # Word2Vec model file
    COMBINED_TEXT_PATH,      # temporary combined-text file
    MORPH_NAVEC,             # morph + navec model archive paths
)

# Фаза 3 - Создание связей

## • Функциональные связи

In [None]:
# Phase 3, step 1 (per the notebook heading: functional relations).
import f_extract_functional_relations

# Reload so that edits made to the module since it was first imported take
# effect without restarting the kernel.
importlib.reload(f_extract_functional_relations)

f_extract_functional_relations.mod_extract_functional_relations(
    FRAGMENTS_PATH,      # fragments folder
    FUNC_TRIPLETS_PATH,  # functional triplets JSON file
)

## • Иерархические связи

In [None]:
# Phase 3, step 2 (per the notebook heading: hierarchical relations).
import g_extract_hierarchical_relations

# Reload so that edits made to the module since it was first imported take
# effect without restarting the kernel.
importlib.reload(g_extract_hierarchical_relations)

g_extract_hierarchical_relations.mod_extract_hierarchical_relations(
    FUNC_TRIPLETS_PATH,      # functional triplets JSON file
    HIER_TRIPLETS_PATH,      # hierarchical triplets JSON file
    WORD_TO_VEC_MODEL_PATH,  # Word2Vec model file
)