# Функционал обработки текста

In [1]:
import pymorphy2

from spacy.lang import ru
from nltk.stem.snowball import SnowballStemmer 
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Russian stop words from the NLTK corpus; a set, so membership tests are cheap.
stop = set(stopwords.words('russian'))
# Characters removed from raw words before morphological analysis.
Unused_chars = [':', ',', '.', '-', '\n', '?', '!']
# Snowball stemmer configured for Russian.
stemmer = SnowballStemmer('russian')
# Shared pymorphy2 analyzer used for POS tags and normal forms.
morph = pymorphy2.MorphAnalyzer()


def tokenize(file_text):
    """Split text into word tokens without stop words or guillemets.

    The text is tokenized with NLTK's ``word_tokenize``. Tokens present in the
    module-level ``stop`` set are dropped first; the « and » characters are
    then removed from every remaining token.

    Args:
        file_text: The text to tokenize.

    Returns:
        A list of token strings. A token made up only of guillemets is kept
        as an empty string.
    """
    cleaned = []
    for token in word_tokenize(file_text):
        if token in stop:
            continue
        cleaned.append(token.replace("«", "").replace("»", ""))
    return cleaned


def prepare_text_for_lda(text):
    """Tokenize text and reduce each token to its normal form.

    Tokens come from ``tokenize``; those of length two or less are skipped.
    Each remaining token is passed through ``get_lemma`` and then replaced by
    the ``normal_form`` of its first pymorphy2 parse.

    Args:
        text: The text to process.

    Returns:
        A list of normalized token strings, in input order.
    """
    normalized = []
    for token in tokenize(text):
        if len(token) <= 2:
            continue
        lemma = get_lemma(token)
        normalized.append(morph.parse(lemma)[0].normal_form)
    return normalized

def get_lemma(word):
    """Return the WordNet base form of *word*, or *word* itself if there is none.

    Args:
        word: The word to look up with ``wn.morphy``.

    Returns:
        The result of ``wn.morphy(word)`` when it is not ``None``; otherwise
        the unchanged input word.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    

def get_lemma2(word):
    """Lemmatize *word* with a newly created NLTK ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikov\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Выделение имен и именованных сущностей.

In [None]:
# Одиночные вхождения имени, фамилии, отчества
from natasha import NamesExtractor


def Name_recognition(text):
    """Build the set of words that should be ignored for the given text.

    The set starts from a hard-coded list of words and is extended with the
    name parts that natasha's ``NamesExtractor`` finds in *text*. For every
    match, the ``first``, ``last`` and ``middle`` fields of ``match.fact`` are
    normalized with ``prepare_text_for_lda``; when normalization yields at
    least one token, the first token is added to the set.

    Args:
        text: The text to scan for personal names.

    Returns:
        A set of strings: the hard-coded words plus the normalized name parts.
    """
    custom_useless_words=['из-за','марка','ведь','патриарший','борменталь','филипп','кихот','санчо','антоний','сансон','шарик','мой','такой','другой','ваш','этот','сам','свой','тот','варенуха','степ','маргарита','иван','воланд','коровьев','пилат','николай','какой-то','свой','самый','быть','стать','это','что-то','весь','какой','мочь','никанор','который','ты','никакой','азазело','азазелло','берлиоз']
    ignored = set(custom_useless_words)
    extractor = NamesExtractor()
    for match in extractor(text):
        fact = match.fact
        # Same order as the fields are documented above: first, last, middle.
        for name_part in (fact.first, fact.last, fact.middle):
            if name_part is None:
                continue
            tokens = prepare_text_for_lda(name_part)
            if len(tokens) > 0:
                ignored.add(tokens[0])
    return ignored

# Выделение словосочетаний из текста

In [None]:
# Matching threshold.
# NOTE(review): the functions below take `decision_value` as a parameter, so
# this global looks like a convenience default - confirm how it is used.
decision_value = 0.75

# Characters removed from raw words before morphological analysis.
Unused_chars = [':', ',', '.', '-', '\n', '?', '!']

# Part-of-speech tags (compared against pymorphy2 `tag.POS`) that mark a
# boundary between logical constructs.
# Logical separators: prepositions, conjunctions, particles.
Logic_Sep = ['PREP', 'CONJ', 'PRCL']
# Participles (full and short) and gerunds.
Logic_AdVerb = ['PRTF', 'PRTS', 'GRND']
# Adjectives (full and short).
Logic_Adj = ['ADJF', 'ADJS']
# Finite verbs and infinitives.
verb = ['VERB', 'INFN']
# Every tag that may start a new logical construct.
Separators = [*Logic_Sep, *Logic_AdVerb, *verb, *Logic_Adj]

def logic_construct_extractor(string):
    """Split a sentence into consecutive word groups at separator words.

    Each whitespace-separated word is stripped of the characters listed in
    ``Unused_chars`` and tagged with the first pymorphy2 parse. A word whose
    part of speech is in ``Separators`` closes the current group and starts a
    new one, unless the current group is still empty or the previous word
    already started a group.

    Args:
        string: The sentence (or sentence fragment) to split.

    Returns:
        A list of strings. Every group is its cleaned words, each followed by
        a single space. The last group is always appended, so an input with
        no words yields ``['']``.
    """
    segments = []
    current_words = []      # cleaned words of the group being built
    previous_split = False  # True when the previous word started a new group
    for raw_word in string.split():
        cleaned = raw_word
        for char in Unused_chars:
            cleaned = cleaned.replace(char, '')
        pos = morph.parse(cleaned)[0].tag.POS
        starts_group = (
            pos in Separators and len(current_words) > 0 and not previous_split
        )
        if starts_group:
            segments.append(''.join(w + ' ' for w in current_words))
            current_words = [cleaned]
        else:
            current_words.append(cleaned)
        previous_split = starts_group
    segments.append(''.join(w + ' ' for w in current_words))
    return segments


def check_symbol_string(string,decision_value):
    """Tell whether enough words of a phrase match a known symbol.

    Every whitespace-separated word is normalized with
    ``prepare_text_for_lda``. A word counts as a hit when normalization yields
    a first token that is not in the global ``useless_set`` and that token is
    contained in (``in``) at least one element of the global ``symbols``.
    NOTE(review): for string elements ``in`` is a substring test - confirm
    that this is intended rather than equality.

    Args:
        string: The phrase to check.
        decision_value: Required hit ratio; the number of hits must reach
            ``decision_value * number_of_words``.

    Returns:
        ``True`` when the phrase has more than one word and the number of hits
        reaches the required amount, otherwise ``False``.
    """
    words = string.split()
    required = decision_value * len(words)
    hits = 0
    for word in words:
        tokens = prepare_text_for_lda(word)
        if len(tokens) == 0 or tokens[0] in useless_set:
            continue
        if any(tokens[0] in symbol for symbol in symbols):
            hits += 1
    return bool(hits >= required and len(words) > 1)
    
def check_file_symbol(filename,decision_value):
    """Collect the symbol-bearing word groups found in a lyrics text file.

    The file ``<filename>.txt`` inside the ``Lyrics`` directory is read as
    UTF-8, line by line. Lines of at most one character are skipped. Each
    remaining line is split on ``'.'``; every piece is divided into word
    groups by ``logic_construct_extractor`` and each group is tested with
    ``check_symbol_string``.

    Args:
        filename: Base name of the text file, without directory or extension.
        decision_value: Threshold forwarded to ``check_symbol_string``.

    Returns:
        A dict mapping each accepted word group to the sentence piece it came
        from. A group that occurs more than once keeps the last piece seen.

    Raises:
        OSError: If the file cannot be opened.
    """
    import os

    # os.path.join instead of a hard-coded backslash, so the lookup also
    # works on non-Windows systems.
    path = os.path.join('Lyrics', filename + '.txt')
    result = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            if len(line) <= 1:
                continue
            for string in line.split('.'):
                for logic_struct in logic_construct_extractor(string):
                    if check_symbol_string(logic_struct, decision_value):
                        result[logic_struct] = string
    return result


# Работа с CSV

In [None]:
import csv
import glob

# Scan every semicolon-delimited CSV file in Data/ and record the rows whose
# phrase column contains a known symbol.
# NOTE(review): `symbols`, `found_symbols`, `count` and `limit` are not
# defined in this cell; they are expected to exist already - confirm.
# NOTE(review): the files are opened with the platform default encoding -
# confirm that it matches the encoding of the CSV files.
for filename in glob.glob('Data//*.csv'):
    print(filename)
    with open(filename) as csvfile:
        input_file1 = csv.DictReader(csvfile, delimiter=";")
        input_file = list(input_file1)
    for row in input_file:
        limit += 1
        # NOTE(review): the total shown is len(symbols), not the row count.
        print('Обработано {} из {}'.format(limit, len(symbols)))
        line = row['Суперслово/словосочетание']
        tokens = prepare_text_for_lda(line)
        for symbol in symbols:
            if symbol in tokens:
                count += 1
                found_symbols.append({
                    'символ': symbol,
                    'словосочетание': row['Суперслово/словосочетание'],
                    'фрагмент': row['Фрагмент'],
                    'значение': row['Значение'],
                })
                # Only the first matching symbol is recorded for a row.
                break
print(count)
print('ready')

# Работа с DOC

In [None]:
from docx import Document
from docx.shared import RGBColor
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import pymorphy2

# Normalized words that are never highlighted as symbols.
custom_useless_words = [
    'патриарший', 'борменталь', 'филипп', 'кихот', 'санчо', 'антоний',
    'сансон', 'шарик', 'мой', 'такой', 'другой', 'ваш', 'этот', 'сам',
    'свой', 'тот', 'варенуха', 'степ', 'маргарита', 'иван', 'воланд',
    'коровьев', 'пилат', 'николай', 'какой-то', 'свой', 'самый', 'быть',
    'стать', 'это', 'что-то', 'весь', 'какой', 'мочь', 'никанор', 'который',
    'ты', 'никакой', 'азазело', 'азазелло', 'берлиоз',
]

#custom_useless_words=['патриарший']
def read_file_to_doc(filename):
    """Copy a lyrics text file into a .docx file with symbol words highlighted.

    The file ``<filename>.txt`` inside the ``Lyrics`` directory is read as
    UTF-8. Every line longer than one character becomes a paragraph and every
    whitespace-separated word becomes its own run. A run is highlighted when
    the first normalized token of its word equals an element of the global
    ``symbols`` and is not in ``Names_set``, ``Fam_set``, ``Lasts_set`` or
    ``custom_useless_words``. The document is saved as ``test1.docx`` in the
    ``Outputs`` directory and the number of highlighted runs is printed.

    NOTE(review): ``symbols``, ``Names_set``, ``Fam_set`` and ``Lasts_set`` are
    read as globals but are not defined in the visible part of the notebook -
    confirm that they are set before this function is called.

    Args:
        filename: Base name of the text file, without directory or extension.

    Raises:
        OSError: If the input file cannot be opened.
    """
    import os

    count_change = 0
    document = Document()
    # os.path.join instead of a hard-coded backslash, so the lookup also
    # works on non-Windows systems.
    with open(os.path.join('Lyrics', filename + '.txt'), encoding='utf-8') as f:
        for line in f:
            if len(line) <= 1:
                continue
            paragraph = document.add_paragraph('')
            for word in line.split():
                run = paragraph.add_run(word + ' ')
                tokens = prepare_text_for_lda(word)
                if len(tokens) == 0:
                    continue
                lemma = tokens[0]
                if (lemma in Names_set or lemma in Fam_set
                        or lemma in Lasts_set
                        or lemma in custom_useless_words):
                    continue
                if any(lemma == symbol for symbol in symbols):
                    # NOTE(review): 7 is presumably the yellow entry of
                    # docx.enum.text.WD_COLOR_INDEX - confirm.
                    run.font.highlight_color = 7
                    count_change += 1
    document.save(os.path.join('Outputs', 'test1.docx'))
    print(count_change)

# Export the 'Master' text from the Lyrics directory to the highlighted .docx.
read_file_to_doc('Master')

In [None]:
# Collect symbol-bearing word groups from the 'Chapter2' text, using 0.75 as
# the decision value; the result is a dict of word group -> source sentence.
some=check_file_symbol('Chapter2',0.75)

print('ready')