## How to spoil a good sentence?
1. Locate the sentence with Genitive (or even posessive)
2. Change Genitive to a random case

БУДЕМ ЛОМАТЬ ПАДЕЖ ПРЯМОГО ОБЪЕКТА

In [1]:
import random
from collections import Counter
from razdel import tokenize
from typing import Any, Dict, List, Optional, Tuple, Union
from tqdm import tqdm
from pymorphy2 import MorphAnalyzer
from conllu import parse
from typing import List, Tuple, Union
import glob
morph = MorphAnalyzer()

In [2]:
class TextIlliteracy:
    """Class for tokenizing text and changing variant of grammatical category
       in words of chosen parts of speech"""
    _tokens: List[str]

    def __init__(self, text: str) -> None:
        """Initializes an object"""
        self._text = text
        self._tokens = []
    
    def get_original_text(self) -> str:
        return self._text

    def tokenize_text(self) -> List[str]:
        """Tokenizes text"""
        pass

    def spoil_text(self, gram: str, postag_list: List[str]) -> str:
        """Changes grammatical markers to random inside the choosen category
           in all words of choosen parts of speech,
           if this grammatical category is relevant for such POS"""
        pass

In [3]:
class TextIlliteracyRus(TextIlliteracy):
    """Class tokenize russian text and changes variant of grammatical category
       in words of chosen parts of speech."""
    
    _tokens: List[str]
    # https://pymorphy2.readthedocs.io/en/stable/user/grammemes.html#grammeme-docs
    
    __postags = ["NOUN", "ADJF", "ADJS", "COMP", "VERB", "INFN", "PRTF", "PRTS",
                 "GRND", "NUMR", "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL", "INTJ"]
    
    __grams = {"nmbr": ["sing", "plur"],
               "case": ["nomn", "gent", "datv", "accs", "ablt",
                        "loct", "voct", "gen2", "acc2", "loc2"],
               "anim": ["anim", "inan"],
               "gndr": ["masc", "femn", "neut", "ms-f"],
               "aspc": ["perf", "impf"],
               "trns": ["tran", "intr"],
               "pers": ["1per", "2per", "3per"],
               "tens": ["pres", "past", "futr"],
               "mood": ["indc", "impr"],
               "invl": ["incl", "excl"],
               "voic": ["actv", "pssv"]}

    def tokenize_text(self) -> List[str]:
        """For russian:
           tokenizes text"""
        if self._tokens != []:
            tokens = self._tokens
        else:
            tokens_with_boundaries = list(tokenize(self._text))
            # получили список токенов с границами
            tokens = [] # список токенов с пробелами в нужных местах
            prev_tok_end = 0
            for substring in tokens_with_boundaries:
                if substring.start != prev_tok_end:
                    tokens.append(" ")
                tokens.append(substring.text)
                prev_tok_end = substring.stop
            self._tokens = tokens
        return tokens
        
    def spoil_text(self, gram: str="nmbr", postag_list: List[str]=__postags, case: str="nomn") -> str:
        """For russian:
           changes grammatical markers to random inside the choosen category
           in all words of choosen parts of speech,
           if this grammatical category is relevant for such POS"""
        # берёт список частей речи и категорию,
        # которую у этих частей речи надо портить рандомными вариантами
        
        if self._tokens == []:
            self._tokens = self.tokenize_text()

        tokens = self._tokens

        changed_list = []
        for tok in tokens:
            tok_analysed = morph.parse(tok)[0]
            if 'NOUN' == tok_analysed.tag.POS and ('accs' in tok_analysed.tag or 'gent' in tok_analysed.tag):
                new_gram_val = random.choice(TextIlliteracyRus.__grams[gram])
                
                if new_gram_val in tok_analysed.tag:  # делает вероятнось повторения формы чуть меньше
                    new_gram_val = random.choice(TextIlliteracyRus.__grams[gram])

                if tok_analysed.inflect({new_gram_val}) is not None:
                    changed_tok = tok_analysed.inflect({new_gram_val}).word
                    if tok[0].isupper():
                        # чтобы при изменении буква оставалась заглавной в т.ч. для слова "Ма́трица"
                        changed_tok = changed_tok[0].upper()+changed_tok[1:]
                    changed_list.append(changed_tok)
                else:
                    changed_list.append(tok)
            else:
                changed_list.append(tok)
        changed_text = "".join(changed_list)
        return changed_text

In [40]:
t = TextIlliteracyRus("А я такое видел в репортаже с чемпионата по питью пива на скорость.")
t.spoil_text("case", "accs")

'А я такое видел в репортаже с чемпионатом по питью пиве на скорость.'

In [4]:
class MyIlliteracy(TextIlliteracy):
    """Class tokenize russian text and changes variant of grammatical category
       in words of chosen parts of speech."""
    
    _tokens: List[str]
    # https://pymorphy2.readthedocs.io/en/stable/user/grammemes.html#grammeme-docs
    
    spoiled_words_counter = 0
    __postags = ["NOUN", "ADJF", "ADJS", "COMP", "VERB", "INFN", "PRTF", "PRTS",
                 "GRND", "NUMR", "ADVB", "NPRO", "PRED", "PREP", "CONJ", "PRCL", "INTJ"]
    
    __grams = {"nmbr": ["sing", "plur"],
               "case": ["nomn", "gent", "datv", "accs", "ablt",
                        "loct", "voct", "gen2", "acc2", "loc2"],
               "anim": ["anim", "inan"],
               "gndr": ["masc", "femn", "neut", "ms-f"],
               "aspc": ["perf", "impf"],
               "trns": ["tran", "intr"],
               "pers": ["1per", "2per", "3per"],
               "tens": ["pres", "past", "futr"],
               "mood": ["indc", "impr"],
               "invl": ["incl", "excl"],
               "voic": ["actv", "pssv"]}

    def tokenize_text(self) -> List[str]:
        """For russian:
           tokenizes text"""
        if self._tokens != []:
            tokens = self._tokens
        else:
            tokens_with_boundaries = list(tokenize(self._text))
            # получили список токенов с границами
            tokens = [] # список токенов с пробелами в нужных местах
            prev_tok_end = 0
            for substring in tokens_with_boundaries:
                if substring.start != prev_tok_end:
                    tokens.append(" ")
                tokens.append(substring.text)
                prev_tok_end = substring.stop
            self._tokens = tokens
        tokens = [t for t in tokens if t!= " "]
        return tokens
        
    def spoil_text(self, tokens, places, gram: str="nmbr", postag_list: List[str]=__postags) -> str:
        """For russian:
           changes grammatical markers to random inside the choosen category
           in all words of choosen parts of speech,
           if this grammatical category is relevant for such POS"""
        # берёт список частей речи и категорию,
        # которую у этих частей речи надо портить рандомными вариантами
        
        changed_list = []
        for i in range(len(tokens)):
            tok = tokens[i]
            if i in places:
                oring_tok = tok
                tok_analysed = morph.parse(tok)[0]
                if tok_analysed.tag.POS in postag_list:
                    new_gram_val = random.choice(MyIlliteracy.__grams[gram])
                    
                    if new_gram_val in tok_analysed.tag:  # делает вероятнось повторения формы чуть меньше
                        new_gram_val = random.choice(MyIlliteracy.__grams[gram])

                    if tok_analysed.inflect({new_gram_val}) is not None:
                        changed_tok = tok_analysed.inflect({new_gram_val}).word
                        if tok[0].isupper():
                            # чтобы при изменении буква оставалась заглавной в т.ч. для слова "Ма́трица"
                            changed_tok = changed_tok[0].upper()+changed_tok[1:]
                        changed_list.append(changed_tok)
                        if oring_tok!= changed_tok:
                            self.spoiled_words_counter += 1
                    else:
                        changed_list.append(tok)
                    
            else:
                changed_list.append(tok)
        changed_text = " ".join(changed_list)
        return changed_text, self.spoiled_words_counter

In [5]:
def extract_sentences_and_obj_ids_with_conllu(file_path: str) -> List[Tuple[str, List[Union[int, float]]]]:
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read()

    # Parse the CoNLL-U file content
    sentences = parse(data)

    sentences_and_objs = []

    for sentence in sentences:
        words = [token['form'] for token in sentence]
        sentence_text = words

        obj_ids = [token['id']-1 for token in sentence if 'obj' in token['deprel']]
        sentences_and_objs.append((sentence_text, obj_ids))

    return sentences_and_objs


In [7]:
def spoil_data(sentence: tuple) -> str:
    text = sentence[0]
    obj_ids = sentence[1]
    TextIlliteracyRus(text)

In [6]:
files = []
files = glob.glob('Subtitles/tagged_texts/*/*.txt', recursive=True)

In [7]:
len(files)

7899

In [9]:
for file_path in tqdm(files):
    print(file_path)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read().split('\n\n')
        chunk_size = len(data)//10
        for i in range(10):
            chunk = data[i*chunk_size:(i+1)*chunk_size]
            with open(f'social/chunked_texts/{file_path.split("/")[-1].strip(".txt")}_{i}.txt', 'w', encoding='utf-8') as file:
                file.write('\n\n'.join(chunk))

  0%|          | 0/4 [00:00<?, ?it/s]

social/tagged_texts/vktexts.txt


 25%|██▌       | 1/4 [00:05<00:16,  5.48s/it]

social/tagged_texts/LiveJournalPostsandcommentsGICR.txt


 50%|█████     | 2/4 [00:19<00:21, 10.70s/it]

social/tagged_texts/fbtexts.txt


100%|██████████| 4/4 [00:21<00:00,  5.41s/it]

social/tagged_texts/twtexts.txt





In [10]:
files = []
files = glob.glob('social/chunked_texts/*.txt', recursive=True)

In [11]:
len(files)

40

In [8]:
normal_data = ""
spoiled_data = ""

In [10]:
spoiled_words_counter = 0
for file_path in tqdm(files):
    # print(file_path)
    result = extract_sentences_and_obj_ids_with_conllu(file_path)
    for r in result:
        m = MyIlliteracy(r[0])
        spoiled, count = m.spoil_text(r[0], r[1], gram='case')
        spoiled_words_counter += count
        normal = " ".join(r[0])
        spoiled_data += f"\n{spoiled}"
        normal_data += f"\n{normal}"

with open('spoiled_data_subtitles.txt', 'w', encoding='utf-8') as file:
    file.write(spoiled_data)
    spoiled_data = ""

with open('normal_data_subtitles.txt', 'w', encoding='utf-8') as file:
    file.write(normal_data)
    normal_data = ""
    
print(spoiled_words_counter)

100%|██████████| 7899/7899 [21:28<00:00,  6.13it/s]


746842


In [75]:
m = MyIlliteracy('Профильные комитеты Совета Федерации рекомендуют палате одобрить законопроект об изменении границ между Москвой и Московской областью.')


In [76]:
result = extract_sentences_and_obj_ids_with_conllu(file_path)

In [83]:
m.spoil_text(result[0][0], result[0][1], gram='case')

'Профильные комитеты Совета Федерации рекомендуют палате одобрить законопроекте об изменении границ между Москвой и Московской областью .'

In [84]:
" ".join(result[0][0])

'Профильные комитеты Совета Федерации рекомендуют палате одобрить законопроект об изменении границ между Москвой и Московской областью .'

In [51]:
m.tokenize_text().index('привлекательность')

20

In [69]:
# Example usage:
file_path = '/home/askatasuna/Documents/Diploma/diploma/data/Interfax/texts_tagged/business199005.txt'  # Replace with your actual file path
result = extract_sentences_and_obj_ids_with_conllu(file_path)
for sentence, obj_ids in result:
    print(f"Sentence: {sentence}\nObject IDs: {obj_ids}\n")

Sentence: ['Профильные', 'комитеты', 'Совета', 'Федерации', 'рекомендуют', 'палате', 'одобрить', 'законопроект', 'об', 'изменении', 'границ', 'между', 'Москвой', 'и', 'Московской', 'областью', '.']
Object IDs: [7]

Sentence: ['Как', 'отмечается', 'в', 'отзывах', 'комитетов', 'на', 'данный', 'законопроект', ',', 'изменения', 'границ', 'между', 'Москвой', 'и', 'Московской', 'областью', 'позволят', '"', 'повысить', 'инвестиционную', 'привлекательность', 'как', 'Москвы', ',', 'так', 'и', 'области', ',', 'что', 'крайне', 'важно', 'для', 'экономического', 'и', 'градостроительного', 'развития', '"', '.']
Object IDs: [20]

Sentence: ['Соглашение', 'об', 'изменении', 'границ', 'подписано', 'на', 'днях', 'мэром', 'Москвы', 'Сергеем', 'Собяниным', 'и', 'губернатором', 'Московской', 'области', 'Борисом', 'Громовым', '.']
Object IDs: []

Sentence: ['"', 'Изменение', 'границы', 'между', 'Москвой', 'и', 'областью', 'носит', 'характер', 'уточнения', ',', 'цель', 'которого', 'придать', 'юридический', '

In [22]:
normal_data += '\n\n'.join([r[0] for r in result])

In [9]:
files_to_spoil = glob.glob('/home/askatasuna/Documents/Diploma/splitted/*.txt')
files_to_spoil

['/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_az.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_av.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_bb.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_at.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_ad.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_bc.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_ao.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_ar.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_aa.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_ag.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_ap.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_ax.txt',
 '/home/askatasuna/Documents/Diploma/splitted/smaller_file_prefix_ab.txt',
 '/home/askatasuna/Docume

In [35]:
count = 0
for f in files_to_spoil:
    with open(f, 'r', encoding='utf-8') as file, open(f"/home/askatasuna/Documents/Diploma/splitted/{count}.txt", 'w', encoding='utf-8') as out_file:
        for line in tqdm(file.read().split('\n')):
            if line == '':
                out_file.write('\n')
            else:
                m = TextIlliteracyRus(line)
                spoiled = m.spoil_text("case", "accs")
                out_file.write(spoiled+'\n')
        count += 1

100%|██████████| 1050854/1050854 [29:33<00:00, 592.65it/s] 
100%|██████████| 723038/723038 [28:10<00:00, 427.69it/s]
100%|██████████| 1068723/1068723 [49:02<00:00, 363.19it/s]  
100%|██████████| 645610/645610 [26:27<00:00, 406.72it/s]
100%|██████████| 890998/890998 [27:22<00:00, 542.52it/s] 
100%|██████████| 1461608/1461608 [27:25<00:00, 888.47it/s] 
100%|██████████| 826197/826197 [48:51<00:00, 281.82it/s] 
100%|██████████| 647709/647709 [26:34<00:00, 406.23it/s]
100%|██████████| 864481/864481 [27:38<00:00, 521.36it/s]  
100%|██████████| 749935/749935 [26:50<00:00, 465.67it/s]
100%|██████████| 743207/743207 [48:10<00:00, 257.14it/s]   
100%|██████████| 796800/796800 [27:20<00:00, 485.68it/s] 
100%|██████████| 785212/785212 [30:30<00:00, 428.99it/s]   
100%|██████████| 845126/845126 [29:32<00:00, 476.83it/s] 
100%|██████████| 916179/916179 [28:03<00:00, 544.35it/s]  
100%|██████████| 657519/657519 [26:58<00:00, 406.35it/s]
100%|██████████| 763933/763933 [29:32<00:00, 431.05it/s]
100%|██

In [38]:
count = 0
for f in files_to_spoil:
    with open(f, 'r', encoding='utf-8') as file, open(f"/home/askatasuna/Documents/Diploma/splitted_normal/{count}.txt", 'w', encoding='utf-8') as out_file:
        out_file.write(file.read())
        count += 1

In [41]:
with open('preprocessed_ru_convers.txt', 'r', encoding='utf-8') as file:
    data_big = file.read()
    
with open('preprocessed_ru_convers_spoiled.txt', 'w', encoding='utf-8') as file:
    for line in tqdm(data_big.split('\n')):
        if line == '':
            file.write('\n')
        else:
            m = TextIlliteracyRus(line)
            m.spoil_text("case", "accs")
            file.write(spoiled+'\n')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x81 in position 1572864000: invalid start byte