In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Parse RS3 files 
output:
 - ``data/file.edus``  - text file with edus from .rs3 - each line contains one edu
 - ``data/file.json``  - json file with du-pairs from gold trees. keys: ``['snippet_x', 'snippet_y', 'category_id']``

<div class="alert alert-block alert-warning">
<b>Note:</b> in the original RuRSTreebank dataset, some unescaped special symbols occur (>, <, &, etc.), which break the XML parser, as well as EDUs with punctuation marks at the beginning (this happens when brackets and dots/commas are separated by a space in the original text). The latest version of the corpus (at the time of this notebook's latest commit) has been corrected and dumped in <b>corpus/RuRsTreebank_full_v6_corrected.zip</b>
</div>

In [None]:
%%bash

# Unpack the corrected corpus archive in place, inside corpus/.
# NOTE(review): later cells read from corpus/RuRsTreebank_full_6/ — presumably
# the directory this archive extracts to; confirm.
cd corpus/
unzip RuRsTreebank_full_v6_corrected.zip

In [None]:
%%bash

# Create the output directory; -p keeps the cell re-runnable when data/ already exists.
mkdir -p data

# Run the .rs3 parser over each genre; the stdout of every run goes to a per-genre log.
python utils/parse_rs3.py corpus/RuRsTreebank_full_6/blogs/blogs_rs3/* > rst_blogs_parsing.log
python utils/parse_rs3.py corpus/RuRsTreebank_full_6/news1/news1_rs3/* > rst_news1_parsing.log
python utils/parse_rs3.py corpus/RuRsTreebank_full_6/news2/news2_rs3/* > rst_news2_parsing.log

# Optional: scientific texts (uncomment to include them).
# NOTE(review): these two paths used to point at RuRsTreebank_full_5, while every
# other cell reads RuRsTreebank_full_6; aligned here — confirm the *_rs3 folders exist.
#python utils/parse_rs3.py corpus/RuRsTreebank_full_6/sci_comp/sci_comp_rs3/* > rst_scicomp_parsing.log
#python utils/parse_rs3.py corpus/RuRsTreebank_full_6/sci_ling/sci_ling_rs3/* > rst_sciling_parsing.log

## Annotate the texts with isanlp 
output:
 - ``data/file.annot.pkl``  - pickled isanlp annotation (morphology, syntax, semantics) of the text

In [None]:
%%bash

pip install -U git+https://github.com/IINemo/isanlp.git@discourse

In [None]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
from isanlp.ru.processor_mystem import ProcessorMystem

# Host of the remote processor used by the first stage.
# NOTE(review): left empty here — presumably it has to be set to the address of
# a running UDPipe service before the pipeline can be called; confirm.
host_udpipe = ''

# Each stage is a triple: (processor, names of its inputs, mapping from the
# processor's output names to the keys used in the resulting annotation).
udpipe_stage = (
    ProcessorRemote(host_udpipe, 3344, '0'),
    ['text'],
    dict(sentences='sentences',
         tokens='tokens',
         lemma='lemma',
         syntax_dep_tree='syntax_dep_tree',
         postag='ud_postag'),
)

mystem_stage = (
    ProcessorMystem(delay_init=False),
    ['tokens', 'sentences'],
    dict(postag='postag'),
)

converter_stage = (
    ConverterMystemToUd(),
    ['postag'],
    dict(morph='morph', postag='postag'),
)

# Stages run in the listed order; the converter consumes the 'postag' result
# produced by the Mystem stage.
ppl = PipelineCommon([udpipe_stage, mystem_stage, converter_stage])

In [None]:
import glob
import os
import pickle

from tqdm.autonotebook import tqdm
from utils.file_reading import _prepare_text as prepare_text

# Folders with the plain-text version of every document to annotate.
directories = ['corpus/RuRsTreebank_full_6/blogs/blogs_txt/',
               'corpus/RuRsTreebank_full_6/news1/news1_txt/',
               'corpus/RuRsTreebank_full_6/news2/news2_txt/'
               ]

# The annotations are written to data/; create it if the parsing cell was skipped.
os.makedirs('data', exist_ok=True)

for path in directories:
    print('analyze path:', path)
    for file in tqdm(glob.glob(f'{path}*.txt')):
        # Context managers close each handle right away instead of leaking one
        # open file per document for the duration of the loop.
        # NOTE(review): the text is read with the platform default encoding.
        with open(file, 'r') as source:
            text = prepare_text(source.read())
        annot = ppl(text)
        # basename/splitext only touch the real extension and work with any
        # path separator, unlike split('/') + replace('.txt', ...).
        filename = os.path.splitext(os.path.basename(file))[0] + '.annot.pkl'
        with open(os.path.join('data', filename), 'wb') as target:
            pickle.dump(annot, target)


(Optional) Annotate the science texts (`sci_comp`, `sci_ling`) as well

In [None]:
import glob
import os
import pickle

from tqdm.autonotebook import tqdm
from utils.file_reading import _prepare_text as prepare_text

# Folders with the plain-text version of the scientific documents.
directories = ['corpus/RuRsTreebank_full_6/sci_comp/sci_comp_txt/',
               'corpus/RuRsTreebank_full_6/sci_ling/sci_ling_txt/',
               ]

# The annotations are written to data/; create it if the parsing cell was skipped.
os.makedirs('data', exist_ok=True)

for path in directories:
    print('analyze path:', path)
    for file in tqdm(glob.glob(f'{path}*.txt')):
        # Context managers close each handle right away instead of leaking one
        # open file per document for the duration of the loop.
        # NOTE(review): the text is read with the platform default encoding.
        with open(file, 'r') as source:
            text = source.read()
        # A line break preceded by two spaces is turned into the '#####' marker;
        # every remaining line break becomes a single space.
        text = text.replace('  \n', '#####').replace('\n', ' ')
        text = prepare_text(text)
        annot = ppl(text)
        # basename/splitext only touch the real extension and work with any
        # path separator, unlike split('/') + replace('.txt', ...).
        filename = os.path.splitext(os.path.basename(file))[0] + '.annot.pkl'
        with open(os.path.join('data', filename), 'wb') as target:
            pickle.dump(annot, target)


## Gold trees
### Extract features 
output:
 - models/tf_idf/pipeline.pkl  # is used in default feature extraction
 - file.gold.pkl  # dataset with extracted default features for gold trees

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import glob
import pickle
import numpy as np
import pandas as pd
import nltk

from utils.file_reading import read_annotation


IN_PATH = 'data/'
! mkdir models
! mkdir models/tf_idf

corpus = []
for file in glob.glob("%s*.json" % IN_PATH):
    tokens = read_annotation(file.replace('.json', ''))['tokens']
    corpus.append(list(map(lambda token: token.text.lower(), tokens)))

    
from utils.count_vectorizer import MyCountVectorizer
count_vect = MyCountVectorizer(ngram_range=(1, 2), tokenizer=MyCountVectorizer.dummy, preprocessor=MyCountVectorizer.dummy)

svd = TruncatedSVD(n_components=25,
                   tol=0.0,
                   n_iter=7,
                   random_state=42)

pipeline = Pipeline([
    ('vect', count_vect),
    ('svd', svd)
])

pipeline.fit(corpus)
pickle.dump(pipeline, open('models/tf_idf/pipeline.pkl', 'wb'))

In [None]:
%%bash

# Download the NLTK 'stopwords' resource.
python -c "import nltk; nltk.download('stopwords')"
# Install dostoevsky and fetch its fasttext social-network model.
pip install dostoevsky
dostoevsky download fasttext-social-network-model

In [None]:
# Copy features_processor_variables.py from ../isanlp_rst/utils/ into the local utils/ directory.
! cp ../isanlp_rst/utils/features_processor_variables.py utils/features_processor_variables.py

In [None]:
! pip install "scikit_learn==0.22.2.post1"

In [None]:
import pandas as pd
import numpy as np
from utils.print_tree import printBTree

import sys

# Make the parent directories importable so that the isanlp_rst package below
# resolves. Each entry is added only once: the original appended '../../' twice
# and grew sys.path again on every re-run of the cell.
for parent_dir in ('../', '../../'):
    if parent_dir not in sys.path:
        sys.path.append(parent_dir)

from isanlp_rst.src.isanlp_rst.features_processor_default import FeaturesProcessor

# Feature extractor used for the gold trees; it is pointed at the local models/ directory.
features_processor = FeaturesProcessor(model_dir_path='models', verbose=0)

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm
from utils.file_reading import read_gold, read_annotation


IN_PATH = 'data/'

# Annotation layers handed to the feature processor, in positional order.
ANNOTATION_KEYS = ('text', 'tokens', 'sentences', 'lemma',
                   'morph', 'ud_postag', 'syntax_dep_tree')

gold_files = glob.glob(IN_PATH + '*.json')

for file in tqdm(gold_files):
    # read_gold / read_annotation take the path without the '.json' suffix.
    stem = file.replace('.json', '')

    table = read_gold(stem)
    # Drop pairs with an empty left snippet first, then those with an empty right one.
    for column in ('snippet_x', 'snippet_y'):
        table = table[table[column].map(len) > 0]

    annot = read_annotation(stem)
    layers = [annot[key] for key in ANNOTATION_KEYS]
    features = features_processor(table, *layers)

    # The result is stored next to the source file as <name>.gold.pkl.
    features.to_pickle(file.replace('.json', '.gold.pkl'))