In [6]:
with open('EnglishBible.txt', mode='r', encoding='utf8') as file:
    english = file.read()

In [9]:
from tqdm import tqdm

In [11]:
def convert_bible(txt):
    lines = txt.split("\n")
    values = []
    for line in tqdm(lines):
        sp = line.split("||")
        values.append(sp)
        
    return values

In [16]:
eng_values = convert_bible(english)

100%|██████████| 31104/31104 [00:00<00:00, 372673.66it/s]


In [13]:
values[0]

['01O', '1', '1', 'In the beginning God created the heavens and the earth.']

In [14]:
with open('RussianBible.txt', mode='r', encoding='utf8') as file:
    russian = file.read()

In [15]:
rus_values = convert_bible(russian)

100%|██████████| 31103/31103 [00:00<00:00, 506943.96it/s]


In [25]:
rus_values[0][:3]

['01O', '1', '1']

In [17]:
len(eng_values)

31104

In [18]:
len(rus_values)

31103

In [45]:
rus_values[2]

['01O', '1', '3', 'И сказал Бог: да будет свет. И сталсвет.']

In [34]:
for i, s in enumerate(eng_values):
    if len(s) is not 4:
        print(s)
        print(i)

['']
31103


In [69]:
sentences = dict()
for s in tqdm(eng_values):
    if len(s) is not 4:
        continue
    sentences["-".join(s[:3])] = [s[3]]

100%|██████████| 31104/31104 [00:00<00:00, 445114.95it/s]


In [70]:
for s in tqdm(rus_values):
    if len(s) is not 4:
        continue
    sentences["-".join(s[:3])].append(s[3])

100%|██████████| 31103/31103 [00:00<00:00, 450976.89it/s]


In [44]:
list(sentences.items())[:3]

[('01O-1-1',
  ['In the beginning God created the heavens and the earth.',
   'В начале сотворил Бог небо и землю.']),
 ('01O-1-2',
  ['And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters',
   'Земля же была безвидна и пуста, и тьма над бездною, и Дух Божий носился над водою.']),
 ('01O-1-3',
  ['And God said, Let there be light: and there was light.',
   'И сказал Бог: да будет свет. И сталсвет.'])]

In [71]:
for triplet in sentences.items():
    if len(triplet[1]) is not 2:
        print(triplet[0], triplet[1])

64N-1-15 ['Peace [be] unto thee. The friends salute thee. Salute the friends by name. ']


In [72]:
sentences.pop("64N-1-15")

['Peace [be] unto thee. The friends salute thee. Salute the friends by name. ']

In [49]:
import pandas as pd

In [51]:
verse_names = []
russian = []
english = []
for triplet in tqdm(sentences.items()):
    verse_names.append(triplet[0])
    english.append(triplet[1][0])
    russian.append(triplet[1][1])

100%|██████████| 31102/31102 [00:00<00:00, 593143.53it/s]


In [52]:
len(verse_names), len(russian), len(english)

(31102, 31102, 31102)

In [53]:
verses = pd.DataFrame({
    'verse': verse_names,
    'english': english,
    'russian': russian
})

In [54]:
verses.head(10)

Unnamed: 0,verse,english,russian
0,01O-1-1,In the beginning God created the heavens and t...,В начале сотворил Бог небо и землю.
1,01O-1-2,And the earth was waste and void; and darkness...,"Земля же была безвидна и пуста, и тьма над без..."
2,01O-1-3,"And God said, Let there be light: and there wa...",И сказал Бог: да будет свет. И сталсвет.
3,01O-1-4,"And God saw the light, that it was good: and G...","И увидел Бог свет, что он хорош, и отделил Бог..."
4,01O-1-5,"And God called the light Day, and the darkness...","И назвал Бог свет днем, а тьму ночью. И был ве..."
5,01O-1-6,"And God said, Let there be a firmament in the ...","И сказал Бог: да будет твердь посреди воды, и ..."
6,01O-1-7,"And God made the firmament, and divided the wa...","И создал Бог твердь, и отделил воду, которая п..."
7,01O-1-8,And God called the firmament Heaven. And there...,"И назвал Бог твердь небом. И был вечер, и было..."
8,01O-1-9,"And God said, Let the waters under the heavens...","И сказал Бог: да соберется вода, которая под н..."
9,01O-1-10,And God called the dry land Earth; and the gat...,"И назвал Бог сушу землею, а собрание вод назва..."


In [55]:
verses.to_csv('verses.csv', index=False, encoding='utf8')

In [65]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/dslarionov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [77]:
list(sentences.items())[0][1][1]

'В начале сотворил Бог небо и землю.'

In [81]:
count = 0
for t in tqdm(sentences.items()):
    s_eng = nltk.sent_tokenize(t[1][0])
    s_rus = nltk.sent_tokenize(t[1][1])
    if len(s_eng) is len(s_rus):
        count += 1
        
print(f"Equal number os sentences in {count} verses")
print(f"Unequal in {len(sentences) - count}")

100%|██████████| 31102/31102 [00:03<00:00, 9762.42it/s]

Equal number os sentences in 27693 verses
Unequal in 3409





In [84]:
verse_names = []
ru_sentences = []
en_sentences = []
for t in tqdm(sentences.items()):
    s_eng = nltk.sent_tokenize(t[1][0])
    s_rus = nltk.sent_tokenize(t[1][1])
    if len(s_eng) is len(s_rus):
        for idx in range(len(s_eng)):
            verse_names.append(t[0])
            en_sentences.append(s_eng[idx])
            ru_sentences.append(s_rus[idx])
    else:
        verse_names.append(t[0])
        ru_sentences.append(t[1][1])
        en_sentences.append(t[1][0])

100%|██████████| 31102/31102 [00:03<00:00, 9625.57it/s]


In [85]:
sentences_df = pd.DataFrame({
    'verse_name': verse_names,
    'english': en_sentences,
    'russian': ru_sentences
})

In [86]:
sentences_df.head(10)

Unnamed: 0,verse_name,english,russian
0,01O-1-1,In the beginning God created the heavens and t...,В начале сотворил Бог небо и землю.
1,01O-1-2,And the earth was waste and void; and darkness...,"Земля же была безвидна и пуста, и тьма над без..."
2,01O-1-3,"And God said, Let there be light: and there wa...",И сказал Бог: да будет свет. И сталсвет.
3,01O-1-4,"And God saw the light, that it was good: and G...","И увидел Бог свет, что он хорош, и отделил Бог..."
4,01O-1-5,"And God called the light Day, and the darkness...","И назвал Бог свет днем, а тьму ночью."
5,01O-1-5,"And there was evening and there was morning, o...","И был вечер, и было утро: день один."
6,01O-1-6,"And God said, Let there be a firmament in the ...","И сказал Бог: да будет твердь посреди воды, и ..."
7,01O-1-7,"And God made the firmament, and divided the wa...","И создал Бог твердь, и отделил воду, которая п..."
8,01O-1-8,And God called the firmament Heaven.,И назвал Бог твердь небом.
9,01O-1-8,"And there was evening and there was morning, a...","И был вечер, и было утро: день второй."


In [87]:
sentences_df.to_csv('sentences.csv', index=False, encoding='utf8')