In [1]:
import docx2txt
from docx import Document
import re
from IPython.display import display, clear_output
from itertools import product
from collections import OrderedDict
from collections import namedtuple
import pandas as pd

filepath_en  = './dataset/dataset/September_english_2020.docx'
filepath_fr = './dataset/dataset/September_french_2020.docx'

In [2]:
filenames = ['December', 'February', 'January', 'October', 'September']
filepath = './dataset/dataset/'
doc_en = Document(filepath_en)
doc_fr = Document(filepath_fr)
# document = Document()

In [3]:
def examine_doc(doc, show_temp=False):
    vocabs = []
    vocabs_ = []
    temp = []
    
    for paragraph in doc.paragraphs:
        if paragraph.text == '':
            vocabs.append(len(temp))
            vocabs_.append(temp)
            if show_temp: print(temp)
            temp = []
            continue
        temp.append(paragraph.text)
        
    vocabs = [paragraph for paragraph in vocabs if paragraph != 0]
    vocabs_ = [paragraph for paragraph in vocabs_ if len(paragraph) != 0]
    return vocabs,vocabs_

def process_doc(doc, show_temp=False):
    vocabs = []
    
    for paragraph in doc.paragraphs:
        if paragraph.text == '':
            if show_temp: print(paragraph.text)
            continue
        vocabs.append(str(paragraph.text).lower())
        
    vocabs = [paragraph for paragraph in vocabs if len(paragraph) != 0]
    return vocabs

def compare_docs(vocab_en, vocab_fr):
    results = []
    for idx, num in enumerate(vocab_en):
        temp = OrderedDict()
        temp['Index'] = idx
        temp["English"] = num
        temp['French'] = vocab_fr[idx]
        temp['Status'] = num == vocab_fr[idx]

        results.append(temp)

    df = pd.DataFrame.from_dict(results, orient='columns')
    clear_output(wait=True)
    display(df)

In [4]:
vocab_en_,_vocab_en_ = examine_doc(doc_en)
vocab_fr_,_vocab_fr_ = examine_doc(doc_fr)
compare_docs(vocab_en_, vocab_fr_)

Unnamed: 0,Index,English,French,Status
0,0,9,9,True
1,1,8,8,True
2,2,9,9,True
3,3,9,9,True
4,4,9,9,True
5,5,8,8,True
6,6,10,10,True
7,7,10,10,True
8,8,8,8,True
9,9,9,9,True


In [5]:
ref = 10
print(_vocab_en_[ref][0])
print(_vocab_fr_[ref][0])

MANIFESTING HIS VIRTUES AND EXCELLENCE
MANIFESTEZ SES VERTUS ET SON EXCELLENCE


In [6]:
vocab_en = process_doc(doc_en)
vocab_fr = process_doc(doc_fr)
print(len(vocab_en), len(vocab_fr))

264 264


In [7]:
filenames = ['December', 'February', 'January', 'October', 'September']
path = './dataset/dataset/'

vocab_en = []
vocab_fr = []
for filename in filenames:
    doc_en = Document(f"{path}{filename}_english_2020.docx")
    doc_fr = Document(f"{path}{filename}_french_2020.docx")
    
    vocab_en += process_doc(doc_en)
    vocab_fr += process_doc(doc_fr)

raw_data = {"eng": vocab_en, "fre": vocab_fr}
df = pd.DataFrame(raw_data, columns=['eng', 'fre'])
df.to_csv('./dataset/ror_data.csv', index=False)
df.head()

Unnamed: 0,eng,fre
0,the new heaven and the new earth,le nouveau ciel et la nouvelle terre
1,"for what shall it profit a man, if he shall ga...",et que sert-il à un homme de gagner tout le mo...
2,"w for what shall it profit a man, if he shall ...",lorsqu’on fait la publicité d’un nouveau produ...
3,this is how you should view this current world...,c’est ainsi que vous devriez voir le monde act...
4,the 23rd and 25th verses take it even further:...,les versets 23 et 25 vont plus en profondeur: ...


In [8]:
bible_data = pd.read_csv("./dataset/eng-fre-bible.csv")

In [9]:
bible_data["eng"] = bible_data['eng'].apply(str)
bible_data["fre"] = bible_data['fre'].apply(str)
bible_data.head()

Unnamed: 0,eng,fre
0,Genesis 1:1 In the beginning God created the ...,Genèse 1:1 Au commencement Dieu créa les cie...
1,"Genesis 1:2 And the earth was without form, a...",Genèse 1:2 Et la terre était désolation et v...
2,"Genesis 1:3 And God said, Let there be light:...",Genèse 1:3 Et Dieu dit: Que la lumière soit!...
3,"Genesis 1:4 And God saw the light, that [it w...","Genèse 1:4 Et Dieu vit la lumière, qu'elle é..."
4,"Genesis 1:5 And God called the light Day, and...",Genèse 1:5 Et Dieu appela la lumière Jour; e...


In [10]:
bible_data = bible_data.append(df)

In [11]:
bible_data.tail()

Unnamed: 0,eng,fre
1379,"when jesus heard the words of the centurion, t...",quand jésus a entendu les paroles du centurion...
1380,what’s your desire? in which area do you requi...,quel est votre désir? dans quel domaine avezvo...
1381,"even now, act on his word. he said in john 14:...","même à l’instant, agissez selon sa parole. il ..."
1382,"dear father, thank you for your precious word ...","cher père, merci pour ta précieuse parole qui ..."
1383,romans 10:17; 2 corinthians 4:13; hebrews 11:6,romains 10:17; 2 corinthiens 4:13; hébreux 11:6


In [12]:
gen_data = pd.read_csv("./dataset/train.csv")

In [13]:
gen_data = gen_data[:15000]

In [14]:
gen_data.head()

Unnamed: 0,English,French
0,"It does not bode at all well for Nigeria, at a...","Cela n'augure rien de bon pour le Nigeria, alo..."
1,But these are two parallel exercises.,Mais il s'agit de deux exercices parallèles.
2,They are post-mortem tests.,Il s'agit de tests post mortem.
3,"In addition, it would highlight the need for t...","En outre, elle souligne la nécessité de placer..."
4,"Just as important, however, is the need to saf...","Toutefois, la nécessité de garantir les servic..."


In [15]:
gen_data = gen_data.rename(columns={"English": "eng", "French": "fre"}, inplace=False)

In [16]:
gen_data["eng"] = gen_data['eng'].apply(str)
gen_data["fre"] = gen_data['fre'].apply(str)
gen_data.tail()

Unnamed: 0,eng,fre
14995,Active Euro-Mediterranean institutions have be...,Des institutions euro-méditerranéennes actives...
14996,"Mr President, my Group firmly supports this re...","Monsieur le Président, notre groupe soutient r..."
14997,We need a much deeper discussion around this p...,In faut une discussion bien plus approfondie s...
14998,This problem rarely affects small or medium-si...,Ce problème concerne rarement les petites et m...
14999,We are only a few weeks away from the Cancun c...,Quelques semaines nous séparent de la conféren...


In [17]:
bible_data = bible_data.append(gen_data)
bible_data.head()

Unnamed: 0,eng,fre
0,Genesis 1:1 In the beginning God created the ...,Genèse 1:1 Au commencement Dieu créa les cie...
1,"Genesis 1:2 And the earth was without form, a...",Genèse 1:2 Et la terre était désolation et v...
2,"Genesis 1:3 And God said, Let there be light:...",Genèse 1:3 Et Dieu dit: Que la lumière soit!...
3,"Genesis 1:4 And God saw the light, that [it w...","Genèse 1:4 Et Dieu vit la lumière, qu'elle é..."
4,"Genesis 1:5 And God called the light Day, and...",Genèse 1:5 Et Dieu appela la lumière Jour; e...


In [18]:
test_data = pd.read_csv("./dataset/train.csv")
test_data = test_data[20000:25000]

test_data = test_data.rename(columns={"English": "eng", "French": "fre"}, inplace=False)

test_data["eng"] = test_data['eng'].apply(str)
test_data["fre"] = test_data['fre'].apply(str)

test_data.head()

Unnamed: 0,eng,fre
20000,"Take for instance this year, which was an extr...","Prenons l'exemple de cette année, qui a été ex..."
20001,I must point out that we are proceeding to vot...,Je vous signale que nous procédons à des votes...
20002,The object of this is to enable us to eventual...,"En fin de compte, cette procédure a pour but d..."
20003,"rapporteur. - Mr President, I really do not kn...","rapporteur. - Monsieur le Président, je ne sai..."
20004,Integrated pollution prevention and control: i...,Émissions industrielles (prévention et réducti...


In [19]:
bible_data.to_csv("./final_data/eng_fre_train.csv")
bible_data.to_json("./final_data/eng_fre_train.json", orient='records', lines=True)

test_data.to_csv("./final_data/eng_fre_test.csv")
test_data.to_json("./final_data/eng_fre_test.json", orient='records', lines=True)

In [20]:
valid_data = pd.read_csv("./dataset/valid.csv")
valid_data = valid_data[:5000]

valid_data = valid_data.rename(columns={"English": "eng", "French": "fre"}, inplace=False)

valid_data["eng"] = valid_data['eng'].apply(str)
valid_data["fre"] = valid_data['fre'].apply(str)

valid_data.to_csv("./final_data/eng_fre_valid.csv")
valid_data.to_json("./final_data/eng_fre_valid.json", orient='records', lines=True)

In [23]:
df = pd.read_csv("./final_data/eng_fre_test.csv")
df[:10]

Unnamed: 0.1,Unnamed: 0,eng,fre
0,20000,"Take for instance this year, which was an extr...","Prenons l'exemple de cette année, qui a été ex..."
1,20001,I must point out that we are proceeding to vot...,Je vous signale que nous procédons à des votes...
2,20002,The object of this is to enable us to eventual...,"En fin de compte, cette procédure a pour but d..."
3,20003,"rapporteur. - Mr President, I really do not kn...","rapporteur. - Monsieur le Président, je ne sai..."
4,20004,Integrated pollution prevention and control: i...,Émissions industrielles (prévention et réducti...
5,20005,"Firstly, as a European Union, we should remain...","En tant qu'Union européenne, nous devons reste..."
6,20006,The safety of Christians living in Turkey and ...,La sécurité des chrétiens qui vivent en Turqui...
7,20007,Prior authorisation is the reason patients wen...,"L'autorisation préalable, c'est ce qui a pouss..."
8,20008,Hell on earth is the description of how Europe...,L'enfer sur terre décrit parfaitement les épre...
9,20009,The question of setting MRLs for certain bioci...,On s'est également mis d'accord sur la questio...
