<h1>Beyond Spell-Checking: Word-Checking</h1>
<h2>An attention based Transformer approach</h2>

<h2> Abdurrahman Shahid</h2>
<h3>L3 MIASHS - SC, University of Lille, France</h3>
<h3>Study and Research Work - Travaux d’Etude et de Recherche (TER) </h3>

In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from datetime import datetime

In [2]:
def concatfile(inp_dir, out_dir, year_from=2007, year_to=2019, lower=True):
    
    years = [year for year in range(year_from,year_to+1)]
    print("Years considered: {}\n".format(years))
    
    filename_out = "NewsConcat_{}-{}".format(year_from,year_to)
    if lower:
        filename_out = filename_out + "_lowercase"
    filename_out = filename_out + ".txt"
    outputfile = open(os.path.join(out_dir,filename_out), "w", encoding="utf-8")
    
    start_time = datetime.now()
    
    lines_count = {}
    for year in years:
        filename = "news."+str(year)+".fr.shuffled.txt"
        filepath = os.path.join(inp_dir, filename)
        file = open(filepath, "r", encoding="utf-8" )
        print("start processing {}".format(filename))
        stop = False
        count_lines = 0
        while not stop:
            line = file.readline()
            if lower:
                line = line.lower()
            count_lines += 1
            outputfile.write(line)
            if not line:
                stop = True
                print("finished processing {}".format(filename))
                file.close()
                lines_count[filename] = count_lines
    outputfile.close()
    end_time = datetime.now()
    delta = end_time - start_time
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("\nFile created with success (~{} {}): \n{}\n".format(d, d_unit, filename_out))
    
    for k in list(lines_count.keys()):
        print("{}: {} lines".format(k,lines_count[k]))
    print("Total lines in new file: {}".format(sum(list(lines_count.values()))))

In [3]:
concatfile("Raw_Data","Clean_Data", year_from=2007, year_to=2012, lower=True)

Years considered: [2007, 2008, 2009, 2010, 2011, 2012]

start processing news.2007.fr.shuffled.txt
finished processing news.2007.fr.shuffled.txt
start processing news.2008.fr.shuffled.txt
finished processing news.2008.fr.shuffled.txt
start processing news.2009.fr.shuffled.txt
finished processing news.2009.fr.shuffled.txt
start processing news.2010.fr.shuffled.txt
finished processing news.2010.fr.shuffled.txt
start processing news.2011.fr.shuffled.txt
finished processing news.2011.fr.shuffled.txt
start processing news.2012.fr.shuffled.txt
finished processing news.2012.fr.shuffled.txt

File created with success (~1 minutes): 
NewsConcat_2007-2012_lowercase.txt

news.2007.fr.shuffled.txt: 118959 lines
news.2008.fr.shuffled.txt: 4718842 lines
news.2009.fr.shuffled.txt: 4366673 lines
news.2010.fr.shuffled.txt: 1846494 lines
news.2011.fr.shuffled.txt: 6030153 lines
news.2012.fr.shuffled.txt: 4337029 lines
Total lines in new file: 21418150


In [4]:
homophones = [
    ("a","à"),
    ("est","et"),
    ("ces","ses"),
    ("ce","se"),
    ("ou","où"),
    ("la","là"),
    ("tout","tous"),
    ("leur","leurs"),
    ("ceux","ce"),
    ("cette","cet")
]

In [5]:
def limit_sentences_length(homophones, inp_dir, out_dir, filename_inp, lmin, lmax):
    """Preprocess to keep sentences with length within lmin and lmax and containing at least one of the homophone
       present in input=homophones"""
    
    h_liste = []
    for a, b in homophones:
        if a not in h_liste:
            h_liste.append(a)
        if b not in h_liste:
            h_liste.append(b)
     
    file = open(os.path.join(inp_dir, filename_inp), "r", encoding="utf-8")
    filename_out = filename_inp[:-4] + "_limit_{}-{}w.txt".format(lmin, lmax)
    file_out = open(os.path.join(out_dir, filename_out), "w", encoding="utf-8")
    
    start = datetime.now()
    lines_nb = 0
    stop = False
    while not stop:
        line = file.readline()
        to_w = False
        len_l = len(line.split(' '))
        if len_l>= lmin and len_l <= lmax:
            for w in h_liste:
                target = " "+ w +" "
                if target in line:
                    to_w = True
            if to_w:
                file_out.write(line)
                lines_nb += 1
        
        if not line:
            stop = True
            file.close()
            file_out.close()
    
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("\nFile created with success ({} lines in new file, ~{} {}): \n{}\n".format(
                                                                                    lines_nb, d, d_unit, filename_out))

In [6]:
limit_sentences_length(homophones,"Clean_Data","Clean_Data","NewsConcat_2007-2012_lowercase.txt",5,20)


File created with success (7495691 lines in new file, ~1 minutes): 
NewsConcat_2007-2012_lowercase_limit_5-20w.txt



In [7]:
def get_location(homophones, inp_dir, filename_data, save=False, savename=False):
    """Returns all line numbers for each homophone that appears in filename_data.
    output: dictionary containing for each homophone a list of line numbers where this homophone occurs.
    """
    start = datetime.now()
    dataset = open(os.path.join(inp_dir, filename_data), 'r', encoding="utf-8")
    
    out_dic = {}
    h_liste = []
    for a, b in homophones:
        if a not in h_liste:
            h_liste.append(a)
        if b not in h_liste:
            h_liste.append(b)
    
    for h in h_liste:
        out_dic[h] = []
        
    stop = False
    line_num = 0
    while not stop:
        line = dataset.readline()
        line_num +=1
        for h in h_liste:
            target = " " + h + " "
            if target in line:
                out_dic[h].append(line_num)
        if not line:
            stop = True
            
    dataset.close()
    
    if save:
        if savename:
            outputname = savename + ".pickle"
            pickle.dump(out_dic, open(outputname, "wb"))
        if not savename:
            outputname = "homophones_location-"+filename_data[:-4] + ".pickle"
            pickle.dump(out_dic, open(outputname, "wb"))
        end = datetime.now()
        delta = end - start
        delta_minutes = delta.seconds // 60
        d = delta_minutes
        d_unit = "minutes"
        if delta_minutes == 0:
            d = delta.seconds
            d_unit = "seconds"
        print("\nDictionary created with success (~{} {}): \n{}\n".format(d, d_unit, outputname))
    if not save:
        end = datetime.now()
        delta = end - start
        delta_minutes = delta.seconds // 60
        d = delta_minutes
        d_unit = "minutes"
        if delta_minutes == 0:
            d = delta.seconds
            d_unit = "seconds"
        print("\nget_location function executed successfully (~{} {})\n".format(d, d_unit))
    return out_dic

In [8]:
location_dict = get_location(homophones, "Clean_Data", "NewsConcat_2007-2012_lowercase_limit_5-20w.txt", save=False, savename=False)


get_location function executed successfully (~39 seconds)



In [9]:
def sentence_number(sets, printmes=True, ret=False):
    h_rank = []
    for key in list(sets.keys()):
        num_sentence = len(sets[key])
        h_rank.append((num_sentence, key))
        if printmes:
            print("homophone: {}, number of sentences: {} (~{}k)".format(key,num_sentence, round(num_sentence/10**3)))
    
    if ret:
        h_rank.sort()
        return h_rank

In [10]:
h_rank = sentence_number(location_dict, True, False)

homophone: a, number of sentences: 1463794 (~1464k)
homophone: à, number of sentences: 2471098 (~2471k)
homophone: est, number of sentences: 900333 (~900k)
homophone: et, number of sentences: 1790874 (~1791k)
homophone: ces, number of sentences: 157747 (~158k)
homophone: ses, number of sentences: 257173 (~257k)
homophone: ce, number of sentences: 493528 (~494k)
homophone: se, number of sentences: 568760 (~569k)
homophone: ou, number of sentences: 195584 (~196k)
homophone: où, number of sentences: 79319 (~79k)
homophone: la, number of sentences: 2757402 (~2757k)
homophone: là, number of sentences: 33093 (~33k)
homophone: tout, number of sentences: 204412 (~204k)
homophone: tous, number of sentences: 101203 (~101k)
homophone: leur, number of sentences: 208740 (~209k)
homophone: leurs, number of sentences: 97720 (~98k)
homophone: ceux, number of sentences: 27904 (~28k)
homophone: cette, number of sentences: 292243 (~292k)
homophone: cet, number of sentences: 55014 (~55k)


In [11]:
def create_file_for_tokenizer(homophones, inp_dir, inp_file, num_h = 25000):
    file = open(os.path.join(inp_dir, inp_file), "r", encoding="utf-8")
    
    
    start = datetime.now()
    
    h_dic = {}
    for a, b in homophones:
        if a not in list(h_dic.keys()):
            h_dic[a] = 0
        if b not in list(h_dic.keys()):
            h_dic[b] = 0
    
    filename_out = "file_train_tokenizer-{}lines.txt".format(int(len(h_dic.keys())*num_h))
    file_out = open(filename_out, "w", encoding="utf-8")
    
    counter=0
    stop = False
    while not stop:
        line = file.readline()
        for w in list(h_dic.keys()):
            target = " "+ w +" "
            if target in line and h_dic[w] < num_h:
                file_out.write(line)
                h_dic[w] += 1
                counter += 1
                break
                
        if len(h_dic.keys())*num_h == counter or (not line):
            stop = True
            file.close()
            file_out.close()
    
    
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("\nFile created with success ({} lines in new file, ~{} {}): \n{}\n".format(counter, d, d_unit, filename_out))
    print("{}".format(h_dic))

In [12]:
create_file_for_tokenizer(homophones,"Clean_Data","NewsConcat_2007-2012_lowercase_limit_5-20w.txt",20000)


File created with success (380000 lines in new file, ~33 seconds): 
file_train_tokenizer-380000lines.txt

{'a': 20000, 'à': 20000, 'est': 20000, 'et': 20000, 'ces': 20000, 'ses': 20000, 'ce': 20000, 'se': 20000, 'ou': 20000, 'où': 20000, 'la': 20000, 'là': 20000, 'tout': 20000, 'tous': 20000, 'leur': 20000, 'leurs': 20000, 'ceux': 20000, 'cette': 20000, 'cet': 20000}


In [13]:
def init_tokenizer(filename_data, vocab_size, save=False):
    
    start = datetime.now()
    
    filename_data = os.path.join(filename_data)
    dataset = tf.data.TextLineDataset(filename_data)
    tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        (line.numpy().decode("utf-8") for line in dataset), vocab_size)
    
    if save:
        backup_name = "tokenizer_{}_{}".format(tokenizer.vocab_size,filename_data)
        tokenizer.save_to_file(backup_name)
    
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    if save:
        print("\nTokenizer created with success (~{} {}):\n{}\n".format(d, d_unit, backup_name))
    if not save:
        print("\nTokenizer created with success (~{} {}).".format(d, d_unit))
    return tokenizer

In [14]:
init_tokenizer("file_train_tokenizer-380000lines.txt", 1024, save=True)


Tokenizer created with success (~1 minutes):
tokenizer_1034_file_train_tokenizer-380000lines.txt



<SubwordTextEncoder vocab_size=1034>

In [15]:
tokenizer_1024 = tfds.features.text.SubwordTextEncoder.load_from_file(
                                                            "tokenizer_1034_file_train_tokenizer-380000lines.txt")

In [16]:
def select_clean_data(inp_dir, inp_file, tokenizer, limit):
    dirtydata = open(os.path.join(inp_dir, inp_file), "r", encoding="utf-8")
    output_name = "Selected_data" + str("-vs") + str(tokenizer.vocab_size) + str("-toklim") + str(limit) + ".txt"
    cleandata = open(os.path.join(inp_dir, output_name), "w", encoding="utf-8")
    
    start = datetime.now()
    
    lines_nb = 0
    write_nb = 0
    stop = False
    while not stop:
        line = dirtydata.readline()
        lines_nb += 1
        if lines_nb % 1000000 == 0:
            print("lines treated until now: {} ({} Million)".format(lines_nb, round(lines_nb//1000000)))
        len_line = len(tokenizer.encode(line))
        if len_line <= limit:
            cleandata.write(line)
            write_nb += 1
        if not line:
            stop = True
    
    dirtydata.close()
    cleandata.close()
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("\nFile created with success ({} lines in new file, ~{} {}): \n{}\n".format(
                                                                                    write_nb, d, d_unit, output_name))
    print("Total number of lines treated: {} (~{} M)".format(lines_nb, lines_nb//1000000))
    print("Total number of lines kept: {} (~{} M)".format(write_nb, write_nb//1000000))

In [17]:
select_clean_data("Clean_Data","NewsConcat_2007-2012_lowercase_limit_5-20w.txt", tokenizer_1024, 30)

lines treated until now: 1000000 (1 Million)
lines treated until now: 2000000 (2 Million)
lines treated until now: 3000000 (3 Million)
lines treated until now: 4000000 (4 Million)
lines treated until now: 5000000 (5 Million)
lines treated until now: 6000000 (6 Million)
lines treated until now: 7000000 (7 Million)

File created with success (2787248 lines in new file, ~4 minutes): 
Selected_data-vs1034-toklim30.txt

Total number of lines treated: 7495692 (~7 M)
Total number of lines kept: 2787248 (~2 M)


In [18]:
location_dict = get_location(homophones, "Clean_Data", "Selected_data-vs1034-toklim30.txt", save=False, savename=False)


get_location function executed successfully (~12 seconds)



In [19]:
h_rank = sentence_number(location_dict, True, False)

homophone: a, number of sentences: 455567 (~456k)
homophone: à, number of sentences: 788817 (~789k)
homophone: est, number of sentences: 368907 (~369k)
homophone: et, number of sentences: 452475 (~452k)
homophone: ces, number of sentences: 46899 (~47k)
homophone: ses, number of sentences: 76241 (~76k)
homophone: ce, number of sentences: 175835 (~176k)
homophone: se, number of sentences: 206140 (~206k)
homophone: ou, number of sentences: 49183 (~49k)
homophone: où, number of sentences: 18021 (~18k)
homophone: la, number of sentences: 875422 (~875k)
homophone: là, number of sentences: 15906 (~16k)
homophone: tout, number of sentences: 87980 (~88k)
homophone: tous, number of sentences: 38460 (~38k)
homophone: leur, number of sentences: 63657 (~64k)
homophone: leurs, number of sentences: 25830 (~26k)
homophone: ceux, number of sentences: 7737 (~8k)
homophone: cette, number of sentences: 93773 (~94k)
homophone: cet, number of sentences: 17350 (~17k)


In [20]:
create_file_for_tokenizer(homophones,"Clean_Data","Selected_data-vs1034-toklim30.txt", 7000)


File created with success (133000 lines in new file, ~12 seconds): 
file_train_tokenizer-133000lines.txt

{'a': 7000, 'à': 7000, 'est': 7000, 'et': 7000, 'ces': 7000, 'ses': 7000, 'ce': 7000, 'se': 7000, 'ou': 7000, 'où': 7000, 'la': 7000, 'là': 7000, 'tout': 7000, 'tous': 7000, 'leur': 7000, 'leurs': 7000, 'ceux': 7000, 'cette': 7000, 'cet': 7000}


In [21]:
init_tokenizer("file_train_tokenizer-133000lines.txt", 1024, save=True)


Tokenizer created with success (~1 minutes):
tokenizer_1024_file_train_tokenizer-133000lines.txt



<SubwordTextEncoder vocab_size=1024>

In [22]:
tokenizer_1024b = tfds.features.text.SubwordTextEncoder.load_from_file(
                                                            "tokenizer_1024_file_train_tokenizer-133000lines.txt")

In [23]:
select_clean_data("Clean_Data","Selected_data-vs1034-toklim30.txt", tokenizer_1024b, 30)

lines treated until now: 1000000 (1 Million)
lines treated until now: 2000000 (2 Million)

File created with success (2701227 lines in new file, ~1 minutes): 
Selected_data-vs1024-toklim30.txt

Total number of lines treated: 2787248 (~2 M)
Total number of lines kept: 2701227 (~2 M)


In [24]:
def filter_ggle_forms(inp_dir, inp_file, ggle_forms):
    dirtydata = open(os.path.join(inp_dir, inp_file), "r", encoding="utf-8")
    output_name = inp_file
    cleandata = open(output_name, "w", encoding="utf-8")
    
    start = datetime.now()
    
    list_sentences = []
    
    forms = open(ggle_forms, "r", encoding="utf-8")
    counter_sentences = 0
    stop_forms = False
    while not stop_forms:
        line = forms.readline()
        if line:
            list_sentences.append(line.strip())
            counter_sentences += 1
        else:
            stop_forms = True
    
    forms.close()
    print("Number of sentences to search: {},(len list = {})".format(counter_sentences, len(list_sentences)))
    
    index = []
    lines_nb = 0
    write_nb = 0
    counter = 0
    stop = False
    while not stop:
        line = dirtydata.readline()
        lines_nb += 1
        if line.strip() in list_sentences:
            counter += 1
            print("sentence matched ({}/{}): {}".format(counter,counter_sentences,line.strip()))
            index.append(list_sentences.index(line.strip()))
        if line.strip() not in list_sentences:
            cleandata.write(line)
            write_nb += 1
        if lines_nb % 1000000 == 0:
            print("\nlines treated until now: {} ({} Million)".format(lines_nb, round(lines_nb//1000000)))
        if not line:
            stop = True
    
    dirtydata.close()
    cleandata.close()
    
    index_set = set(index)
    print("\nTotal sentences matched: {}".format(len(index_set)))
    
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("\nFile created with success ({} lines in new file, ~{} {}): \n{}\n".format(
                                                                                    write_nb, d, d_unit, output_name))
    print("Total number of lines treated: {} (~{} M)".format(lines_nb, lines_nb//1000000))
    print("Total number of lines kept: {} (~{} M)".format(write_nb, write_nb//1000000))

In [25]:
filter_ggle_forms("Clean_Data","Selected_data-vs1024-toklim30.txt","test_sentence_ggle_forms_correct.txt")

Number of sentences to search: 40,(len list = 40)
sentence matched (1/40): les studios de cinéma sont célèbres pour leur créativité comptable.
sentence matched (2/40): ce niveau délimite la borne basse d'un mini canal horizontal.
sentence matched (3/40): face à cette situation de surchauffe, une nette correction a été engagée.
sentence matched (4/40): mais quand t'es embarqué dans l'engrenage, tu ne penses même pas à ça.
sentence matched (5/40): cela fuse dans tous les sens, c’est du grand n’importe quoi.
sentence matched (6/40): que du bonheur pour ceux qui sauront dépasser leurs douleurs.
sentence matched (7/40): la pratique courante de l'anglais est un atout.
sentence matched (8/40): les studios de cinéma sont célèbres pour leur créativité comptable.
sentence matched (9/40): les inscriptions se feront pendant les heures d'entrainements.
sentence matched (10/40): la vente de carte journalière au tarif de 8 € se fera sur place.
sentence matched (11/40): puisqu'en mode comme en fleuris

In [26]:
def match(fname, kfname):
    
    start = datetime.now()
    
    list_sentences = []
    
    kf = open(kfname, "r", encoding="utf-8")
    counter_sentences = 0
    stop_kf = False
    while not stop_kf:
        line = kf.readline()
        if line:
            list_sentences.append(line.strip())
            counter_sentences += 1
        else:
            stop_kf = True
    
    kf.close()
    print("Number of sentences to search: {},(len list = {})".format(counter_sentences, len(list_sentences)))
    
    index = []
    counter = 0
    f = open(fname, "r", encoding="utf-8")
    stop = False
    while not stop:
        line = f.readline()
        if line.strip() in list_sentences:
            counter += 1
            print("sentence matched ({}/{}): {}".format(counter,counter_sentences,line.strip()))
            index.append(list_sentences.index(line.strip()))
        if not line:
            stop = True
    f.close()
    
    index_set = set(index)
    print("Total sentences matched: {}\n".format(len(index_set)))
    
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    
    for s in list_sentences:
        if list_sentences.index(s) not in index:
            print("NOT FOUND ! {}, {}".format(list_sentences.index(s),s))
        else:
            print("{}, {}".format(list_sentences.index(s),s))
    
    print("\nSearch done, (~{} {})".format(d, d_unit))

In [27]:
match("Selected_data-vs1024-toklim30.txt","test_sentence_ggle_forms_correct.txt")

Number of sentences to search: 40,(len list = 40)
Total sentences matched: 0

NOT FOUND ! 0, la vente de carte journalière au tarif de 8 € se fera sur place.
NOT FOUND ! 1, sauront-ils exploiter la technologie de cet engin furtif ?
NOT FOUND ! 2, c'est évidemment peu, ou déjà trop.
NOT FOUND ! 3, face à cette situation de surchauffe, une nette correction a été engagée.
NOT FOUND ! 4, puisqu'en mode comme en fleurissement, les tendances se font et se défont.
NOT FOUND ! 5, le thème de cette dixième édition était « fêtes et traditions ».
NOT FOUND ! 6, le journal a appris de nouveaux détails sur cette affaire.
NOT FOUND ! 7, au début, ce n’était pas évident, donc j’ai fait très attention.
NOT FOUND ! 8, si la réponse à ça, c'est oui, je l'entends.
NOT FOUND ! 9, cela fuse dans tous les sens, c’est du grand n’importe quoi.
NOT FOUND ! 10, tu es là, maintenant, et tu n'es plus en 1994 ou en 1996.
NOT FOUND ! 11, c'est probablement la mesure la plus proche du ressenti de ces derniers.
NOT F

### Replace Selected_data-vs1024-toklim30.txt in \Clean_Data with the newly created file of the same name located in the current working directory

In [28]:
location_dict = get_location(homophones, "Clean_Data", "Selected_data-vs1024-toklim30.txt", save=False, savename=False)


get_location function executed successfully (~11 seconds)



In [29]:
h_rank = sentence_number(location_dict, True, False)

homophone: a, number of sentences: 439953 (~440k)
homophone: à, number of sentences: 761414 (~761k)
homophone: est, number of sentences: 358067 (~358k)
homophone: et, number of sentences: 434448 (~434k)
homophone: ces, number of sentences: 44913 (~45k)
homophone: ses, number of sentences: 73573 (~74k)
homophone: ce, number of sentences: 171115 (~171k)
homophone: se, number of sentences: 199989 (~200k)
homophone: ou, number of sentences: 46861 (~47k)
homophone: où, number of sentences: 17374 (~17k)
homophone: la, number of sentences: 844904 (~845k)
homophone: là, number of sentences: 15606 (~16k)
homophone: tout, number of sentences: 86107 (~86k)
homophone: tous, number of sentences: 37370 (~37k)
homophone: leur, number of sentences: 61553 (~62k)
homophone: leurs, number of sentences: 24781 (~25k)
homophone: ceux, number of sentences: 7470 (~7k)
homophone: cette, number of sentences: 90617 (~91k)
homophone: cet, number of sentences: 16806 (~17k)


In [30]:
def get_split(homophones,h_rank,location_dict, trainsize = 20000, valsize=1/20, testsize=1/50):
    """
    Returns a randomly sampled (without replacement) list of line numbers for each homophone in each couple of homophones.
    Each line can appear only once during trainig, so once it has been used for a homophone it can not be used 
    for another one.
    output: Dictionary containig for each homophone in each couple, a list of line numbers.
    {h1:{h1_target:[...], h1_target_bis:[...]}, h2:{h2_target:[...]},...}
    If a homophone h1 appears in two different couples, we use the other homophone of the couple as a key
    in the dictionaries.
    The set of all used line numbers is also returned.
    """
    start = datetime.now()
    
    h_liste = [h_rank[i][1] for i in range(len(h_rank))]
    num_to_select = int(trainsize*(1 + valsize + testsize))
    out_dic = {h:{} for h in h_liste}
    
    used = set()
    for h in h_liste:
        #print("processing '{}'".format(h))
        for (a,b) in homophones:
            if h==a or h==b:
                target = b
                if h==b:
                    target = a
                #print("match - '({},{})' with target = '{}'".format(a,b,target))
                h_set = set(location_dict[h])
                h_set.difference_update(used)
                h_set_list = list(h_set)
                h_set_list_selected = list(np.random.choice(h_set_list, num_to_select, replace=False))
                set_to_be_used = set(h_set_list_selected)
                used.update(set_to_be_used)
                out_dic[h][target] = h_set_list_selected
                #print("length: {}".format(len(h_set_list_selected)))
                #print("Successfully created 'out_dic[{}][{}]'".format(h,target))
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("\nget_split function executed successfully (~{} {})\n".format(d, d_unit))
    
    return out_dic, used

In [31]:
def buil_train_val_test(homophones, location_dict, filename_data, trainsize = 20000, valsize=1/20, testsize=1/50):
    
    start = datetime.now()
    
    h_rank = sentence_number(location_dict, printmes=False, ret=True)
    # h_rank =[(num_s,key),...] , num_s = sentence number for homophone=key
    
    h_rank.sort()
    
    #sets_lists = pickle.load(open("NSelected_h_dicsets.pickle", "rb"))
    #sets_lists = get_set(homophones,filename_data, False)
    #sets_lists = {h1:[line numbers], h2:[line numbers], ....}
    
    sets_lists_selected, used_set = get_split(homophones,h_rank,location_dict, 
                                                 trainsize = trainsize, valsize=valsize, testsize=testsize)
    #sets_lists_selected = {h1:{h1_target:[], h1_target_bis:[]}, h2:{h2_target:[]},...}
    now = datetime.now()
    outfile_time= str(now.month)+"-"+str(now.day)+"-"+str(now.hour)+"-"+str(now.minute)
    final_sets_name = "final_sets_selected_{}.pickle".format(outfile_time)
    pickle.dump(sets_lists_selected, open(final_sets_name, "wb"))
    
    dataset = open(filename_data, 'r', encoding="utf-8")
    final_dataset_name = "final_dataset_{}_{}.txt".format(outfile_time, len(used_set))
    final_dataset = open(final_dataset_name, "w", encoding="utf-8")
    
    
    line_nb = 0
    data_h = {}
    stop_new_file = False
    while not stop_new_file:
        line = dataset.readline()
        line_nb += 1
        if line_nb in used_set:
            final_dataset.write(line)
            data_h[line_nb] = line
        if not line:
            stop_new_file = True
    final_dataset.close()
    dataset.close()
    
    print("Extra files successfully created:\n{}\n{}\n".format(final_sets_name,final_dataset_name))

    
    ytrain_name = "ytrain"
    yval_name = "yval"
    ytest_name = "ytest"
    
    xtrain_name = "xtrain"
    xval_name = "xval"
    xtest_name = "xtest"
    
    to_dir = "train_val_test_datasets-{}".format(outfile_time)
    os.makedirs(to_dir, exist_ok=True)
    dir_train = "train_datasets"
    os.makedirs(os.path.join(to_dir,dir_train), exist_ok=True)
    dir_val = "val_datasets"
    os.makedirs(os.path.join(to_dir,dir_val), exist_ok=True)
    dir_test = "test_datasets"
    os.makedirs(os.path.join(to_dir,dir_test), exist_ok=True)
    
    # for each homophone of each pair of homophones (a,b), we create a training, validation and test set.
    # for each homophone, the other homophone of the pair is used as a noise.
    
    # so a homophone h has 6 files (the pair is (h,h')): 
    # 2 files for trainig, one with h corectly written (ytrain), the other with h replaced by h' (xtrain) half of the time
    # 2 validation files, one with h corectly written (yval), the other with h replaced by h' (xval) half of the time
    # 2 files for testing, one with h corectly written (ytest), the other with h replaced by h' (xtest) half of the time
    # files prefixed with x will be the inputs (effectively modeling a noisy input)
    # files prefixed with y will be the targets (true labels)
    # then we do the same but we reverse the role of h and h'
    # we continue by doing the same for each pair of homophones
    
    
    for (a,b) in homophones:
        ytraina = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(ytrain_name,a,b,outfile_time)),
                       "w", encoding="utf-8")
        yvala = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(yval_name,a,b,outfile_time)),
                     "w", encoding="utf-8")
        ytesta = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}.txt".format(ytest_name,a,b,outfile_time)),
                      "w", encoding="utf-8")
        ytesta_incorrect = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}-incorrect.txt".format(ytest_name,a,b,outfile_time)),
                                "w", encoding="utf-8")
        xtraina = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(xtrain_name,a,b,outfile_time)),
                       "w", encoding="utf-8")
        xvala = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(xval_name,a,b,outfile_time)),
                     "w", encoding="utf-8")
        xtesta = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}.txt".format(xtest_name,a,b,outfile_time)),
                      "w", encoding="utf-8")
        xtesta_incorrect = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}-incorrect.txt".format(xtest_name,a,b,outfile_time)),
                                "w", encoding="utf-8")
        
        counter_a = 0
        for num in sets_lists_selected[a][b]:
            counter_a += 1
            ysample = " "+ data_h[num].strip() +" "
            aspace = " " + a + " "
            bspace = " " + b + " "
            xsample = ysample.replace(aspace,bspace,1)
            ysample = ysample.strip()+"\n"
            xsample = xsample.strip()+"\n"
            
            if counter_a <= int(trainsize):
                ytraina.write(ysample)
                if counter_a % 2 == 0:
                    xtraina.write(ysample)
                else:
                    xtraina.write(xsample)
            if counter_a > int(trainsize) and counter_a <= int(trainsize*(1+valsize)):
                yvala.write(ysample)
                if counter_a % 2 == 0:
                    xvala.write(ysample)
                else:
                    xvala.write(xsample)
            if counter_a > int(trainsize*(1+valsize)):
                if counter_a % 2 == 0:
                    xtesta.write(ysample)
                    ytesta.write(ysample)
                else:
                    xtesta_incorrect.write(xsample)
                    ytesta_incorrect.write(ysample)
            
        ytraina.close()
        yvala.close()
        ytesta.close()
        ytesta_incorrect.close()
        xtraina.close()
        xvala.close()
        xtesta.close()
        xtesta_incorrect.close()
            
        ytrainb = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(ytrain_name,b,a,outfile_time)),
                       "w", encoding="utf-8")
        yvalb = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(yval_name,b,a,outfile_time)),
                     "w", encoding="utf-8")
        ytestb = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}.txt".format(ytest_name,b,a,outfile_time)),
                      "w", encoding="utf-8")
        ytestb_incorrect = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}-incorrect.txt".format(ytest_name,b,a,outfile_time)),
                                "w", encoding="utf-8")
        xtrainb = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(xtrain_name,b,a,outfile_time)),
                       "w", encoding="utf-8")
        xvalb = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(xval_name,b,a,outfile_time)),
                     "w", encoding="utf-8")
        xtestb = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}.txt".format(xtest_name,b,a,outfile_time)),
                      "w", encoding="utf-8")
        xtestb_incorrect = open(os.path.join(to_dir,dir_test,"{}-{}-{}-{}-incorrect.txt".format(xtest_name,b,a,outfile_time)),
                                "w", encoding="utf-8")
        
        counter_b = 0
        for num in sets_lists_selected[b][a]:
            counter_b += 1
            ysample = " "+ data_h[num].strip() +" "
            bspace = " " + b + " "
            aspace = " " + a + " "
            xsample = ysample.replace(bspace,aspace,1)
            ysample = ysample.strip()+"\n"
            xsample = xsample.strip()+"\n"
            
            if counter_b <= int(trainsize):
                ytrainb.write(ysample)
                if counter_b % 2 == 0:
                    xtrainb.write(ysample)
                else:
                    xtrainb.write(xsample)
            if counter_b > int(trainsize) and counter_b <= int(trainsize*(1+valsize)):
                yvalb.write(ysample)
                if counter_b % 2 == 0:
                    xvalb.write(ysample)
                else:
                    xvalb.write(xsample)
            if counter_b > int(trainsize*(1+valsize)):
                if counter_b % 2 == 0:
                    xtestb.write(ysample)
                    ytestb.write(ysample)
                else:
                    xtestb_incorrect.write(xsample)
                    ytestb_incorrect.write(ysample)
            
        ytrainb.close()
        yvalb.close()
        ytestb.close()
        ytestb_incorrect.close()
        xtrainb.close()
        xvalb.close()
        xtestb.close()
        xtestb_incorrect.close()
        
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("buil_train_val_test function executed successfully (~{} {})\n".format(d, d_unit))
    print("Files are located in the following directory: {}".format(to_dir))
    print("{}\n{}\n{}".format(dir_train,dir_val,dir_test))
    print("Time index for the current split: {}\n".format(outfile_time))

In [32]:
def visualize_data_after_tokenization(tokenizer,data_filename,visualize=True,saveplot=False,ret=False,maxlen=60):
    dataset = tf.data.TextLineDataset(data_filename)
    vocab_size = tokenizer.vocab_size
    total_tokens=0
    nb_lines = 0
    maximum = 0
    minimum = 60
    dic_length = {i:0 for i in range(1,maxlen+1)}
    for line in dataset:
        nb_lines += 1
        len_line = len(tokenizer.encode(line.numpy().decode("utf-8")))
        #if len_line >= 30 :
        #    print(nb_lines)
        #    print(line.numpy().decode("utf-8"))
        dic_length[len_line] += 1
        total_tokens += len_line
        if len_line > maximum:
            maximum = len_line
        #if nb_lines == 0:
        #    minimum = len_line
        if len_line < minimum:
            minimum = len_line
        
    
    mu = round(total_tokens/nb_lines,2)
    #print("{} : {}".format(["s","nb_lines","s/nb_lines","min","max"],[s,nb_lines, s/nb_lines, minimum, maximum]))
    
    if visualize:
        plt.bar(list(dic_length.keys()),list(dic_length.values()))
        plt.title("Tokenized sentence length (vocab_size={})".format(vocab_size))
        plt.xlabel("Sentence length after Tokenization")
        plt.ylabel("Number of sentenences")
        plt.text(0.75*maxlen,0.90*max(dic_length.values()),"mean = {}".format(mu))
        plt.text(1.1*maxlen,0.90*max(dic_length.values()),"Summary statistics:")
        plt.text(1.1*maxlen,0.83*max(dic_length.values()),"Mean token number per sentence: {}".format(mu))
        plt.text(1.1*maxlen,0.76*max(dic_length.values()),"Minimum sentence length: {}".format(minimum))
        plt.text(1.1*maxlen,0.69*max(dic_length.values()),"Maximum sentence length: {}".format(maximum))
        plt.text(1.1*maxlen,0.62*max(dic_length.values()),"Total number of tokens: {}".format(total_tokens))
        plt.text(1.1*maxlen,0.55*max(dic_length.values()),"Total number of sentences: {}".format(nb_lines))
        #plt.xticks(range(0,maxlen+1,int(round((maxlen+1)/12))))
        plt.show()
    
    if saveplot:
        now = datetime.now()
        plotdate = str(now.month)+"-"+str(now.day)+"-"+str(now.hour)+"-"+str(now.minute)
        plotname = "after_tokenization-{}-{}-{}.png".format(vocab_size,data_filename,plotdate)
        plt.savefig(plotname, dpi=600, bbox_inches = 'tight')
        print("Plot successfully saved, \n{}".format(plotname))
    
    print("\nSummary statistics:")
    print("Mean token number per sentence: {}".format(mu))
    print("Minimum sentence length: {} || Maximum sentence length: {}".format(minimum,maximum))
    print("Total number of tokens: {}".format(total_tokens))
    print("Total number of sentences: {}".format(nb_lines))
    
    if ret:
        return (dic_length,mu,minimum,maximum,total_tokens,nb_lines)

In [33]:
filename_data = os.path.join("Clean_Data","Selected_data-vs1024-toklim30.txt")

In [34]:
buil_train_val_test(homophones, location_dict, filename_data, trainsize = 5000, valsize=1/5, testsize=1/5)


get_split function executed successfully (~0 seconds)

Extra files successfully created:
final_sets_selected_2-8-22-50.pickle
final_dataset_2-8-22-50_140000.txt

buil_train_val_test function executed successfully (~3 seconds)

Files are located in the following directory: train_val_test_datasets-2-8-22-50
train_datasets
val_datasets
test_datasets
Time index for the current split: 2-8-22-50



In [35]:
def final_datasets_creation(homophones, to_dir, dir_train, dir_val, outfile_time, train_size, val_size, test_size=None):
    
    start = datetime.now()
    
    ytrain_name = "ytrain"
    yval_name = "yval"
    #ytest_name = "ytest"
    
    xtrain_name = "xtrain"
    xval_name = "xval"
    #xtest_name = "xtest"
    
    
    #os.path.join(to_dir, "ytest-{}.txt".format(outfile_time))
    xtrain = open("xtrain-{}.txt".format(outfile_time), "w", encoding="utf-8")
    ytrain = open("ytrain-{}.txt".format(outfile_time), "w", encoding="utf-8")
    xval = open("xval-{}.txt".format(outfile_time), "w", encoding="utf-8")
    yval = open("yval-{}.txt".format(outfile_time), "w", encoding="utf-8")
    #xtest = open(os.path.join(to_dir, "xtest-{}.txt".format(outfile_time)), "w", encoding="utf-8")
    #ytest = open(os.path.join(to_dir, "ytest-{}.txt".format(outfile_time)), "w", encoding="utf-8")
    
    for (a,b) in homophones:
        xtraina = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(xtrain_name,a,b,outfile_time)),
                       "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(xtrain_name,a,b,outfile_time))
        ytraina = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(ytrain_name,a,b,outfile_time)),
                       "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(ytrain_name,a,b,outfile_time))
        xvala = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(xval_name,a,b,outfile_time)),
                     "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(xval_name,a,b,outfile_time))
        yvala = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(yval_name,a,b,outfile_time)),
                     "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(yval_name,a,b,outfile_time))
        #xtesta = open("{}-{}-{}-{}.txt".format(xtest_name,a,b,outfile_time), "r", encoding="utf-8")
        #print("open file: {}-{}-{}-{}.txt".format(xtest_name,a,b,outfile_time))
        #ytesta = open("{}-{}-{}-{}.txt".format(ytest_name,a,b,outfile_time), "r", encoding="utf-8")
        #print("open file: {}-{}-{}-{}.txt".format(ytest_name,a,b,outfile_time))
        
        xtrainb = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(xtrain_name,b,a,outfile_time)),
                       "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(xtrain_name,b,a,outfile_time))
        ytrainb = open(os.path.join(to_dir,dir_train,"{}-{}-{}-{}.txt".format(ytrain_name,b,a,outfile_time)),
                       "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(ytrain_name,b,a,outfile_time))
        xvalb = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(xval_name,b,a,outfile_time)),
                     "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(xval_name,b,a,outfile_time))
        yvalb = open(os.path.join(to_dir,dir_val,"{}-{}-{}-{}.txt".format(yval_name,b,a,outfile_time)),
                     "r", encoding="utf-8")
        print("open file: {}-{}-{}-{}.txt".format(yval_name,b,a,outfile_time))
        #xtestb = open("{}-{}-{}-{}.txt".format(xtest_name,b,a,outfile_time), "r", encoding="utf-8")
        #print("open file: {}-{}-{}-{}.txt".format(xtest_name,b,a,outfile_time))
        #ytestb = open("{}-{}-{}-{}.txt".format(ytest_name,b,a,outfile_time), "r", encoding="utf-8")
        #print("open file: {}-{}-{}-{}.txt".format(ytest_name,b,a,outfile_time))
        
        counter_train = 0
        while counter_train < train_size:
            x_a = xtraina.readline()
            x_b = xtrainb.readline()
            y_a = ytraina.readline()
            y_b = ytrainb.readline()
            
            xtrain.write(x_a)
            xtrain.write(x_b)
            ytrain.write(y_a)
            ytrain.write(y_b)
            
            counter_train += 1 
        
        xtraina.close()
        print("file closed: {}-{}-{}-{}.txt".format(xtrain_name,a,b,outfile_time))
        xtrainb.close()
        print("file closed: {}-{}-{}-{}.txt".format(xtrain_name,b,a,outfile_time))
        ytraina.close()
        print("file closed: {}-{}-{}-{}.txt".format(ytrain_name,a,b,outfile_time))
        ytrainb.close()
        print("file closed: {}-{}-{}-{}.txt".format(ytrain_name,b,a,outfile_time))
        
        counter_val = 0
        while counter_val < val_size:
            x_a = xvala.readline()
            x_b = xvalb.readline()
            y_a = yvala.readline()
            y_b = yvalb.readline()
            
            xval.write(x_a)
            xval.write(x_b)
            yval.write(y_a)
            yval.write(y_b)
            
            counter_val += 1
         
        xvala.close()
        print("file closed: {}-{}-{}-{}.txt".format(xval_name,a,b,outfile_time))
        xvalb.close()
        print("file closed: {}-{}-{}-{}.txt".format(xval_name,b,a,outfile_time))
        yvala.close()
        print("file closed: {}-{}-{}-{}.txt".format(yval_name,a,b,outfile_time))
        yvalb.close()
        print("file closed: {}-{}-{}-{}.txt".format(yval_name,b,a,outfile_time))
        
        #counter_test = 0
        #while counter_test < test_size:
        #    x_a = xtesta.readline()
        #    x_b = xtestb.readline()
        #    y_a = ytesta.readline()
        #    y_b = ytestb.readline()
            
        #    xtest.write(x_a)
        #    xtest.write(x_b)
        #    ytest.write(y_a)
        #    ytest.write(y_b)
            
        #    counter_test += 1
        
        #xtesta.close()
        #print("file closed: {}-{}-{}-{}.txt".format(xtest_name,a,b,outfile_time))
        #xtestb.close()
        #print("file closed: {}-{}-{}-{}.txt".format(xtest_name,b,a,outfile_time))
        #ytesta.close()
        #print("file closed: {}-{}-{}-{}.txt".format(ytest_name,a,b,outfile_time))
        #ytestb.close()
        #print("file closed: {}-{}-{}-{}.txt".format(ytest_name,b,a,outfile_time))
        
    
    print("")
    xtrain.close()
    print("xtrain closed")
    ytrain.close()
    print("ytrain closed")
    xval.close()
    print("xval closed")
    yval.close()
    print("yval closed")
    #xtest.close()
    #print("xtest closed")
    #ytest.close()
    #print("ytest closed")
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("\nTraining set and Validation set successfully created (~{} {})".format(d, d_unit))

In [36]:
os.makedirs(os.path.join("Model_training","checkpoints","train"), exist_ok=True)

In [37]:
outfile_time = "2-8-22-50"
to_dir = "train_val_test_datasets-{}".format(outfile_time)
dir_train = "train_datasets"
dir_val = "val_datasets"

In [38]:
final_datasets_creation(homophones,to_dir,dir_train,dir_val,outfile_time, 5000, 1000)

open file: xtrain-a-à-2-8-22-50.txt
open file: ytrain-a-à-2-8-22-50.txt
open file: xval-a-à-2-8-22-50.txt
open file: yval-a-à-2-8-22-50.txt
open file: xtrain-à-a-2-8-22-50.txt
open file: ytrain-à-a-2-8-22-50.txt
open file: xval-à-a-2-8-22-50.txt
open file: yval-à-a-2-8-22-50.txt
file closed: xtrain-a-à-2-8-22-50.txt
file closed: xtrain-à-a-2-8-22-50.txt
file closed: ytrain-a-à-2-8-22-50.txt
file closed: ytrain-à-a-2-8-22-50.txt
file closed: xval-a-à-2-8-22-50.txt
file closed: xval-à-a-2-8-22-50.txt
file closed: yval-a-à-2-8-22-50.txt
file closed: yval-à-a-2-8-22-50.txt
open file: xtrain-est-et-2-8-22-50.txt
open file: ytrain-est-et-2-8-22-50.txt
open file: xval-est-et-2-8-22-50.txt
open file: yval-est-et-2-8-22-50.txt
open file: xtrain-et-est-2-8-22-50.txt
open file: ytrain-et-est-2-8-22-50.txt
open file: xval-et-est-2-8-22-50.txt
open file: yval-et-est-2-8-22-50.txt
file closed: xtrain-est-et-2-8-22-50.txt
file closed: xtrain-et-est-2-8-22-50.txt
file closed: ytrain-est-et-2-8-22-50.t

### Move the newly created files (xtrain, ytrain, xval, yval) in .\Model_training
### Move also the tokenizer 'tokenizer_1024_file_train_tokenizer-133000lines.txt' in .\Model_training

# === END ===

In [35]:
def combine_two_files(dr1,name1, dr2, name2, to_dr, nname):
    inp1 = open(os.path.join(dr1,name1), "r", encoding="utf-8")
    inp2 = open(os.path.join(dr2,name2), "r", encoding="utf-8")
    out = open(os.path.join(to_dr,nname), "w", encoding="utf-8")
    
    start = datetime.now()
    
    stop1 = False
    counter1 = 0
    print("start processing {}".format(name1))
    while not stop1:
        line = inp1.readline()
        if line:
            out.write(line)
            counter1 += 1
        if not line:
            stop1 = True
            print("finished processing {} ({} lines added)".format(name1, counter1))
    
    stop2 = False
    counter2 = 0
    print("start processing {}".format(name2))
    while not stop2:
        line = inp2.readline()
        out.write(line)
        counter2 += 1
        if not line:
            stop2 = True
            print("finished processing {} ({} lines added)".format(name2, counter2))
    
    inp1.close()
    inp2.close()
    out.close()
    
    end = datetime.now()
    delta = end - start
    delta_minutes = delta.seconds // 60
    d = delta_minutes
    d_unit = "minutes"
    if delta_minutes == 0:
        d = delta.seconds
        d_unit = "seconds"
    print("File created with success: ({} lines in new file, ~{} {})\n{}\n".format(counter1+counter2, d, d_unit, nname))