In [87]:
from lib2to3.pgen2.tokenize import tokenize
import nltk 
from collections import defaultdict
from nltk.corpus import stopwords
import enchant 
from typing import List
from tqdm.notebook import tqdm
import pyspark
from termcolor import colored


eng_stopwords = set(stopwords.words("english"))
eng_dict = enchant.Dict("en")

def to_ngram(index_tup):
    return tuple([tup[1] for tup in index_tup])

def correct_token(token: str) -> str : 
    if eng_dict.check(token):
        return token
    else:
        suggestions = eng_dict.suggest(token)
        if len(suggestions) > 0:
            return suggestions[0]
        return token

def generate_n_gram(text: List[str], n: int) -> List[List[str]]:
    return zip(*[text[i:] for i in range(n)])

def treat_article(article_path:str, context, stopwords):
    with open(article_path, mode = "r", encoding = "utf-8") as f:
        data = ''.join(f.readlines())
    full_article = ''.join([c for c in data if c.isalnum() or c == " "])
    tokenized_article = list(enumerate(nltk.word_tokenize(full_article)))
    filtered_article = [(index,token.lower()) for index,token in tokenized_article if token not in stopwords ]
    filtered_indexes = [index for index,_ in filtered_article]
    corrected_article = context.parallelize(filtered_article).map(lambda x: x[1]).map(correct_token).collect()
    corrected_article = list(zip(filtered_indexes, corrected_article))
    return tokenized_article, list(generate_n_gram(corrected_article, 4))

def treat_article2(article_path:str, context, stopwords):
    with open(article_path, mode = "r", encoding = "utf-8") as f:
        data = ''.join(f.readlines())
    full_article = ''.join([c for c in data if c.isalnum() or c == " "])
    tokenized_article = list(enumerate(nltk.word_tokenize(full_article)))
    filtered_article = [(index,token.lower()) for index,token in tokenized_article if token not in stopwords ]
    filtered_indexes = [index for index,_ in filtered_article]
    corrected_article = context.parallelize(filtered_article).map(lambda x: x[1]).map(correct_token).collect()
    corrected_article = list(zip(filtered_indexes, corrected_article))
    n_grams = list(generate_n_gram(corrected_article, 3))
    n_gram_dict =defaultdict(list)
    for n_gram in n_grams:
        n_gram_dict[to_ngram(n_gram)].append(n_gram[0][0])
    return tokenized_article, n_gram_dict

In [None]:
sc = pyspark.SparkContext()

In [88]:
fr_path = "./txt files/french.txt"
en_path = "./txt files/english.txt"
it_path = "./txt files/italian.txt"
es_path = "./txt files/spanish.txt"






treated_fr, fr_grams = treat_article2(fr_path, sc, eng_stopwords)
treated_it, it_grams = treat_article2(it_path,sc,eng_stopwords)



In [None]:
fr_grams

In [66]:

def align_sequences(art_grams_1,art_grams_2):
    matching_grams = []
    for gram_1 in art_grams_1:
        for gram_2 in art_grams_2:
            if gram_1 == gram_2:
                art_1_matching_id = art_grams_1[gram_1].pop(0)
                art_2_matching_id = art_grams_2[gram_1].pop(0)
                matching_grams.append((gram_1,art_1_matching_id,art_2_matching_id))
    return matching_grams

def glue_sequence(sequence, gap_tolerance):
    final = []
    temp = [sequence[0][1:]]
    last_seen = sequence[0][1:]
    for _, ind_1, ind_2 in sequence[1:]:
        # print(n_gram)
        if 0<= ind_1 - last_seen[0] < gap_tolerance and 0<= ind_2 - last_seen[1] < gap_tolerance :
            temp.append((ind_1,ind_2))
        else:
            final.append((temp[0], temp[-1]))
            temp.clear()
            temp.append((ind_1,ind_2))
        last_seen = (ind_1,ind_2)
    final.append((temp[0], temp[-1]))
    return final

def retrieve_text(tokenized_article, start_index, end_index):
    return " ".join([x[1] for x in tokenized_article[start_index:end_index+1]])

In [None]:
matching_sequence = align_sequences(fr_grams,it_grams)
glued_sequence = glue_sequence(matching_sequence, 5)
glued_sequence

In [97]:



def display_match(match_indexes,treated_1, treated_2, padding):
    # print(sequence)
    start_1,end_1 = match_indexes[0][0] , match_indexes[1][0] +2
    start_2,end_2 = match_indexes[0][1] , match_indexes[1][1] +2
    text_1 = [retrieve_text(treated_1,start_1-padding, start_1), colored(retrieve_text(treated_1,start_1,end_1),"red"),retrieve_text(treated_1, end_1, end_1 + padding)]
    text_2 = [retrieve_text(treated_2,start_2-padding, start_2), colored(retrieve_text(treated_2,start_2,end_2),"red"),retrieve_text(treated_2, end_2, end_2 + padding)]
    print(*text_1)
    print(*text_2)


display_match(glued_sequence[23], treated_fr,treated_it, 20)

arms they paffent the Rhine towards the year 420 crazy the one of silver to 50 lb are found unhappy Phiaramond [31mPhiaramond their leader the first[0m first king of this ferment distributed as so were the riches of monarchy The Netherlands Picardy were their Rome at the
species pro weapons they they passed the Rhine towards the an 420 putting reward to those who had ten under Faramondo [31mFaramondo their leader and the[0m the first king sons it would be state very much better go back at of this monarchy The Countries bass and


In [None]:
matching_sequence

In [62]:
indexes = final[0]
retrieve_text(treated_fr,indexes[0][0], indexes[1][0])
retrieve_text(treated_it,indexes[0][1], indexes[1][1])



'Switzerland'

In [53]:


def retrieve_text(tokenized_article, start_index, end_index):
    return " ".join([x[1] for x in tokenized_article[start_index:end_index+1]])
def to_ngram(index_tup):
    return tuple([tup[1] for tup in index_tup])
def isMatching(tup_1,tup_2):
    return to_ngram(tup_1) == to_ngram(tup_2)

def glue_sequence(index_sequence, gap_tolerance):
    final = []
    temp = [index_sequence[0]]
    last_seen = index_sequence[0]
    for index in index_sequence[1:]:
        if index - last_seen < gap_tolerance:
            temp.append(index)
        else:
            final.append((temp[0], temp[-1]))
            temp.clear()
            temp.append(index)
        last_seen = index
    final.append((temp[0], temp[-1]))
    return final
            

def find_matching_ngrams(art_1, art_2, gap_tolerance):
    # Probably asymmetric , let's first do article 1 
    art_1_matching_indexes = []
    art_2_matching_indexes = []
    all_matching_tuples = []

    for index_1,tup_1 in enumerate(art_1):
        for index_2, tup_2 in enumerate(art_2):
            if isMatching(tup_1,tup_2):
                art_1_matching_indexes.append(index_1)
                art_2_matching_indexes.append(index_2)
                all_matching_tuples.append(tup_1)

    glued_art_1_indexes = glue_sequence(sorted(art_1_matching_indexes), gap_tolerance)
    glued_art_2_indexes = glue_sequence(sorted(art_2_matching_indexes), gap_tolerance)
    

    return glued_art_1_indexes,glued_art_2_indexes, all_matching_tuples

def from_art_id_to_treated_id(indexes,article):
    start_tuple = article[indexes[0]]
    start_id = start_tuple[0][0]
    end_tuple = article[indexes[1]]
    end_id = end_tuple[-1][0]
    return start_id,end_id

def align_sequences(art_1,art_2, glued_1, glued_2):
    matching_sequences = []
    for id_1, seq_1 in enumerate(glued_1):
        for id_2, seq_2 in enumerate(glued_2):
            seq_1_gram = to_ngram(art_1[seq_1[0]])
            seq_2_gram = to_ngram(art_2[seq_2[0]])
            if seq_1_gram == seq_2_gram:
                matching_sequences.append((seq_1, seq_2))
    return matching_sequences

sequences = align_sequences(fr_article,it_article,glued_fr,glued_it)

indexes = sequences[0][1]
fr_article

start_index, end_index = from_art_id_to_treated_id(indexes,it_article)

retrieve_text(treated_it,start_index,end_index )

def display_sequence(sequence,treated_1, treated_2, art_1, art_2, padding):
    # print(sequence)
    start_1,end_1 = from_art_id_to_treated_id(sequence[0], art_1)
    start_2,end_2 = from_art_id_to_treated_id(sequence[1], art_2)
    text_1 = [retrieve_text(treated_1,start_1-padding, start_1), colored(retrieve_text(treated_1,start_1,end_1),"red"),retrieve_text(treated_1, end_1, end_1 + padding)]
    text_2 = [retrieve_text(treated_2,start_2-padding, start_2), colored(retrieve_text(treated_2,start_2,end_2),"red"),retrieve_text(treated_2, end_2, end_2 + padding)]
    print(*text_1)
    print(*text_2)

display_sequence(sequences[1], treated_fr, treated_it, fr_article, it_article, 15)

# [retrieve_text(it_article, *x[1]) for x in sequences]
# id = 4
# glued_fr[id][0], fr_article[glued_fr[id][0]] , matching_tuples[id]


are more manderies of the order of Malta two hundred faith of a fiècle in 1666 [31m1666 of this inveterate evil believed[0m believed xante thousand clerics secular Where regular encourage the propagation of the species in proLe county
permit the practice of religion Reformed It s the parts go up at the top point [31mpoint similar to oldest of the[0m the kingdoms of Europe Its dark rivers which in their run lose their date dates back


In [4]:
text1 = colored("Salut la mif", 'red')
text2 = colored("Comment ça va bien", "blue")
text3 = colored("oklm on est la frr")
print(text1, text2,text3)

[31mSalut la mif[0m [34mComment ça va bien[0m oklm on est la frr[0m


In [None]:
fr_article
it_article