In [1]:
import pyspark
from nltk.corpus import stopwords
from collections import defaultdict
from helpers import correct_token, generate_n_gram, to_ngram, align_sequences, glue_sequence, display_match
from nltk import word_tokenize
SC = pyspark.SparkContext()

In [2]:
### Model parameters
ENG_STOPWORDS = set(stopwords.words("english"))
N_GRAM = 3
GAP_TOLERANCE = 5 
PADDING = 20


def treat_article(article_path:str, context, stopwords, n):
    with open(article_path, mode = "r", encoding = "utf-8") as f:
        data = ''.join(f.readlines())
    full_article = ''.join([c for c in data if c.isalnum() or c == " "])
    tokenized_article = list(enumerate(word_tokenize(full_article)))
    filtered_article = [(index,token.lower()) for index,token in tokenized_article if token not in stopwords ]
    filtered_indexes = [index for index,_ in filtered_article]
    corrected_article = context.parallelize(filtered_article).map(lambda x: x[1]).map(correct_token).collect()
    corrected_article = list(zip(filtered_indexes, corrected_article))
    n_grams = list(generate_n_gram(corrected_article, n))
    n_gram_dict =defaultdict(list)
    for n_gram in n_grams:
        n_gram_dict[to_ngram(n_gram)].append(n_gram[0][0])
    return tokenized_article, n_gram_dict

def compute_plagiarism(art_1_path: str, art_2_path: str):

    # First treat the articles 
    treated_1, grams_1 = treat_article(art_1_path, SC, ENG_STOPWORDS, N_GRAM)
    treated_2, grams_2 = treat_article(art_2_path, SC, ENG_STOPWORDS, N_GRAM)

    # Align sequence and glue the sequences 
    matching_sequence = align_sequences(grams_1,grams_2)
    glued_sequence = glue_sequence(matching_sequence, GAP_TOLERANCE)

    # Create the viewer function 
    match_viewer = lambda i: display_match(glued_sequence[i], treated_1,treated_2, PADDING)

    # compute the plagiarism score from both articles
    score = 2*len(matching_sequence)/(len(treated_1) + len(treated_2)) * 100

    print("The two articles have a similarity score of {:.2f}, with {} matching n-gram. You can use the viewer to visualize the matching sequences".format(score, len(matching_sequence)))
    return match_viewer, score


In [7]:
### File paths 
fr_path = "./txt files/french.txt"
en_path = "./txt files/english.txt"
it_path = "./txt files/italian.txt"
es_path = "./txt files/spanish.txt"

viewer , score = compute_plagiarism(fr_path, en_path)


The two articles have a similarity score of 1.31, with 81 matching n-gram. You can use the viewer to visualize the matching sequences


In [15]:
viewer(36)


the Cross gold at eight spikes having of one side one is the only leader first president born but he dove [31mdove enamelled in[0m in white on the other the image not there seldom afflicts from there come that there is Saint Michael Orals make
and the cross eight pointed gold on one side with a war council so much so that the first king dove [31mdove enamelled in[0m in white and on the other the im sta in charge of all the active part and performance of S Michele
