In [1]:
import pyspark
from nltk.corpus import stopwords
from collections import defaultdict
from helpers import correct_token, generate_n_gram, to_ngram, align_sequences, glue_sequence, display_match
from nltk import word_tokenize
from copy import deepcopy
import pandas as pd 
import plotly.graph_objects as go

SC = pyspark.SparkContext()

In [2]:
### Model parameters
ENG_STOPWORDS = set(stopwords.words("english"))
N_GRAM = 3
GAP_TOLERANCE = 5 
PADDING = 20


def treat_article(article_path:str, context, stopwords, n):
    with open(article_path, mode = "r", encoding = "utf-8") as f:
        data = ''.join(f.readlines())
    full_article = ''.join([c for c in data if c.isalnum() or c == " "])
    tokenized_article = list(enumerate(word_tokenize(full_article)))
    filtered_article = [(index,token.lower()) for index,token in tokenized_article if token not in stopwords ]
    filtered_indexes = [index for index,_ in filtered_article]
    corrected_article = context.parallelize(filtered_article).map(lambda x: x[1]).map(correct_token).collect()
    corrected_article = list(zip(filtered_indexes, corrected_article))
    n_grams = list(generate_n_gram(corrected_article, n))
    n_gram_dict =defaultdict(list)
    for n_gram in n_grams:
        n_gram_dict[to_ngram(n_gram)].append(n_gram[0][0])
    return tokenized_article, n_gram_dict

def compute_plagiarism(art_1_path: str, art_2_path: str):

    # First treat the articles 
    treated_1, grams_1 = treat_article(art_1_path, SC, ENG_STOPWORDS, N_GRAM)
    treated_2, grams_2 = treat_article(art_2_path, SC, ENG_STOPWORDS, N_GRAM)

    # Align sequence and glue the sequences 
    matching_sequence = align_sequences(grams_1,grams_2)
    glued_sequence = glue_sequence(matching_sequence, GAP_TOLERANCE)

    # Create the viewer function 
    match_viewer = lambda i: display_match(glued_sequence[i], treated_1,treated_2, PADDING)

    # compute the plagiarism score from both articles
    score = 2*len(matching_sequence)/(len(treated_1) + len(treated_2)) * 100

    print("The two articles have a similarity score of {:.2f}, with {} matching n-gram. You can use the viewer to visualize the matching sequences".format(score, len(matching_sequence)))
    return match_viewer, score

def compute_plagiarism_from_articles(art_1, art_2):

    # First treat the articles 
    treated_1, grams_1 = deepcopy(art_1)
    treated_2, grams_2 = deepcopy(art_2)

    # Align sequence and glue the sequences 
    matching_sequence = align_sequences(grams_1,grams_2)
    glued_sequence = glue_sequence(matching_sequence, GAP_TOLERANCE)

    # Create the viewer function 
    match_viewer = lambda i: display_match(glued_sequence[i], treated_1,treated_2, PADDING)

    # compute the plagiarism score from both articles
    score = 2*len(matching_sequence)/(len(grams_1) + len(grams_2)) * 100

    print("The two articles have a similarity score of {:.2f}, with {} matching n-gram. You can use the viewer to visualize the matching sequences".format(score, len(matching_sequence)))
    return match_viewer, score


In [3]:
### File paths 
fr_path = "./txt files/french.txt"
en_path = "./txt files/english.txt"
it_path = "./txt files/italian.txt"
es_path = "./txt files/spanish.txt"

fr_article = treat_article(fr_path,SC,ENG_STOPWORDS,N_GRAM)
en_article = treat_article(en_path,SC,ENG_STOPWORDS,N_GRAM)
it_article = treat_article(it_path,SC,ENG_STOPWORDS,N_GRAM)
es_article = treat_article(es_path,SC,ENG_STOPWORDS,N_GRAM)

languages = ["french","english","italian","spanish"]
articles = [fr_article,en_article,it_article,es_article]


In [4]:
results = pd.DataFrame(columns = languages,index = languages)
for i in range(4):
    for j in range(i+1,4):
        _, sim = compute_plagiarism_from_articles(articles[i], articles[j])
        results[languages[i]][languages[j]] = sim
results

The two articles have a similarity score of 0.02, with 3 matching n-gram. You can use the viewer to visualize the matching sequences
The two articles have a similarity score of 2.35, with 81 matching n-gram. You can use the viewer to visualize the matching sequences
The two articles have a similarity score of 3.98, with 124 matching n-gram. You can use the viewer to visualize the matching sequences
The two articles have a similarity score of 0.01, with 2 matching n-gram. You can use the viewer to visualize the matching sequences
The two articles have a similarity score of 0.02, with 3 matching n-gram. You can use the viewer to visualize the matching sequences
The two articles have a similarity score of 2.97, with 100 matching n-gram. You can use the viewer to visualize the matching sequences


Unnamed: 0,french,english,italian,spanish
french,,,,
english,0.015613,,,
italian,2.35021,0.010275,,
spanish,3.982656,0.015681,2.973536,


In [5]:
results.min().min()
results.max().max()

3.98265617472298

In [63]:
france_lat_lon = (46.00,2.00)
spain_lat_lon = (40.46,-3.75)
italy_lat_lon = (42.23,13.57)
england_lat_lon = (52.35,-1.17)
import plotly.express as px
import plotly
from scipy.interpolate import interp1d
def to_rgb(hex:str):
    x = hex[1:]
    return tuple(int(x[i:i+2], 16) for i in (0, 2, 4))
interpolate = interp1d([results.min().min(), results.max().max()], [0,1])
print(interpolate(2))
colorscales = px.colors.sequential.Viridis
fig = go.Figure()
test = list(zip(languages,[france_lat_lon,england_lat_lon,italy_lat_lon,spain_lat_lon]))
for i in range(4):
    for j in range(i+1,4):
        first_name, first_lat_lon = test[i]
        second_name, second_lat_lon = test[j]
        print(first_name, second_name,results[first_name][second_name] )
        # color = plotly.colors.find_intermediate_color(to_rgb(colorscales[3]),to_rgb(colorscales[0]),interpolate(results[first_name][second_name]))
        color = plotly.colors.find_intermediate_color((0,0,0),(255,255,255),interpolate(results[first_name][second_name]))
        fig.add_trace(
            go.Scattergeo(
                mode = "markers+lines",
                lon = [first_lat_lon[1],second_lat_lon[1]],
                lat = [first_lat_lon[0],second_lat_lon[0]],
                line_color = 'rgb'+str(color),
                name = first_name + " - "+second_name +" sim : "  + "{:.2f}".format(results[first_name][second_name])
            )
        )


fig.update_layout(
    title_text = 'Similarity map between articles of encyclopedias',
    showlegend = True,
    geo = dict(
        resolution = 50,
        showland = True,
        # showlakes = True,
        landcolor = 'rgb(204, 204, 204)',
        # countrycolor = 'rgb(204, 204, 204)',
        # lakecolor = 'rgb(255, 255, 255)',
        projection_type = "equirectangular",
        # coastlinewidth = 2,
        lataxis = dict(
            range = [35, 58],
            # showgrid = True,
            dtick = 10
        ),
        lonaxis = dict(
            range = [-15, 20],
            # showgrid = True,
            dtick = 20
        ),
    )
)   
fig.update_layout(template = "none", width = 800, height = 600, legend_orientation = "h")
fig.show()

0.500889731993947
french english 0.0156128024980484
french italian 2.350210358334542
french spanish 3.98265617472298
english italian 0.010275116237252434
english spanish 0.015680945038287642
italian spanish 2.973535533749628


In [21]:
to_rgb(colorscales[3]),to_rgb(colorscales[0])

((49, 104, 142), (68, 1, 84))