#### Cross Lingual Document Similarity Analysis/Plagiarism Detection
##### The Entire Code is below:

In [1]:
import re
import nltk
from nltk.util import ngrams, pad_sequence, everygrams
from nltk.lm import MLE, WittenBellInterpolated
from scipy.ndimage import gaussian_filter
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import spacy
from zipfile import ZipFile
from googletrans import Translator
import googletrans
from PyPDF2 import PdfReader
import tkinter as tk
from tkinter import *
from tkinter.ttk import *
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
import os
from io import StringIO

-> preProcess(text): returns text

- Remove punctuation, new lines, tabs and extra spaces.
- Remove "Machine Translated By Google" tag
- Remove the Bibliography and References Section (this step is currently commented out in the code)


In [2]:
def preProcess(text):
    """Normalise raw text extracted from a PDF before tokenisation.

    Steps, in order:
      1. Replace every line that starts with an http(s) URL by a space.
      2. Strip punctuation (any character that is neither a word character
         nor whitespace).
      3. Turn newlines and tabs into spaces.
      4. Remove the "Machine Translated by Google" watermark.
      5. Collapse runs of whitespace into single spaces.

    Bibliography/References trimming is not performed here.

    Args:
        text: the raw text.

    Returns:
        The cleaned, single-line text.
    """
    # The URL pass must run first: once punctuation and newlines are gone,
    # "://" no longer exists and the line-anchored pattern can never match.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]','',text)
    # Newlines/tabs become spaces before the watermark is removed so that a
    # watermark broken across lines is still matched.
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace("Machine Translated by Google",' ')
    return " ".join(text.split())

-> extractText(fileName,start): 
- Returns extracted text from 'fileName' PDF from 'start' page

In [3]:
def extractText(fileName,start):
    """Extract the text of a PDF with pdfminer, leaving out one page.

    Every page is processed except the one whose zero-based index equals
    ``start - 1``; with ``start == 0`` nothing is skipped, with
    ``start == 1`` the first page is skipped.

    NOTE(review): the notebook describes this as "text from 'start' page".
    That reading and the actual behaviour coincide only for start in (0, 1),
    which are the only values used in this notebook -- confirm the intent
    before calling it with larger values.

    Args:
        fileName: path of the PDF to read.
        start: one-based number of the page to leave out (0 = keep all).

    Returns:
        The extracted text as a single string.
    """
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    skipped = start-1

    with io.StringIO() as retstr:
        with TextConverter(rsrcmgr, retstr, codec=codec,
                           laparams=laparams) as device:
            with open(fileName, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True)

                for pgNo, page in enumerate(pages):
                    if pgNo == skipped:
                        continue
                    interpreter.process_page(page)

                return retstr.getvalue()


-> extractFromZIP(zipFileName):
- Extracts the Corpus PDFs from the specified ZIP File

In [4]:
def extractFromZIP(zipFileName):
    """Unpack every member of ``zipFileName`` into the current working directory.

    The archive path is resolved relative to the working directory, so the
    ZIP is expected to sit next to the notebook.
    """
    archive = ZipFile(zipFileName, 'r')
    try:
        archive.extractall()
    finally:
        # Mirror the context-manager behaviour: always release the handle.
        archive.close()

In [5]:
# Load the spaCy 'en_core_web_lg' pipeline once at notebook level; createVector,
# createTestData and the corpus cells below all call nlp(...) on this object.
nlp = spacy.load('en_core_web_lg')

-> getSourceLangs():
- Get a dict mapping each source document (by index) currently present in the folder to its detected language

In [6]:
def getSourceLangs():
    """Detect the language of every source PDF in the working directory.

    For each PDF that is not a suspicious document, the text of its first
    page is sent to googletrans' language detector. The PDF is then deleted
    from the working directory.

    The file filter (".pdf"/".PDF" suffix, name not starting with "Sus") is
    the same one createCorpus() and removeOriginals() use, so the indexes
    produced here line up with the corpus indexes as long as both folders
    list their files in the same order.

    Returns:
        dict mapping a running index (os.listdir() order) to the capitalised
        language name, e.g. {0: "Spanish", 1: "French"}.
    """
    # Build a capitalised copy instead of rewriting googletrans.LANGUAGES,
    # which is a module-level dict shared with every other user of the library.
    lang_names = {
        code: name[0].upper() + name[1:]
        for code, name in googletrans.LANGUAGES.items()
    }

    translator = Translator()
    source_lang_dict = {}
    for x in os.listdir():
        if x.endswith((".pdf", ".PDF")) and not x.startswith("Sus"):
            reader = PdfReader(x)
            # extract_text() is the supported name; extractText() is the
            # deprecated camelCase alias and is gone from PyPDF2 3.x.
            story = str(reader.pages[0].extract_text())
            lang = translator.detect(story)
            source_lang_dict[len(source_lang_dict)] = lang_names[lang.lang]
            os.remove(x)

    return source_lang_dict

-> getSuspiciousDoc():
- Get the name of the Suspicious Document

In [7]:
def getSuspiciousDoc():
    """Return the name of the first PDF found in the working directory.

    Any file ending in ".pdf" or ".PDF" qualifies; the caller is expected to
    have removed the corpus PDFs beforehand. Returns None when no PDF exists.
    """
    pdfNames = (name for name in os.listdir()
                if name.endswith(".pdf") or name.endswith(".PDF"))
    return next(pdfNames, None)

-> createCorpus():
- Create and Return List of Source Doc Names

In [8]:
def createCorpus():
    """List the source PDFs in the working directory.

    A file counts as a source document when its name ends in ".pdf" or
    ".PDF" and does not start with "Sus" (the suspicious-document prefix).
    Names are returned in os.listdir() order.
    """
    def isSource(name):
        hasPdfSuffix = name.endswith(".pdf") or name.endswith(".PDF")
        return hasPdfSuffix and not name.startswith("Sus")

    return [name for name in os.listdir() if isSource(name)]

-> removeOriginals():
- Remove the Original Source Documents temporarily for further code execution

In [9]:
def removeOriginals():
    """Delete every source PDF from the working directory.

    Uses the same rule as createCorpus(): ".pdf"/".PDF" suffix and a name
    that does not start with "Sus". Suspicious documents and non-PDF files
    are left in place.
    """
    doomed = [
        name for name in os.listdir()
        if (name.endswith(".pdf") or name.endswith(".PDF"))
        and not name.startswith("Sus")
    ]
    for name in doomed:
        os.remove(name)

-> createListOfCorpus(corpus):
- Create and return list of Individual contents of the Corpus Docs

In [10]:
def createListOfCorpus(corpus):
    """Return the cleaned text of every corpus document, in corpus order.

    Each file is read with extractText(name, 1) and then normalised with
    preProcess().
    """
    return [preProcess(extractText(docName,1)) for docName in corpus]

-> createReferences(corpus):
- Create and return dictionary of the Corpus Doc Names for mapping back Plagiarised Portions

In [11]:
def createReferences(corpus):
    """Map each corpus position to its document name.

    The resulting {index: name} dict is used to translate source indexes in
    the plagiarism mapping back to file names.
    """
    return dict(enumerate(corpus))

-> createVector(listOfCorpus):
- Create and return list of vectors of the Individual contents of the Corpus Docs

In [12]:
def createVector(listOfCorpus):
    """Run every corpus text through the notebook-level ``nlp`` pipeline.

    Returns a list with one nlp(...) result per input text, in input order.
    """
    return [nlp(docText) for docText in listOfCorpus]

-> trainModel(trainingVector, n):
- Trains model based on Witten Bell Interpolation over trainingVector (of entire Corpus combined) and n-gram value 'n'
- Returns model

In [13]:
def trainModel(trainingVector, n):
    """Fit a Witten-Bell interpolated n-gram language model.

    Args:
        trainingVector: iterable of tokens exposing ``.text`` (the processed
            combined corpus).
        n: n-gram order.

    Returns:
        The fitted WittenBellInterpolated model.
    """
    tokens = [tok.text for tok in trainingVector]
    # Left-pad only, so the first real tokens have a full-length context.
    padded = list(pad_sequence(tokens, n,
                               pad_left=True,
                               left_pad_symbol="<s>"))
    # All 1..n-grams of the padded token stream.
    grams = list(everygrams(padded, max_len=n))
    lm = WittenBellInterpolated(n)
    lm.fit([grams],vocabulary_text=padded)
    return lm

-> createTestData(fileName):
- Create the test data from the Suspicious Doc and return its vector

In [14]:
def createTestData(fileName, ngramOrder=None):
    """Build the test data for the suspicious document.

    The whole PDF is read (extractText with start=0), cleaned with
    preProcess(), run through ``nlp`` and left-padded with "<s>" symbols.

    Args:
        fileName: path of the suspicious PDF.
        ngramOrder: n-gram order used for padding. When omitted, the
            notebook-level global ``n`` is used, so existing one-argument
            calls keep working.

    Returns:
        (test_data, testVector, suspiciousText): the padded token list, the
        nlp(...) result and the cleaned text.
    """
    if ngramOrder is None:
        # Fall back to the notebook global; raises NameError if the cell
        # defining ``n`` has not been run yet.
        ngramOrder = n
    suspiciousText = extractText(fileName,0)
    suspiciousText = preProcess(suspiciousText)
    testVector = nlp(suspiciousText)
    words = [w.text for w in testVector]
    test_data = list(pad_sequence(words, ngramOrder,
                                pad_left=True,
                                left_pad_symbol="<s>"))
    return test_data,testVector,suspiciousText


-> generateScores(model, test_data):
- Get scores of probability of plagiarism on individual words based on context of neighbouring 'n' words
- Return numpy array of those scores

In [15]:
def generateScores(model, test_data, ngramOrder=None):
    """Score every token of the padded test data against the language model.

    Token ``test_data[i + ngramOrder - 1]`` is scored with the
    ``ngramOrder - 1`` tokens immediately before it as context.

    Args:
        model: object exposing ``score(word, context)``.
        test_data: padded token list as produced by createTestData().
        ngramOrder: n-gram order. When omitted, the notebook-level global
            ``n`` is used, so existing two-argument calls keep working.

    Returns:
        1-D numpy array with one score per unpadded token.
    """
    if ngramOrder is None:
        # Fall back to the notebook global; raises NameError if the cell
        # defining ``n`` has not been run yet.
        ngramOrder = n
    contextLen = ngramOrder - 1
    score = [
        model.score(word, test_data[i:i+contextLen])
        for i, word in enumerate(test_data[contextLen:])
    ]
    return np.array(score)

-> generateData(width,score_np):
- Creating another numpy array of matrix dimensions to fit in a heatmap
- Creating source labels to label individual Plagiarised portions
- Returns the padding size 'diff': the difference in length between the flattened array 'a' and score_np


In [16]:
def generateData(width,score_np):
    """Lay the per-token scores out as a smoothed (rows x width) grid.

    Args:
        width: number of columns in the grid.
        score_np: 1-D array of scores.

    Returns:
        (a, sourceLabel, diff):
          a           -- scores zero-padded to a multiple of ``width``,
                         Gaussian-smoothed (sigma=1.0) and reshaped to
                         (-1, width);
          sourceLabel -- flat list with one "Unplagiarised" entry per cell;
          diff        -- number of padding cells appended after the scores.
    """
    nScores = len(score_np)
    rows = np.ceil(nScores/width).astype("int32")
    nCells = width*rows

    # Every cell starts out labelled as unplagiarised; sources are filled in later.
    sourceLabel = ["Unplagiarised"]*nCells

    padded = np.zeros(nCells)
    padded[:nScores] = score_np
    diff = len(padded) - nScores

    # Smooth while still 1-D so the filter follows reading order, then fold into rows.
    a = gaussian_filter(padded, sigma=1.0).reshape(-1,width)
    return a, sourceLabel, diff

-> getPlagPercentage(a,diff):
- Get back the Percentage of Plagiarism
- Get back the indexes of the Plagiarised words

In [17]:
def getPlagPercentage(a,diff):
    """Count the cells flagged as plagiarised and express them as a percentage.

    Cell ``i`` is flagged when its own score exceeds 0.62, or when any of the
    cells at i-1, i+1 or i+2 exceeds 0.60. Only indexes 1 .. len-3 are
    examined, so the first cell and the last two are never flagged.

    Args:
        a: score grid (any shape; it is flattened).
        diff: number of padding cells at the end of the grid; they are
            excluded from the denominator.

    Returns:
        (plagPercentage, plID): the percentage of flagged cells and the list
        of flagged flat indexes in ascending order.
    """
    flat = a.reshape(-1)
    plID = []
    for i in range(1,flat.shape[0]-2):
        ownHit = flat[i] > 0.62
        neighbourHit = any(flat[k] > 0.60 for k in (i+1, i+2, i-1))
        if ownHit or neighbourHit:
            plID.append(i)

    plagWds = len(plID)
    plagPercentage = plagWds*100/(len(flat)-diff)

    return plagPercentage,plID

-> check_plagiarised(j,vecO,vecP,id,fin,plag_dict):
- Mapping the Plagiarised portions to their Sources
- Returns the dictionary with indexes of Plagiarised portions and their corresponding portions

In [18]:
def check_plagiarised(j,vecO,vecP,id,fin,plag_dict):
    """Look for the source of one suspicious span inside one corpus document.

    Three 12-token probes are cut from the suspicious document: the start of
    the span, its end, and its middle. A 12-token window is slid over the
    corpus document; the first window whose similarity to a probe exceeds the
    threshold (0.995 for the start/end probes, 0.997 for the middle probe) is
    recorded and the search stops.

    Args:
        j: index of the corpus document (key into ``plag_dict``).
        vecO: processed corpus document; must support len(), slicing, and
            ``similarity`` on its slices.
        vecP: processed suspicious document (same requirements).
        id: index of the first token of the suspicious span.
        fin: index of the last token of the suspicious span.
        plag_dict: mapping {doc index: [[[id, fin], [k, k+fin-id+1]], ...]};
            updated in place.

    Returns:
        ``plag_dict`` (the same object), with at most one new entry under ``j``.
    """
    # The probes depend only on the suspicious span, so build them once
    # instead of re-slicing vecP for every window position.
    mid = (id+fin)//2
    X1 = vecP[id:id+12]
    X2 = vecP[fin-12:fin]
    X3 = vecP[mid-6:mid+6]

    for k in range(len(vecO)-3):
        # Candidate window of the original text starting at token k.
        Y1 = vecO[k:k+12]

        if (X1.similarity(Y1) > 0.995
                or X2.similarity(Y1) > 0.995
                or X3.similarity(Y1) > 0.997):
            # Suspicious span [id, fin] paired with an equally long stretch
            # of the source starting at k.
            plag_dict.setdefault(j, []).append([[id,fin],[k,k+fin-id+1]])
            break

    return plag_dict

-> getJumpStates(plID):
- Returns the specific Indexes of the Plagiarised portions from the Suspicious Doc

In [19]:
def getJumpStates(plID):
    """Collapse flagged token indexes into [start, end, start, end, ...] pairs.

    Consecutive flagged indexes belong to the same portion unless the gap
    between them is larger than 11, in which case the current portion is
    closed and a new one is opened.

    Args:
        plID: ascending list of flagged token indexes.

    Returns:
        Flat list of even length holding the first and last index of every
        portion; an empty list when ``plID`` is empty.
    """
    if len(plID) == 0:
        return []

    jumpStates = [plID[0]]
    # Compare each index with its predecessor only; starting at position 0
    # would wrap around and compare the first index with the last one.
    for prev, cur in zip(plID, plID[1:]):
        if cur - prev > 11:
            jumpStates.append(prev)
            jumpStates.append(cur)

    jumpStates.append(plID[-1])
    return jumpStates

-> getPlagDict(jumpStates,vector,testVector):
- For each pair in the jumpStates, it adds its source to the plag_dict and returns the plag_dict after completion

In [20]:
def getPlagDict(jumpStates,vector,testVector):
    """Map every suspicious portion to the corpus documents it matches.

    ``jumpStates`` is read as consecutive (start, end) pairs. Each pair is
    checked against every corpus document with check_plagiarised(), which
    accumulates the matches in a single dict.

    Returns:
        The dict built by check_plagiarised(), keyed by corpus index.
    """
    plag_dict = {}
    for pos in range(0,len(jumpStates),2):
        spanStart, spanEnd = jumpStates[pos], jumpStates[pos+1]

        # Try the portion against each source document in turn.
        for docIdx, docVec in enumerate(vector):
            plag_dict = check_plagiarised(docIdx,docVec,testVector,spanStart,spanEnd,plag_dict)

    return plag_dict

-> populateSourceLabel(plag_dict,sourceLabel,references,width):
- Fills the SourceLabels with the original names of the sources from which plagiarised portions have been taken

In [21]:
def populateSourceLabel(plag_dict,sourceLabel,references,width):
    """Write source file names over the cells of every mapped portion.

    Args:
        plag_dict: {source index: [[[start, end], [...]], ...]}.
        sourceLabel: flat list of per-cell labels; modified in place.
        references: {source index: file name}.
        width: number of columns of the heatmap grid.

    Returns:
        The labels as a numpy array reshaped to (-1, width).
    """
    for srcIdx, matches in plag_dict.items():
        for match in matches:
            first, last = match[0][0], match[0][1]
            cells = slice(first, last+1)
            # Same-length replacement, so the list size never changes even
            # when the span runs past the end.
            sourceLabel[cells] = [references[srcIdx] for _ in sourceLabel[cells]]

    return np.array(sourceLabel).reshape(-1,width)

-> createLabels(testVector,width,diff):
- Creates the content for writing in the Heatmaps
- Basically the entire suspicious doc

In [22]:
def createLabels(testVector,width,diff):
    """Build the row texts shown in the heatmap.

    The suspicious document is cut into chunks of ``width`` tokens.

    Args:
        testVector: processed suspicious document (sliceable, slices expose ``.text``).
        width: tokens per heatmap row.
        diff: number of empty strings appended to the last row's word list.

    Returns:
        (labels, labels_individual): each row's text padded/truncated to
        exactly 60 characters, and each row's list of words.
    """
    rowTexts = []
    for start in range(0, len(testVector), width):
        chunk = testVector[start:start+width]
        # Normalise internal whitespace of the chunk text.
        rowTexts.append(" ".join(chunk.text.split()))

    labels_individual = [row.split() for row in rowTexts]
    labels_individual[-1].extend([""]*diff)
    labels = [format(row, "60.60") for row in rowTexts]

    return labels,labels_individual

-> generateHeatMap(a,sourceLabel,width,height,labels_individual,suspiciousDoc):
- Generates heatmap to help visualise the plagiarised portions
- Returns the created figure

In [23]:
def generateHeatMap(a,sourceLabel,width,height,labels_individual,suspiciousDoc):
    """Build the plotly heatmap of the suspicious document.

    Args:
        a: score array; reshaped to (-1, width) for the z values.
        sourceLabel: per-cell source names shown on hover.
        width: number of columns.
        height: number of rows; the figure is 25 px tall per row.
        labels_individual: per-cell words drawn inside the cells.
        suspiciousDoc: file name; used as title with ".pdf" removed.

    Returns:
        The plotly Figure, with the y axis reversed.
    """
    grid = a.reshape(-1,width)
    trace = go.Heatmap(
        z=grid, x0=0, dx=1,
        zmin=0, zmax=1,
        customdata=sourceLabel,
        hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
        text=labels_individual,
        texttemplate='%{text}',
        textfont={"size":7},
        colorscale='reds'
    )
    fig = go.Figure(data=trace)

    layout = {
        "height": height*25,
        "width": 1000,
        "font": {"family": "sans-serif"},
        "title": suspiciousDoc.replace(".pdf",''),
    }
    fig.update_layout(layout)
    # Reverse the y axis so the first row is drawn at the top.
    fig['layout']['yaxis']['autorange'] = "reversed"
    return fig

-> displayDoc(jumpStates,testVector,suspiciousText,suspiciousDoc):
- Displays the Entire Suspicious Doc in a TextBox
- Has two buttons: Highlight and Exit
- Highlight button highlights the Plagiarised Portions and Exit button closes the window


In [24]:
def displayDoc(jumpStates,testVector,suspiciousText,suspiciousDoc):
    """Open a Tk window showing the suspicious text with a highlight button.

    The window contains a title label (the file name without ".pdf"), a text
    box filled with ``suspiciousText``, an "Exit" button and a
    "Highlight Plagiarised Portions" button. The function blocks in
    ``root.mainloop()`` until the window is closed.

    Args:
        jumpStates: flat list read as (start, end) token-index pairs.
        testVector: processed suspicious document; slices expose ``.text``.
        suspiciousText: the text inserted into the text box.
        suspiciousDoc: file name of the suspicious document.

    Raises:
        IndexError: from the highlight-location lookup when the opening or
            closing phrase of a portion is not found in the text box.
    """
    root = Tk()
    
    # specify size of window.
    root.geometry("1000x800")
    root.config(bg='#59FC97')

    style = Style()

    # Named style shared by both buttons.
    style.configure('W.TButton', font =
               ('calibri', 10, 'bold'),
                foreground = 'black',
                background = 'purple')
    
    # Create text widget and specify size.
    T = Text(root, wrap = 'word', height = 38, width = 120, font=("calibri", 12), bg='black', fg='white')
    
    # Create label
    l = Label(root, text = suspiciousDoc.replace(".pdf",''))
    l.config(font =("calibri", 20), background='#59FC97', foreground='black')
            
    
    # Create an Exit button.
    b1 = Button(root, text = "Exit",
                style='W.TButton',
                command = root.destroy)

    # Pack order sets the layout: title first, then the text box, then the buttons.
    l.pack()
    T.pack()
    b1.pack(side=RIGHT, padx=350, pady=25)

    # Insert The Text
    T.insert(tk.END, suspiciousText)

    def search_re(pattern):
        """
        Find every regex match of ``pattern`` in the text box, line by line.

        pattern - the pattern to match; it is passed to re.finditer as-is
                  (not escaped).

        Returns a flat list of Tk text indexes "line.column", alternating
        match start and match end.
        """
        matches = []
        text = T.get("1.0", tk.END).splitlines()
        for i, line in enumerate(text):
            for match in re.finditer(pattern, line):
                # Tk text lines are 1-based, columns 0-based.
                matches.append(f"{i + 1}.{match.start()}")
                matches.append(f"{i + 1}.{match.end()}")
        
        return matches
    
    def getLocDict(jumpStates):
        """Map the Tk start index of each portion to its Tk end index."""
        loc_dict = {}
        for i in range(0,len(jumpStates),2):
            # Opening phrase: the first 4 tokens of the portion.
            a = search_re(testVector[jumpStates[i]:jumpStates[i]+4].text)
            # Closing phrase: the last 4 tokens of the portion.
            b = search_re(testVector[jumpStates[i+1]-3:jumpStates[i+1]+1].text)
            # a[0] = start of the first opening match, b[1] = end of the
            # first closing match; both raise IndexError when nothing matched.
            loc_dict[a[0]] = b[1]
        return loc_dict

    def add_highlighter(s,e):
        """Tag the range s..e and colour that tag."""
        T.tag_add("start", s, e)
        T.tag_config("start", background= "#FE5E61", foreground= "white")

    def highlight_all(loc_dict):
        """Highlight every start/end range stored in loc_dict."""
        for i in loc_dict:
            add_highlighter(i, loc_dict[i])
    
    # Locations are computed once, before the window is shown.
    loc_dict = getLocDict(jumpStates)

    b2 = Button(root, text= "Highlight Plagiarised Portions", style= 'W.TButton', command = lambda: highlight_all(loc_dict))
    b2.pack(side = BOTTOM, padx = 200, pady = 25)

    root.mainloop()

-> getPieLabelAndVals_1(plag_dict,plID):
- Set and return the labels for the Pie Chart 1 which will contain overall results of Plagiarism Check
- Set and return the values for calculating percentage values in Pie Chart

In [25]:
def getPieLabelAndVals_1(plag_dict,plID,totalWords=None):
    """Labels and values for the overall plagiarised/unplagiarised pie chart.

    Args:
        plag_dict: {source index: [[[start, end], [...]], ...]}.
        plID: list of flagged token indexes.
        totalWords: number of scored words. When omitted, the length of the
            notebook-level global ``score_np`` is used, so existing
            two-argument calls keep working.

    Returns:
        (pieL1, pieVal1): the labels ["Unplagiarised", "Plagiarised"] and the
        values [totalWords - len(plID), total length of all mapped spans].
    """
    if totalWords is None:
        # Fall back to the notebook global; raises NameError if the scoring
        # cell has not been run yet.
        totalWords = len(score_np)

    pieL1 = ["Unplagiarised", "Plagiarised"]

    # Add up the inclusive length of every mapped span across all sources.
    plagiarised = 0
    for spans in plag_dict.values():
        for span in spans:
            st, en = span[0]
            plagiarised += en - st + 1

    pieVal1 = [totalWords - len(plID), plagiarised]
    return pieL1,pieVal1

-> getPieLabelAndVals_2(plag_dict,plID,references):
- Set and return the labels for the Pie Chart 2 which will contain detailed results of Plagiarism Check depicting percentage of Plagiarism
    attributed to each Source
- Set and return the values for calculating percentage values in Pie Chart 2

In [26]:
def getPieLabelAndVals_2(plag_dict,plID,references,totalWords=None):
    """Labels and values for the per-source pie chart.

    Args:
        plag_dict: {source index: [[[start, end], [...]], ...]}.
        plID: list of flagged token indexes.
        references: {source index: file name}.
        totalWords: number of scored words. When omitted, the length of the
            notebook-level global ``score_np`` is used, so existing
            three-argument calls keep working.

    Returns:
        (pieL2, pieVal2): "Unplagiarised" followed by every source name, and
        the matching word counts (0 for sources without a mapped span).
    """
    if totalWords is None:
        # Fall back to the notebook global; raises NameError if the scoring
        # cell has not been run yet.
        totalWords = len(score_np)

    pieL2 = ["Unplagiarised"] + [references[i] for i in references]

    pieVal2 = [0]*len(pieL2)
    pieVal2[0] = totalWords - len(plID)

    for i, spans in plag_dict.items():
        # Slot i+1 because slot 0 holds the unplagiarised count.
        pieVal2[i+1] = sum(span[0][1] - span[0][0] + 1 for span in spans)

    return pieL2,pieVal2

-> createDataFrames(pieL1,pieL2,pieVal1,pieVal2):
- Create pandas Data Frame of Labels and Values for both Pie Charts 

In [27]:
def createDataFrames(pieL1,pieL2,pieVal1,pieVal2):
    """Wrap the two pie charts' labels and values in DataFrames.

    Returns:
        (df1, df2): df1 has columns "Status" and "No. of Words";
        df2 has columns "Source" and "No. of Words".
    """
    def asFrame(labelColumn, labels, counts):
        return pd.DataFrame({labelColumn: labels, "No. of Words": counts})

    return asFrame("Status", pieL1, pieVal1), asFrame("Source", pieL2, pieVal2)

-> createPieCharts(df1,df2):
- Create the Pie charts for both data frames with custom features for better visibility

In [28]:
def createPieCharts(df1,df2):
    """Create the two donut charts.

    figP1 plots df1 ("Status" vs "No. of Words", 400x400, Bluered_r colours);
    figP2 plots df2 ("Source" vs "No. of Words", 800x400, Rainbow_r colours).
    Both use a black background and the same fonts and margins.

    Returns:
        (figP1, figP2)
    """
    def styledPie(frame, nameColumn, title, palette, figWidth):
        # Build one donut and apply the styling common to both charts.
        pie = px.pie(frame,values='No. of Words', names=nameColumn,
                     title = title,
                     color_discrete_sequence=palette,
                     hole=0.5)
        pie.update_layout({"height":400, "width":figWidth, "font":{"family":"sans-serif"}})
        pie.update_layout(paper_bgcolor="#000000")
        pie.update_layout(
            font_family="sans-serif",
            font_color="cyan",
            title_font_family="sans-serif",
            title_font_color="white",
            legend_title_font_color="cyan"
        )
        pie.update_layout(margin=dict(t=40, b=0, l=0, r=0))
        return pie

    figP1 = styledPie(df1, 'Status', "Plagiarism Status",
                      px.colors.sequential.Bluered_r, 400)
    figP2 = styledPie(df2, 'Source', "Plagiarism Source",
                      px.colors.sequential.Rainbow_r, 800)

    return figP1,figP2

In [29]:
def modify_df(df2,source_lang_dict):
    """Add percentage and language columns and return a styled table.

    ``df2`` is modified in place: a "Percent" column (share of the total word
    count) and a "Source Language" column are added. Row 0 gets '---' as its
    language; row x gets ``source_lang_dict[x-1]``.

    Returns:
        A pandas Styler over ``df2`` whose "Source" cells are rendered as
        HTML links that open in a new tab.
    """
    wordCounts = df2['No. of Words']
    df2['Percent'] = wordCounts/sum(wordCounts)*100
    df2['Source Language'] = ''

    for row in df2.index:
        language = '---' if row==0 else source_lang_dict[row-1]
        df2.loc[row,'Source Language'] = language

    def asLink(val):
        return f'<a target="_blank" href="{val}">{val}</a>'

    return df2.style.format({'Source': asLink})

- The real code execution begins here
- Please take care of the following points:
    - The Documents in Corpus/Source, must be contained in a zip file in their original (untranslated) versions
    - The Documents in Corpus/Source, must be contained in another zip file in their derived (translated) versions
    - The Suspicious Document must be contained in a separate zip file
- The User would be asked for their input 3 times during the execution henceforth
- Inputs would ask for the names of the aforementioned zip files

In [30]:
# Ask for the archive that holds the corpus in its original languages.
zipUntranslated = input("Name of the Corpus (Untranslated) zip file (put .zip in name): ")

In [None]:
# Unpack the untranslated corpus into the working directory and record each
# document's language. getSourceLangs() deletes the PDFs it inspects, so the
# folder is clean again before the translated corpus is unpacked.
extractFromZIP(zipUntranslated)
source_lang_dict = getSourceLangs()

In [None]:
# Ask for the archive that holds the translated versions of the corpus.
zipTranslated = input("Name of the Corpus (Translated) zip file (put .zip in name): ")

In [None]:
# Call all the functions
# Unpack the translated corpus, list its PDFs, then build the index->name map,
# the cleaned text of each document and the processed (nlp) form of each text.
extractFromZIP(zipTranslated)
corpus = createCorpus()
references = createReferences(corpus)
listOfCorpus = createListOfCorpus(corpus)
vector = createVector(listOfCorpus)

In [None]:
# Create mega-corpus and training vector from the mega-corpus
# (all corpus texts joined with newlines and processed as one document)
megaCorpus = '\n'.join(listOfCorpus)
trainingVector = nlp(megaCorpus)

In [None]:
# Value of n for n-grams
# createTestData() and generateScores() read this name as a global, so this
# cell must run before they are called.
n = 5

In [None]:
# Call function to train the model
model = trainModel(trainingVector,n)

In [None]:
# Remove original corpus from current folder temporarily
# (getSuspiciousDoc() returns the first PDF it finds, so no corpus PDF may be
# left in the folder when the suspicious document is unpacked)
removeOriginals()

In [None]:
# Ask for the archive that holds the suspicious document.
zipSuspicious = input("Name of the zip file with Suspicious Doc (put .zip in name): ")

In [None]:
# Extract suspicious document, create test data vector
extractFromZIP(zipSuspicious)
suspiciousDoc = getSuspiciousDoc()
test_data,testVector,suspiciousText = createTestData(suspiciousDoc)

In [None]:
# Get scores for each token/word in the test data vector
score_np = generateScores(model,test_data)
# Keeping width = 22 for readability
# (the same width is passed to every heatmap helper below)
a,sourceLabel,diff = generateData(22,score_np)
plagPercentage,plID = getPlagPercentage(a,diff)

In [None]:
jumpStates = []
# If there are no plagiarised sections then jumpStates will be empty
# (getJumpStates() indexes plID[0], so it is only called for a non-empty list)
if len(plID)!=0:
    jumpStates = getJumpStates(plID)

In [None]:
# Get the mapping of plagiarised sections to source documents
plag_dict = getPlagDict(jumpStates,vector,testVector)

In [None]:
# Generate labels and values for heatmap
sourceLabel = populateSourceLabel(plag_dict,sourceLabel,references,22)
labels,labels_individual = createLabels(testVector,22,diff)

In [None]:
# Create all the visualisation techniques
# Number of heatmap rows for a width of 22 columns.
height = np.ceil(len(score_np)/22).astype("int32")
fig = generateHeatMap(a,sourceLabel,22,height,labels_individual,suspiciousDoc)
pieL1,pieVal1 = getPieLabelAndVals_1(plag_dict,plID)
pieL2,pieVal2 = getPieLabelAndVals_2(plag_dict,plID,references)
df1,df2 = createDataFrames(pieL1,pieL2,pieVal1,pieVal2)
figP1,figP2 = createPieCharts(df1,df2)
# A Separate DataFrame for the Suspicious Text
# (word count = sum of df2's counts; the language is hard-coded to "English")
data = [[suspiciousDoc,sum(df2['No. of Words']),plagPercentage,"English"]]
df3 = pd.DataFrame(data,columns=['File Name','No. of Words','Plagiarised Percentage','Language'])

In [None]:
# Modify the DataFrame for the Suspicious Text to make it clickable
# make_clickable wraps a cell value in an HTML link that opens in a new tab;
# df3 is rebound to the resulting pandas Styler.
def make_clickable(val):
        return f'<a target="_blank" href="{val}">{val}</a>'
df3 = df3.style.format({'File Name': make_clickable})

In [None]:
# Re-extract the source documents from the zip file
# (they were deleted by removeOriginals() earlier)
extractFromZIP(zipTranslated)

- Below this point, there are 5 ways to visualise/view the Results
- The ways are:
    - Suspicious Document Heatmap:
        - Entire Document broken into block of words
        - Plagiarised portions highlighted in varying shades of red
        - Hover over any Plagiarised Portion to reveal Original Source Document
    - Suspicious Document Textbox:
        - View entire Document
        - Two buttons Highlight and Exit
        - Click on Highlight to highlight plagiarised portions in Red
    - Pie Chart 1:
        - Displays Overall percentages of Plagiarism
    - Pie Chart 2:
        - Detailed Percentages of Plagiarism from each source
    - Table:
        - Table to display all details of Pie Chart 2 in tabular format (and a separate one to view Suspicious Document)
        - Contains a column 'Source Language' to display language of Source Document
        - Has Clickable Link in 'Source' column which opens that Document in a separate tab

In [None]:
# Show heatmap
fig.show()

In [None]:
# Open Textbox 
# (blocks until the Tk window is closed)
displayDoc(jumpStates,testVector,suspiciousText,suspiciousDoc)

In [None]:
# Show Pie Chart-I
figP1.show()

In [None]:
# Show Pie Chart-II
figP2.show()

In [None]:
# Modify the DataFrame and add columns
# modify_df() returns a Styler, so after this cell df2 is no longer a plain
# DataFrame; re-run the cell that creates df2 before running this one again.
df2 = modify_df(df2,source_lang_dict)

In [None]:
# Show tabulated data for Source documents
df2

Unnamed: 0,Source,No. of Words,Percent,Source Language
0,Unplagiarised,558,44.14557,---
1,Applications of neural networks and deep learning to biomedical engineering.pdf,109,8.623418,Spanish
2,Artificial intelligence-machine learning in Public Administration.pdf,61,4.825949,Portuguese
3,"Images, ecology and deep learning.pdf",101,7.990506,French
4,Is it possible to develop quantum computing in Venezuela.pdf,207,16.376582,Spanish
5,The Nuclear Mirage.pdf,228,18.037975,Spanish


In [None]:
# Show tabulated data for Suspicious document
# (df3 is the Styler created above, with the file name rendered as a link)
df3

Unnamed: 0,File Name,No. of Words,Plagiarised Percentage,Language
0,Science Article.pdf,1264,55.67911,English
