In [1]:
# Import all necessary libraries and functions

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
!pip install PyPDF2
import PyPDF2
!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import re
!pip install python-docx
import docx
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0


In [2]:
# Converting every line of every text into a sequence of token ids

def tokenizing(flat_text,text):
  """Map each document's lines to integer token sequences.

  A single Tokenizer is fitted on ``flat_text`` (all lines of all documents)
  so that every document shares the same word-to-id vocabulary.

  Args:
    flat_text: flat list with every line of every document.
    text: list of documents, each document being a list of lines.

  Returns:
    A list with one entry per document; each entry is whatever
    ``texts_to_sequences`` returns for that document's lines.
  """
  shared_tokenizer = Tokenizer()
  shared_tokenizer.fit_on_texts(flat_text)
  return [shared_tokenizer.texts_to_sequences(document) for document in text]

In [3]:
# Converting every line of every text into a TF-IDF vector

def vecTfid(flat_text,text):
  """Vectorize each document's lines with one shared TfidfVectorizer.

  The vectorizer is fitted on ``flat_text`` (all lines of all documents) so
  the columns of every returned matrix refer to the same vocabulary.

  Args:
    flat_text: flat list with every line of every document.
    text: list of documents, each document being a list of lines.

  Returns:
    A list with one dense array per document (one row per line).
  """
  shared_vectorizer = TfidfVectorizer()
  shared_vectorizer.fit(flat_text)
  return [shared_vectorizer.transform(document).toarray() for document in text]

In [4]:
# Using the flag system to count copy-pasted and high-similarity lines

def count_flag(token1, token2, tf1, tf2, exact_threshold=0.9999, high_threshold=0.7, low_threshold=0.35):
  """Flag the lines of document 1 that resemble lines of document 2.

  Two per-line similarity matrices are built and compared to the thresholds:
    * ``tf_cos``: cosine similarity between the TF-IDF rows of both documents.
    * ``token_cos``: Jaccard overlap (shared ids / all ids) between the
      token-id sets of both documents.

  Args:
    token1: token-id sequences of document 1, one per line.
    token2: token-id sequences of document 2, one per line.
    tf1: TF-IDF rows of document 1, one per line.
    tf2: TF-IDF rows of document 2, one per line.
    exact_threshold: similarity at or above which a line counts as identical.
    high_threshold: lower bound (exclusive) of the "strongly similar" band.
    low_threshold: lower bound (exclusive) of the "weakly similar" band,
      only used for the TF-IDF similarity.

  Returns:
    A tuple ``(plag_tf_token, di, high_sim, copy_paste, struc_token, struc_tf)``:
      plag_tf_token: number of lines whose combined TF-IDF flag (0, 1 or 2)
        and token flag (0 or 1) is greater than 1.
      di: number of lines in document 1 (``len(tf1)``).
      high_sim / copy_paste / struc_token / struc_tf: lists of
        ``[line_index_in_doc1, [matching_line_indices_in_doc2]]``.
  """
  # Defining all necessary flag variables
  plag_tf_token = 0
  di = len(tf1)
  high_sim = []
  copy_paste = []
  struc_token = []
  struc_tf = []
  token_cos = []

  # Getting the similarity on Tokenized text and Vectorized text
  tf_cos = cosine_similarity(tf1,tf2)

  # Build each line's id set once instead of once per compared pair.
  sets1 = [set(line) for line in token1]
  sets2 = [set(line) for line in token2]
  for left in sets1:
    cos_line = []
    for right in sets2:
      leng = len(left | right)
      # Two lines without any token share nothing: score 0 instead of
      # dividing by zero.
      cos_line.append(len(left & right) / leng if leng else 0.0)
    token_cos.append(cos_line)

  # Flag system:
  # 1. A line that is (almost) identical to a line of the other document is
  #    recorded in high_sim (TF-IDF) or copy_paste (token overlap).
  # 2. Otherwise the TF-IDF similarity gives flag 2 above high_threshold and
  #    flag 1 above low_threshold; the token overlap gives flag 1 above
  #    high_threshold.
  # 3. plag_tf_token counts the lines whose two flags sum to more than 1.
  for i in range(di):
    tf = 0
    token = 0

    tf_row = tf_cos[i]
    if any(vals >= exact_threshold for vals in tf_row):
      high_sim.append([i,[index for index, vals in enumerate(tf_row) if vals >= exact_threshold]])
    elif any(high_threshold < vals < exact_threshold for vals in tf_row):
      # The listed matches deliberately use the lower bound so that every
      # related line is reported, not only the strongest ones.
      struc_tf.append([i,[index for index, vals in enumerate(tf_row) if low_threshold < vals < exact_threshold]])
      tf = 2
    elif any(low_threshold < vals < exact_threshold for vals in tf_row):
      struc_tf.append([i,[index for index, vals in enumerate(tf_row) if low_threshold < vals < exact_threshold]])
      tf = 1

    token_row = token_cos[i]
    if any(vals >= exact_threshold for vals in token_row):
      copy_paste.append([i,[index for index, vals in enumerate(token_row) if vals >= exact_threshold]])
    elif any(high_threshold < vals < exact_threshold for vals in token_row):
      struc_token.append([i,[index for index, vals in enumerate(token_row) if high_threshold < vals < exact_threshold]])
      token = 1

    if token + tf > 1:
      plag_tf_token += 1

  return plag_tf_token, di, high_sim, copy_paste, struc_token, struc_tf

In [5]:
# Defining the function to remove unnecessary symbols and empty tokens

def remove(text):
  """Strip punctuation from ``text`` and normalise its spacing.

  Quotes, colons, semicolons, underscores, commas, periods, parentheses,
  en dashes and square brackets are deleted; hyphens and slashes become
  spaces. The result is tokenized with ``word_tokenize`` and the non-empty
  tokens are joined back with single spaces.

  Args:
    text: the raw line.

  Returns:
    The cleaned line (an empty string when nothing is left).
  """
  without_symbols = re.sub(r'[“”‘’:;"_\',.()\–\[\]]', '', text)
  with_spaces = re.sub(r'[\-\/]', ' ', without_symbols)
  return ' '.join(filter(None, word_tokenize(with_spaces)))

## **Word Path**

In [6]:
# Obtaining all files whose name ends with .docx

word_path = '/content/try-1'

def word_file(word_path):
  """List the .docx files found directly inside ``word_path``.

  The extension check is case-insensitive.

  Args:
    word_path: folder to scan.

  Returns:
    A 1-D object array with the matching file names (names only, no folder).
  """
  docx_names = []
  for entry in os.listdir(word_path):
    if entry.lower().endswith('.docx'):
      docx_names.append(entry)
  return np.array(docx_names, dtype=object)

## **Extract Word**

In [7]:
# Extracting the text from word files

def get_word_data(word_path,word_file):
  """Read a .docx file and return its cleaned, lower-cased lines.

  Every paragraph is lower-cased, split into sentences and then into lines;
  each line is cleaned with ``remove`` and empty lines are discarded.

  The result is trimmed to the body of the document: it starts right after a
  line equal to 'latar belakang' and stops before a line equal to
  'daftar pustaka' (or, failing that, 'reference'). When a heading is
  missing, that side of the text is left untrimmed.

  Args:
    word_path: folder containing the file.
    word_file: file name inside ``word_path``.

  Returns:
    A list of cleaned lines.
  """
  word_read = docx.Document(word_path+'/'+word_file)

  # Reading each paragraph inside the text
  par_arr = []
  for par in word_read.paragraphs:
    for sent in sent_tokenize(par.text.lower()):
      cleaned = (remove(line) for line in sent.split('\n'))
      par_arr.extend(line for line in cleaned if line)

  # To define the first index and last index, may be removed depending on the format of text.
  # ``end`` stays None when no closing heading exists, so the slice runs to
  # the real end of the text (a -1 sentinel would drop the last line).
  start = 0
  end = None
  if 'latar belakang' in par_arr:
    start = par_arr.index('latar belakang') + 1
  if 'daftar pustaka' in par_arr:
    end = par_arr.index('daftar pustaka')
  elif 'reference' in par_arr:
    end = par_arr.index('reference')

  if start > 0 or (end is not None and end > 0):
    return par_arr[start:end]
  return par_arr

## **Get Word Text List**

In [8]:
# Creating an array containing the file name and text of each file

def list_word_text(word_path,word_file):
  """Pair every .docx file name with its extracted lines.

  Args:
    word_path: folder containing the files.
    word_file: iterable of file names inside ``word_path``.

  Returns:
    An object array of shape (number_of_files, 2); column 0 holds the file
    name and column 1 the list of lines returned by ``get_word_data``.
  """
  file_names = list(word_file)
  # The array is allocated with dtype=object and filled cell by cell:
  # handing NumPy a [name, list_of_lines] row makes it try to build a ragged
  # array, which recent NumPy versions reject with a ValueError.
  text_list = np.empty((len(file_names), 2), dtype=object)
  for row, files in enumerate(file_names):
    text_list[row, 0] = files
    text_list[row, 1] = get_word_data(word_path,files)
  return text_list

# **PDF**

## **PDF Path**

In [9]:
# Obtaining all files whose name ends with .pdf

folder_pdf = '/content/tryp'

def pdf_file(folder_pdf):
  """List the .pdf files found directly inside ``folder_pdf``.

  The extension check is case-insensitive.

  Args:
    folder_pdf: folder to scan.

  Returns:
    A 1-D object array with the matching file names (names only, no folder).
  """
  pdf_names = []
  for entry in os.listdir(folder_pdf):
    if entry.lower().endswith('.pdf'):
      pdf_names.append(entry)
  return np.array(pdf_names, dtype=object)

## **Extract PDF**
---


In [10]:
# Extracting the text from pdf files

def get_pdf(path,file_name):
  """Read a .pdf file and return its cleaned, lower-cased lines.

  The text of every page is lower-cased, split into sentences and then into
  lines; each line is cleaned with ``remove`` and empty lines are discarded.

  The result is trimmed to the body of the document: it starts right after a
  line equal to 'latar belakang' and stops before a line equal to
  'daftar pustaka' (or, failing that, 'reference'). When a heading is
  missing, that side of the text is left untrimmed.

  Args:
    path: folder containing the file.
    file_name: file name inside ``path``.

  Returns:
    A list of cleaned lines.
  """
  line_arr = []

  # Reading text in pdf file by page
  with open(path+'/'+file_name,'rb') as temp_pdf:
    read_pdf = PyPDF2.PdfReader(temp_pdf)
    for pdf_page in read_pdf.pages:
      # Guard against a page for which no text is returned.
      page = (pdf_page.extract_text() or '').lower()
      for sent in sent_tokenize(page):
        cleaned = (remove(line) for line in sent.split('\n'))
        line_arr.extend(line for line in cleaned if line)

  # To define the first index and last index, may be removed depending on the format of text.
  # ``end`` stays None when no closing heading exists, so the slice runs to
  # the real end of the text (a -1 sentinel would drop the last line).
  start = 0
  end = None
  if 'latar belakang' in line_arr:
    start = line_arr.index('latar belakang') + 1
  if 'daftar pustaka' in line_arr:
    end = line_arr.index('daftar pustaka')
  elif 'reference' in line_arr:
    end = line_arr.index('reference')

  if start > 0 or (end is not None and end > 0):
    return line_arr[start:end]
  return line_arr

## **Get PDF Text List**

In [11]:
# Creating an array containing the file name and text of each file

def list_all_text(pdf_folder_path,pdf_file_list):
  """Pair every .pdf file name with its extracted lines.

  Args:
    pdf_folder_path: folder containing the files.
    pdf_file_list: iterable of file names inside ``pdf_folder_path``.

  Returns:
    An object array of shape (number_of_files, 2); column 0 holds the file
    name and column 1 the list of lines returned by ``get_pdf``.
  """
  file_names = list(pdf_file_list)
  # The array is allocated with dtype=object and filled cell by cell:
  # handing NumPy a [name, list_of_lines] row makes it try to build a ragged
  # array, which recent NumPy versions reject with a ValueError.
  text_list = np.empty((len(file_names), 2), dtype=object)
  for row, files in enumerate(file_names):
    text_list[row, 0] = files
    text_list[row, 1] = get_pdf(pdf_folder_path,files)
  return text_list

# **Main Code**

In [18]:
def show_line(struc_token,struc_tf,text_list,high_sim,copy_paste,j,l):
  """Print the flagged line pairs of two documents, grouped by flag type.

  Every flag list holds ``[line_index_in_doc_l, [line_indices_in_doc_j]]``
  entries. For each non-empty list a heading is printed, followed by one
  "<line of l> vs <line of j>" row per match; rows are numbered by the
  position of the entry within its list.

  Args:
    struc_token: lines with a similar token structure.
    struc_tf: lines with a TF-IDF similarity above 35%.
    text_list: list of documents, each a list of lines.
    high_sim: lines with (almost) identical TF-IDF vectors.
    copy_paste: lines with (almost) identical token sets.
    j: index in ``text_list`` of the document compared against.
    l: index in ``text_list`` of the document being checked.
  """
  # Headings in the order they have to appear in the report
  report_sections = (
    ("\nLine with 99% similarity:", high_sim),
    ("\nCopy-pasted line:", copy_paste),
    ("\nLine with similar structure:", struc_token),
    ("\nLine with similarity higher than 35%:", struc_tf),
  )

  for heading, flagged in report_sections:
    if len(flagged) == 0:
      continue
    print(heading)
    for number, entry in enumerate(flagged, start=1):
      for other_index in entry[1]:
        print(f"\t {number}. {text_list[l][entry[0]]} vs {text_list[j][other_index]}")

In [16]:
# Main code of the system, calling all the necessary functions

def main_code(text_list):
  """Compare every pair of documents and print a plagiarism report.

  For each pair (i, j) with i < j the lines of document i are scored against
  document j with ``count_flag`` and the overall score is

      (plag_tf_token + (flag_tf + flag_token) / 2) / di

  where ``di`` is the number of lines of document i.

  Args:
    text_list: sequence of ``[file_name, lines]`` rows.

  Returns:
    None. The report is printed.
  """
  # Separating the filenames and texts from text_list array.
  # A file without any extracted line is left out: it has nothing to compare
  # and its line count of zero would make the score divide by zero.
  name = []
  text = []
  for entry in text_list:
    if len(entry[1]) == 0:
      print(f"file {entry[0]} has no extracted text and was skipped\n")
      continue
    name.append(entry[0])
    text.append(entry[1])

  # At least two documents are needed to form a pair
  if len(name) < 2:
    return

  # Flatten the texts so the tokenizer and vectorizer are fitted on every line
  flat_text = [line for document in text for line in document]

  # Calling the tokenizer and vectorizer function
  tfid = vecTfid(flat_text,text)
  token = tokenizing(flat_text,text)

  # Iterating each text to examine the similarity between text
  for i in range(len(tfid)):
    for j in range(i+1,len(tfid)):
      plag_tf_token, di,  high_sim, copy_paste, struc_token, struc_tf = count_flag(token[i], token[j], tfid[i], tfid[j])
      flag_tf = len(high_sim)
      flag_token = len(copy_paste)
      plag_score = (plag_tf_token+(flag_tf+flag_token)/2)/di
      # Number of distinct lines flagged by either check. Each entry is
      # [line_index, matches], so the line index is entry[0].
      tf_token = len({entry[0] for entry in struc_token} | {entry[0] for entry in struc_tf})

      # The plagiarism score leads to one of three reports:
      # 1. Score of 30% or more: a warning is displayed, along with the number of copy-pasted and high-similarity lines
      # 2. Score below 30% but with copy-pasted or high-similarity lines: a warning is displayed
      # 3. Score below 30% and no copy-pasted or high-similarity line: no warning is displayed

      print(f"file {name[i]} vs file {name[j]}")
      if plag_score >= 0.3:
        print(f"\t!Warning! There are overall {plag_score*100:.2f}% similarity score in both file! Bigger than 30%!")
        if flag_tf > 0:
          print(f"\t\tAmong which, there are {flag_tf} line with 99% similarity! About {flag_tf/di*100:.2f}% of the text!")
        if flag_token > 0:
          print(f"\t\tAmong which, there are {flag_token} line with 99% similar structure! About {flag_token/di*100:.2f}% of the text!")
        if plag_tf_token > 0:
          print(f"\t\tAmong which, there are {tf_token} line with either 70% more similarity or similar structure! About {tf_token/di*100:.2f}% of the text!")
        show_line(struc_token,struc_tf,text,high_sim,copy_paste,j,i)
      elif flag_tf > 0 or flag_token > 0:
        print(f"\t!Warning! Overall there are {plag_score*100:.2f}% similarity score in both file, less than 30%, however:")
        if flag_tf > 0:
          print(f"\t\tThere are {flag_tf} line with 99% similarity in both file! About {flag_tf/di*100:.2f}% of the text!")
        if flag_token > 0:
          print(f"\t\tThere are {flag_token} line with 99% similar structure in both file! About {flag_token/di*100:.2f}% of the text!")
        if plag_tf_token > 0:
          print(f"\t\tAmong which, there are {tf_token} line with either 70% more similarity or similar structure! About {tf_token/di*100:.2f}% of the text!")
        show_line(struc_token,struc_tf,text,high_sim,copy_paste,j,i)
      else:
          print(f"\tSimilarity score is {plag_score*100:.2f}%! Congratulations, you may upload your work :)!")

      print("\n---------------------------------------------------------------------------------------------------------------------------\n")

# **Word Plagiarism Checker Function**

In [14]:
# Defining the Plagiarism Checker function for word files

def Plagiarism_checker_word(word_path):
  """Run the plagiarism report on every .docx file inside ``word_path``.

  Args:
    word_path: folder containing the .docx files to compare.
  """
  docx_names = word_file(word_path)
  documents = list_word_text(word_path, docx_names)
  main_code(documents)

## **Word file Example**

In [19]:
Plagiarism_checker_word(word_path)



file kei.docx vs file Try3.docx
		Among which, there are 60 line with 99% similarity! About 29.70% of the text!
		Among which, there are 60 line with 99% similar structure! About 29.70% of the text!
		Among which, there are 34 line with either 70% more similarity or similar structure! About 16.83% of the text!

Line with 99% similarity:
	 1. seperti kita ketahui manusia memiliki salah satu bagian tubuh yaitu mata yang memiliki fungsi sebagai indra penglihatan vs seperti kita ketahui manusia memiliki salah satu bagian tubuh yaitu mata yang memiliki fungsi sebagai indra penglihatan
	 2. meskipun demikian mata manusia memiliki batasan dalam melihat objek berukuran normal hingga besar vs meskipun demikian mata manusia memiliki batasan dalam melihat objek berukuran normal hingga besar
	 3. oleh sebab itu ditemukannya titik terang dengan adanya penemuan mikroskop oleh antoine van leeuwenhoek yang merupakan seorang belanda pada sekitar tahun 1590 an vs oleh sebab itu ditemukannya titik terang

# **PDF Plagiarism Check Function**

In [20]:
# Defining the Plagiarism Checker function for pdf files

def Plagiarism_checker_pdf(pdf_path):
  """Run the plagiarism report on every .pdf file inside ``pdf_path``.

  Args:
    pdf_path: folder containing the .pdf files to compare.
  """
  pdf_names = pdf_file(pdf_path)
  documents = list_all_text(pdf_path, pdf_names)
  main_code(documents)

## **PDF file Example**

In [21]:
Plagiarism_checker_pdf(folder_pdf)

file vir.pdf vs file wid.pdf
	Similarity score is 0.00%! Congratulations, you may upload your work :)!

---------------------------------------------------------------------------------------------------------------------------



