# **Import Necessary Libraries**

---


In [1]:
# Import all necessary libraries and functions

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
!pip install PyPDF2
import PyPDF2
!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import re
!pip install python-docx
import docx
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




# **Outline:**

```
1.   Similarity Check Function    | 2.   PDF                               | 3.   Word
     - TfidfVectorizer            |     - PDF Path                         |     - Word Path
     - Tokenizer                  |     - Extract PDF                      |     - Extract Word
     - Cosine_similarity          |     - Removing and Tokenizing Text     |     - Removing and Tokenizing Text
     - Padding text               |     - Creating Text List               |     - Creating Text List
                                  |     - PDF Plagiarism Check Function    |     - Word Plagiarism Check Function
                                  |     - Example                          |     - Example
```

# **Tokenize, Vectorize, and Pad**

In [2]:
# Defining the Token of each text

def tokenizing(flat_text,text):
  """Turn every line of every document into a sequence of integer word ids.

  One Keras ``Tokenizer`` is fitted on ``flat_text`` and then applied to each
  element of ``text`` in turn, so all documents share the same word ids.

  Args:
    flat_text: the strings the tokenizer is fitted on.
    text: iterable of documents; each document is handed unchanged to
      ``Tokenizer.texts_to_sequences``.

  Returns:
    A list holding one ``texts_to_sequences`` result per document, in the
    same order as ``text``.
  """
  shared_tokenizer = Tokenizer()
  shared_tokenizer.fit_on_texts(flat_text)
  return [shared_tokenizer.texts_to_sequences(document) for document in text]

In [3]:
# Defining the TfidfVectorizer of each text

def vecTfid(flat_text,text):
  """Build a dense TF-IDF matrix for every document.

  A single ``TfidfVectorizer`` is fitted on ``flat_text``; each element of
  ``text`` is then transformed with it and converted with ``.toarray()``.

  Args:
    flat_text: the strings the vectorizer is fitted on.
    text: iterable of documents; each document is handed unchanged to
      ``TfidfVectorizer.transform``.

  Returns:
    A list with one dense array per document, in the same order as ``text``.
  """
  shared_vectorizer = TfidfVectorizer()
  shared_vectorizer.fit(flat_text)
  return [shared_vectorizer.transform(document).toarray() for document in text]

In [4]:
# Adding Pad Sequences on the Tokenized text

def padding(maxlen,sequences):
  """Pad the token sequences of every document to a common length.

  Args:
    maxlen: target length forwarded to ``pad_sequences``.
    sequences: list of documents, each a collection of token-id sequences.

  Returns:
    A list with one ``pad_sequences(..., padding='post')`` result per
    document, in input order.
  """
  return [
      pad_sequences(document, maxlen=maxlen, padding='post')
      for document in sequences
  ]

# **Similarity Check**

In [5]:
# Using the flag system to count copy-pasted and high similarity line

def count_flag(pad1, pad2, tf1, tf2, copy_threshold=0.99, high_threshold=0.7, low_threshold=0.3):
  """Count the rows of the first document that closely match the second one.

  Two cosine-similarity matrices are computed: ``tf1`` against ``tf2`` (the
  TF-IDF view) and ``pad1`` against ``pad2`` (the padded token-id view).
  Every row of those matrices is then judged by its best match, i.e. by
  whether ANY value in the row passes a threshold.

  Args:
    pad1, pad2: 2-D padded token-id matrices of the two documents.
    tf1, tf2: 2-D TF-IDF matrices of the two documents.
    copy_threshold: a row counts as "copy-pasted" when some similarity is
      strictly greater than this value (default 0.99).
    high_threshold: similarity (>=) treated as "high" in either view
      (default 0.7).
    low_threshold: similarity (>=) treated as "moderate" in the TF-IDF view
      (default 0.3).

  Returns:
    A tuple ``(red_flag_tf, red_flag_token, plag_tf_token, di)``:
      * red_flag_tf    - rows whose TF-IDF similarity exceeds ``copy_threshold``
      * red_flag_token - rows whose token similarity exceeds ``copy_threshold``
      * plag_tf_token  - rows whose combined score is greater than 1, where
                         the TF-IDF view scores 2 (high), 1 (moderate) or 0
                         and the token view scores 1 (high) or 0
      * di             - number of rows in the TF-IDF similarity matrix
  """
  # Getting the similarity on Tokenized text and Vectorized text
  tf_cos = np.asarray(cosine_similarity(tf1,tf2))
  token_cos = np.asarray(cosine_similarity(pad1,pad2))
  di = len(tf_cos)
  # Only the first `di` token rows are examined, mirroring the row range the
  # TF-IDF matrix defines.
  token_cos = token_cos[:di]

  # 1. Copy-paste flags: one point per row with a near-identical match.
  red_flag_tf = int((tf_cos > copy_threshold).any(axis=1).sum())
  red_flag_token = int((token_cos > copy_threshold).any(axis=1).sum())

  # 2. Combined score per row. "tf score + token score > 1" is reached either
  #    by a high TF-IDF match alone (score 2) or by a moderate TF-IDF match
  #    backed by a high token match (1 + 1).
  tf_high = (tf_cos >= high_threshold).any(axis=1)
  tf_moderate = (tf_cos >= low_threshold).any(axis=1)
  token_high = (token_cos >= high_threshold).any(axis=1)
  plag_tf_token = int((tf_high | (tf_moderate & token_high)).sum())

  return red_flag_tf, red_flag_token, plag_tf_token, di

## **Remove Unnecessary Symbols**

In [6]:
# Defining the function to remove unnecessary symbols and empty tokens

def remove(text):
  """Delete punctuation symbols from ``text`` and rebuild it from its tokens.

  The characters listed in the regular expression (quotes, colon, semicolon,
  underscore, comma, full stop, parentheses, square brackets, hyphen and
  en dash) are removed first. The remainder is split with ``word_tokenize``
  and the non-empty tokens are joined back with single spaces.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string; empty when nothing but removed symbols was present.
  """
  without_symbols = re.sub(r'[“”‘’:;"_\',.()\–\[\]\-]', '', text)
  return ' '.join(token for token in word_tokenize(without_symbols) if token)

# **Word**

## **Word Path**

In [7]:
# Obtaining all text files that endswith .docx

# Folder scanned for .docx files; it is the argument of the
# Plagiarism_checker_word example call in this notebook.
# NOTE(review): absolute path, presumably a Colab working directory — adjust for your environment.
word_path = '/content/try-1'

def word_file(word_path):
  """List the ``.docx`` files that sit directly inside ``word_path``.

  The extension check is case-insensitive and the names are returned exactly
  as ``os.listdir`` reports them (no directory prefix, no sorting).

  Args:
    word_path: directory to scan.

  Returns:
    A 1-D numpy array of dtype ``object`` holding the matching file names.
  """
  docx_names = []
  for entry in os.listdir(word_path):
    if entry.lower().endswith('.docx'):
      docx_names.append(entry)
  return np.array(docx_names,dtype=object)

## **Extract Word**

In [8]:
# Extracting the text from word files

def get_word_data(word_path,word_file):
  """Read a .docx file and return its cleaned, lower-cased lines.

  Every paragraph is lower-cased, split into sentences with
  ``sent_tokenize``, split again on newlines, cleaned with ``remove`` and
  collected when the result is non-empty.

  The collected lines are then trimmed to the body of the document:
    * everything up to and including the first line equal to
      'latar belakang' (Indonesian for "background") is dropped;
    * everything from the first line equal to 'daftar pustaka'
      ("bibliography"), or failing that 'reference', is dropped.
  A missing heading leaves that side untrimmed. A reference heading on the
  very first line with no start heading is ignored and the full list is
  returned.

  Args:
    word_path: directory that contains the file.
    word_file: file name of the .docx document inside ``word_path``.

  Returns:
    A list of cleaned line strings.
  """
  word_read = docx.Document(os.path.join(word_path, word_file))

  # Reading each paragraph inside the text
  par_arr = []
  for par in word_read.paragraphs:
    for sent in sent_tokenize(par.text.lower()):
      cleaned = (remove(line) for line in sent.split('\n'))
      par_arr.extend(line for line in cleaned if line)

  # To define the first index and last index, may be removed depending on the format of text.
  # `end` stays None when no reference heading exists so that the slice runs
  # to the end of the list (a -1 sentinel would cut off the last line).
  start = 0
  end = None
  if 'latar belakang' in par_arr:
    start = par_arr.index('latar belakang') + 1
  if 'daftar pustaka' in par_arr:
    end = par_arr.index('daftar pustaka')
  elif 'reference' in par_arr:
    end = par_arr.index('reference')

  # Nothing to trim: no headings at all, or only a reference heading at index 0.
  if start == 0 and not end:
    return par_arr
  return par_arr[start:end]

## **Get Word Text List**

In [9]:
# Creating an array containing the file name and text of each files

def list_word_text(word_path,word_file):
  """Pair every .docx file name with its extracted lines.

  Args:
    word_path: directory that contains the files.
    word_file: iterable of file names inside ``word_path``.

  Returns:
    A numpy array of dtype ``object`` and shape ``(n_files, 2)``; column 0
    holds the file name and column 1 the list of lines returned by
    ``get_word_data`` for that file.
  """
  rows = [(name, get_word_data(word_path, name)) for name in word_file]

  # The cells are filled one by one: handing numpy a [name, list_of_lines]
  # pair directly makes it build a ragged array, which raises ValueError on
  # NumPy >= 1.24.
  text_list = np.empty((len(rows), 2), dtype=object)
  for row, (name, lines) in enumerate(rows):
    text_list[row, 0] = name
    text_list[row, 1] = lines
  return text_list

# **PDF**

## **PDF Path**

In [10]:
# Obtaining all text files that endswith .pdf

# Folder scanned for .pdf files; it is the argument of the
# Plagiarism_checker_pdf example call in this notebook.
# NOTE(review): absolute path, presumably a Colab working directory — adjust for your environment.
folder_pdf = '/content/tryp'

def pdf_file(folder_pdf):
  """List the ``.pdf`` files that sit directly inside ``folder_pdf``.

  The extension check is case-insensitive and the names are returned exactly
  as ``os.listdir`` reports them (no directory prefix, no sorting).

  Args:
    folder_pdf: directory to scan.

  Returns:
    A 1-D numpy array of dtype ``object`` holding the matching file names.
  """
  pdf_names = []
  for entry in os.listdir(folder_pdf):
    if entry.lower().endswith('.pdf'):
      pdf_names.append(entry)
  return np.array(pdf_names,dtype=object)

## **Extract PDF**
---


In [11]:
# Extracting the text from pdf files

def get_pdf(path,file_name):
  """Read a PDF file and return its cleaned, lower-cased lines.

  The text of every page is lower-cased, split into sentences with
  ``sent_tokenize``, split again on newlines, cleaned with ``remove`` and
  collected when the result is non-empty.

  The collected lines are then trimmed to the body of the document:
    * everything up to and including the first line equal to
      'latar belakang' (Indonesian for "background") is dropped;
    * everything from the first line equal to 'daftar pustaka'
      ("bibliography"), or failing that 'reference', is dropped.
  A missing heading leaves that side untrimmed. A reference heading on the
  very first line with no start heading is ignored and the full list is
  returned.

  Args:
    path: directory that contains the file.
    file_name: file name of the PDF inside ``path``.

  Returns:
    A list of cleaned line strings.
  """
  line_arr = []

  # Reading text in pdf file by page
  with open(os.path.join(path, file_name),'rb') as temp_pdf:
    read_pdf = PyPDF2.PdfReader(temp_pdf)
    for page in read_pdf.pages:
      page_text = page.extract_text().lower()
      for sent in sent_tokenize(page_text):
        cleaned = (remove(line) for line in sent.split('\n'))
        line_arr.extend(line for line in cleaned if line)

  # To define the first index and last index, may be removed depending on the format of text.
  # `end` stays None when no reference heading exists so that the slice runs
  # to the end of the list (a -1 sentinel would cut off the last line).
  start = 0
  end = None
  if 'latar belakang' in line_arr:
    start = line_arr.index('latar belakang') + 1
  if 'daftar pustaka' in line_arr:
    end = line_arr.index('daftar pustaka')
  elif 'reference' in line_arr:
    end = line_arr.index('reference')

  # Nothing to trim: no headings at all, or only a reference heading at index 0.
  if start == 0 and not end:
    return line_arr
  return line_arr[start:end]

## **Get PDF Text List**

In [12]:
# Creating an array containing the file name and text of each files

def list_all_text(pdf_folder_path,pdf_file_list):
  """Pair every PDF file name with its extracted lines.

  Args:
    pdf_folder_path: directory that contains the files.
    pdf_file_list: iterable of file names inside ``pdf_folder_path``.

  Returns:
    A numpy array of dtype ``object`` and shape ``(n_files, 2)``; column 0
    holds the file name and column 1 the list of lines returned by
    ``get_pdf`` for that file.
  """
  rows = [(name, get_pdf(pdf_folder_path, name)) for name in pdf_file_list]

  # The cells are filled one by one: handing numpy a [name, list_of_lines]
  # pair directly makes it build a ragged array, which raises ValueError on
  # NumPy >= 1.24.
  text_list = np.empty((len(rows), 2), dtype=object)
  for row, (name, lines) in enumerate(rows):
    text_list[row, 0] = name
    text_list[row, 1] = lines
  return text_list

# **Main Code**

In [13]:
# Main code of the system, calling all the necessary functions

def main_code(text_list):
  """Compare every pair of documents and print a similarity report.

  Args:
    text_list: sequence of ``(file_name, lines)`` pairs, where ``lines`` is
      the list of cleaned line strings of that file.

  Returns:
    None. The report is written with ``print``: one header line per pair,
    followed by warnings or a "no problem" message.
  """
  # Separating the filenames and texts from text_list array
  names = [entry[0] for entry in text_list]
  texts = [entry[1] for entry in text_list]

  # Flatten the lines of all documents so the tokenizer and the vectorizer
  # are fitted on one shared vocabulary
  flat_text = [line for lines in texts for line in lines]

  # Calling the tokenizer and vectorizer function
  tfid = vecTfid(flat_text,texts)
  token = tokenizing(flat_text,texts)

  # Longest token sequence of any line in any document, used as pad length
  maxlen_token = max(len(ids) for document in token for ids in document)
  pad_token = padding(maxlen_token,token)

  # Iterating each text to examine the similarity between text
  for i in range(len(pad_token)):
    for j in range(i+1,len(pad_token)):
      flag_tf, flag_token, plag_tf_token, di = count_flag(pad_token[i], pad_token[j], tfid[i], tfid[j])
      # Average of the three per-line counters, as a fraction of the lines
      plag_score = (plag_tf_token+flag_tf+flag_token)/(di*3)

      # Report layout:
      # - score >= 30%: overall warning, plus the copy-paste counters that are non-zero
      # - score < 30% but some copy-paste counter is non-zero: one warning per counter
      # - score < 30% and both counters are zero: the "no problem" message
      print(f"file {names[i]} vs file {names[j]}")
      if plag_score >= 0.3:
        print(f"\t!Warning! There are overall {plag_score*100:.2f}% similarity score in both file! Bigger than 30%!")
        if flag_tf > 0:
          print(f"\t\tAmong which, there are {flag_tf} line with 99% similarity! About {flag_tf/di*100:.2f}% of both text!")
        if flag_token > 0:
          print(f"\t\tAmong which, there are {flag_token} line with 99% similar structure! About {flag_token/di*100:.2f}% of both text!")
      elif flag_tf > 0 or flag_token > 0:
        if flag_tf > 0:
          print(f"\t!Warning! There are {flag_tf} line with 99% similarity in both file! About {flag_tf/di*100:.2f}% of both text!")
        if flag_token > 0:
          print(f"\t!Warning! There are {flag_token} line with 99% similar structure in both file! About {flag_token/di*100:.2f}% of both text!")
      else:
        # Only reached when neither counter fired, so this message can no
        # longer appear next to a warning.
        print(f"\tSimilarity score is {plag_score*100:.2f}%! Congratulations, you may upload your work :)!")

# **Word Plagiarism Checker Function**

In [14]:
# Defining the Plagiarism Checker function for word files

def Plagiarism_checker_word(word_path):
  """Run the plagiarism report over every .docx file in ``word_path``.

  The file names come from ``word_file``, their text from ``list_word_text``
  and the report itself is printed by ``main_code``.

  Args:
    word_path: directory containing the .docx files to compare.
  """
  docx_names = word_file(word_path)
  documents = list_word_text(word_path, docx_names)
  main_code(documents)

## **Word file Example**

In [19]:
# Example run: compare every pair of .docx files found in word_path and print the report.
Plagiarism_checker_word(word_path)



file ann.docx vs file Try3.docx
		Among which, there are 50 line with 99% similarity! About 42.02% of both text!
		Among which, there are 54 line with 99% similar structure! About 45.38% of both text!
file ann.docx vs file wid.docx
	Similarity score is 0.00%! Congratulations, you may upload your work :)!
file ann.docx vs file has.docx
		Among which, there are 35 line with 99% similarity! About 29.41% of both text!
		Among which, there are 43 line with 99% similar structure! About 36.13% of both text!
file ann.docx vs file nav.docx
		Among which, there are 21 line with 99% similarity! About 17.65% of both text!
		Among which, there are 31 line with 99% similar structure! About 26.05% of both text!
file ann.docx vs file sab.docx
file Try3.docx vs file wid.docx
	Similarity score is 0.00%! Congratulations, you may upload your work :)!
file Try3.docx vs file has.docx
		Among which, there are 45 line with 99% similarity! About 36.00% of both text!
		Among which, there are 47 line with 99% si

# **PDF Plagiarism Check Function**

In [16]:
# Defining the Plagiarism Checker function for pdf files

def Plagiarism_checker_pdf(pdf_path):
  """Run the plagiarism report over every .pdf file in ``pdf_path``.

  The file names come from ``pdf_file``, their text from ``list_all_text``
  and the report itself is printed by ``main_code``.

  Args:
    pdf_path: directory containing the .pdf files to compare.
  """
  pdf_names = pdf_file(pdf_path)
  documents = list_all_text(pdf_path, pdf_names)
  main_code(documents)

## **PDF file Example**

In [17]:
# Example run: compare every pair of .pdf files found in folder_pdf and print the report.
Plagiarism_checker_pdf(folder_pdf)



file wid.pdf vs file sab.pdf
file wid.pdf vs file rang.pdf
file wid.pdf vs file vir.pdf
file wid.pdf vs file zin.pdf
file wid.pdf vs file zo.pdf
file wid.pdf vs file syaf.pdf
file sab.pdf vs file rang.pdf
		Among which, there are 20 line with 99% similarity! About 4.67% of both text!
		Among which, there are 233 line with 99% similar structure! About 54.44% of both text!
file sab.pdf vs file vir.pdf
file sab.pdf vs file zin.pdf
file sab.pdf vs file zo.pdf
file sab.pdf vs file syaf.pdf
file rang.pdf vs file vir.pdf
file rang.pdf vs file zin.pdf
file rang.pdf vs file zo.pdf
file rang.pdf vs file syaf.pdf
file vir.pdf vs file zin.pdf
file vir.pdf vs file zo.pdf
file vir.pdf vs file syaf.pdf
file zin.pdf vs file zo.pdf
file zin.pdf vs file syaf.pdf
file zo.pdf vs file syaf.pdf


# **Unused**

In [18]:
#from nltk.corpus import stopwords
#nltk.download('stopwords')
#  indstop = set(stopwords.words('indonesian'))
#  indstop = list(indstop)
#  engstop = set(stopwords.words('english'))
#  engstop = list(engstop)
#  remove_word = (indstop,engstop)
#  remove_word = [word for line in remove_word for word in line]