In [1]:
# Import all necessary libraries and functions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import requests
import pandas as pd
import io

!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
warnings.filterwarnings('ignore')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
def request(story_url='https://docs.google.com/spreadsheets/d/e/2PACX-1vQxre--PXKwKBMdhZGv7XmpUPkBAiXg84lETpmkftFNusZ2MLZOpq6jb4MPk3TQ02T-FBcO17Ui4X7l/pub?gid=1217383481&single=true&output=csv', timeout=30):
  """Download a CSV document over HTTP and load it into a DataFrame.

  Args:
    story_url: URL that serves the CSV content.
    timeout: seconds to wait for the server before giving up, so a stalled
      connection cannot hang the notebook forever.

  Returns:
    pandas.DataFrame parsed from the UTF-8 decoded response body.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    requests.Timeout: if the server does not answer within `timeout`.
  """
  response = requests.get(story_url, timeout=timeout)
  # Fail loudly on an HTTP error instead of parsing an error page as CSV.
  response.raise_for_status()
  csv_text = response.content.decode('utf-8')
  return pd.read_csv(io.StringIO(csv_text))

In [3]:
# Define the function that removes unnecessary symbols and empty tokens

def remove(text):
  """Strip punctuation from `text` and return its tokens joined by spaces.

  Quotes, colons, semicolons, underscores, commas, periods, parentheses,
  square brackets and en dashes are deleted; hyphens become spaces. The
  result is tokenized with word_tokenize and empty tokens are dropped.
  """
  without_symbols = re.sub(r'[“”‘’:;"_\',.()\–\[\]]', '', text)
  # Hyphens separate words, so they turn into a space rather than vanish.
  spaced = re.sub(r'[\-]', ' ', without_symbols)
  return ' '.join(filter(None, word_tokenize(spaced)))

In [7]:
# Cleaning the text by iterating through the line

def get_csv_list(data):
  """Split `data` into sentences and return the cleaned sentences as a list.

  The text is lower-cased, split with sent_tokenize, and every sentence is
  passed through remove(); the results keep the sentence order.
  """
  return [remove(sentence) for sentence in sent_tokenize(data.lower())]

In [8]:
# Creating the text array of all files

def text_list(story_data):
  """Build the list of cleaned sentence lists, one entry per row.

  Args:
    story_data: DataFrame whose second-to-last column holds the text.

  Returns:
    A list with one element per row; each element is the list returned by
    get_csv_list() for that row's text.
  """
  # Select the column by position with .iloc: plain `row[-2]` treats the key
  # as a label when the index is integer-like and only falls back to a
  # position (deprecated behavior) otherwise.
  return [get_csv_list(text) for text in story_data.iloc[:, -2]]

In [6]:
# Defining the Token of each text

def tokenizing(flat_text,text):
  """Convert every line of every text into a sequence of token ids.

  Args:
    flat_text: all lines of all texts in one flat list; the Tokenizer is
      fitted on it so every text shares one vocabulary.
    text: list of texts, each a list of lines.

  Returns:
    A list with one entry per text; each entry is the result of
    texts_to_sequences() for that text's lines.
  """
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(flat_text)
  return [tokenizer.texts_to_sequences(lines) for lines in text]

In [9]:
# Defining the TfidfVectorizer of each text

def vecTfid(flat_text,text):
  """Vectorize every line of every text with one shared TfidfVectorizer.

  The vectorizer is fitted on `flat_text` (all lines of all texts), then each
  text in `text` is transformed on its own and converted with toarray().
  Returns the list of those arrays, one per text.
  """
  vectorizer = TfidfVectorizer()
  vectorizer.fit(flat_text)
  return [vectorizer.transform(lines).toarray() for lines in text]

In [10]:
# Use the flag system to count copy-pasted and high-similarity lines

def count_flag(token1, token2, tf1, tf2, exact_thresh=0.9999, high_thresh=0.7, low_thresh=0.35):
  """Flag the lines of the first text that resemble lines of the second.

  Two similarity matrices are built, one row per line of the first text and
  one column per line of the second text:
    * cosine similarity between the vectors in `tf1` and `tf2`;
    * token-set overlap between `token1` and `token2`, computed as
      len(intersection) / len(union) of the two lines' token ids.

  Args:
    token1, token2: per-line lists of token ids for the two texts.
    tf1, tf2: per-line vectors for the two texts (passed to
      cosine_similarity).
    exact_thresh: similarity at or above this counts as an exact match.
    high_thresh: lower bound (exclusive) of the "strong" band.
    low_thresh: lower bound (exclusive) of the "weak" band (vectors only).

  Returns:
    Tuple (plag_tf_token, di, high_sim, copy_paste, struc_token, struc_tf):
      plag_tf_token: number of lines whose combined flag value exceeds 1
        (vector flag: 2 strong / 1 weak; token flag: 1 strong).
      di: number of lines in the first text (len(tf1)).
      high_sim: [line, [matching lines]] entries with vector similarity
        >= exact_thresh.
      copy_paste: same, for token overlap >= exact_thresh.
      struc_token: same, for token overlap in (high_thresh, exact_thresh).
      struc_tf: same, for vector similarity in the strong band, or else in
        the weak band (low_thresh, high_thresh].
  """
  plag_tf_token = 0
  di = len(tf1)
  high_sim = []
  copy_paste = []
  struc_token = []
  struc_tf = []

  tf_cos = cosine_similarity(tf1,tf2)

  # Build each line's token set once instead of once per pair of lines.
  sets1 = [set(line) for line in token1]
  sets2 = [set(line) for line in token2]
  token_cos = []
  for set1 in sets1:
    overlap_row = []
    for set2 in sets2:
      union_size = len(set1 | set2)
      # Two lines without any token share nothing measurable; score them 0
      # instead of dividing by zero.
      overlap_row.append(len(set1 & set2)/union_size if union_size else 0.0)
    token_cos.append(overlap_row)

  for i in range(di):
    tf = 0
    token = 0

    # Vector similarity: exact match, else strong band, else weak band.
    tf_row = tf_cos[i]
    exact = [index for index, vals in enumerate(tf_row) if vals >= exact_thresh]
    if exact:
      high_sim.append([i, exact])
    else:
      strong = [index for index, vals in enumerate(tf_row) if high_thresh < vals < exact_thresh]
      if strong:
        struc_tf.append([i, strong])
        tf = 2
      else:
        weak = [index for index, vals in enumerate(tf_row) if low_thresh < vals <= high_thresh]
        if weak:
          struc_tf.append([i, weak])
          tf = 1

    # Token overlap: exact match, else strong band (no weak band here).
    token_row = token_cos[i]
    exact = [index for index, vals in enumerate(token_row) if vals >= exact_thresh]
    if exact:
      copy_paste.append([i, exact])
    else:
      strong = [index for index, vals in enumerate(token_row) if high_thresh < vals < exact_thresh]
      if strong:
        struc_token.append([i, strong])
        token = 1

    # A line counts as plagiarised (outside the exact-match lists) only when
    # the combined evidence exceeds 1, e.g. strong vector similarity plus
    # strong token overlap.
    if token + tf > 1:
      plag_tf_token += 1
  return plag_tf_token, di, high_sim, copy_paste, struc_token, struc_tf

In [11]:
# A function to show all lines with high similarity or copy-pasted

def show_line(struc_token,struc_tf,text_list,high_sim,copy_paste,j):
  """Print every flagged pair of lines, grouped by flag category.

  Each of `high_sim`, `copy_paste`, `struc_token` and `struc_tf` holds
  entries of the form [line index in the last text, [line indices in text j]].
  For every non-empty category a heading is printed, followed by one row per
  pair: the line from text_list[-1] versus the line from text_list[j]. The
  printed number is the entry's position within its category (1-based).
  """
  sections = (
    ("\nLine with 99% similarity:", high_sim),
    ("\nCopy-pasted line:", copy_paste),
    ("\nLine with similar structure:", struc_token),
    ("\nLine with similarity higher than 35%:", struc_tf),
  )
  for heading, flagged in sections:
    if len(flagged) == 0:
      continue
    print(heading)
    for number, entry in enumerate(flagged, start=1):
      for match in entry[1]:
        print(f"\t {number}. {text_list[-1][entry[0]]} vs {text_list[j][match]}")

In [12]:
# Main code of the system, calling all the necessary functions

def main_code(story_data,text_list):
  """Compare the last text against every text of a different story and report.

  Args:
    story_data: DataFrame with one row per text; column 1 is read as the
      file/story identifier and column 3 as the chapter label.
    text_list: list of texts (each a list of cleaned lines) in the same row
      order as `story_data`; the last entry is the text being checked.

  For each compared pair a header and a verdict are printed:
    * score >= 30%: warning plus counts and the flagged lines;
    * score < 30% but exact-match lines exist: warning plus counts and lines;
    * otherwise: an all-clear message.
  The score is (plag_tf_token + (exact vector matches + exact token
  matches) / 2) / number of lines in the checked text.
  """
  # One flat list of all lines so tokenizer and vectorizer share a vocabulary.
  flat_text = [line for text in text_list for line in text]

  tfid = vecTfid(flat_text,text_list)
  token = tokenizing(flat_text,text_list)

  checked_file = story_data.iloc[-1,1]
  checked_chapter = story_data.iloc[-1,3]

  for j in range(len(token)):
    # Skip chapters that belong to the same story as the checked text
    # (this also skips the checked text itself).
    if checked_file == story_data.iloc[j,1]:
      continue

    plag_tf_token, di,  high_sim, copy_paste, struc_token, struc_tf = count_flag(token[-1], token[j], tfid[-1], tfid[j])
    flag_tf = len(high_sim)
    flag_token = len(copy_paste)
    plag_score = (plag_tf_token+(flag_tf+flag_token)/2)/di
    # Count distinct flagged lines: each entry starts with the line index, so
    # a line flagged by both measures is counted once.
    tf_token = len({entry[0] for entry in struc_token} | {entry[0] for entry in struc_tf})

    print(f"file {checked_file} chapter {checked_chapter} vs file {story_data.iloc[j,1]} chapter {story_data.iloc[j,3]}")
    if plag_score >= 0.3:
      print(f"\t!Warning! There are overall {plag_score*100:.2f}% similarity score in both file! Bigger than 30%!")
      if flag_tf > 0:
        print(f"\t\tAmong which, there are {flag_tf} line with 99% similarity! About {flag_tf/di*100:.2f}% of the text!")
      if flag_token > 0:
        print(f"\t\tAmong which, there are {flag_token} line with 99% similar structure! About {flag_token/di*100:.2f}% of the text!")
      if plag_tf_token > 0:
        print(f"\t\tAmong which, there are {tf_token} line with either 70% more similarity or similar structure! About {tf_token/di*100:.2f}% of the text!")
      show_line(struc_token,struc_tf,text_list,high_sim,copy_paste,j)
    elif flag_tf > 0 or flag_token > 0:
      # Low overall score, but exact-match lines still deserve a warning.
      print(f"\t!Warning! Overall there are {plag_score*100:.2f}% similarity score in both file, less than 30%, however:")
      if flag_tf > 0:
        print(f"\t\tThere are {flag_tf} line with 99% similarity in both file! About {flag_tf/di*100:.2f}% of the text!")
      if flag_token > 0:
        print(f"\t\tThere are {flag_token} line with 99% similar structure in both file! About {flag_token/di*100:.2f}% of the text!")
      if plag_tf_token > 0:
        print(f"\t\tAmong which, there are {tf_token} line with either 70% more similarity or similar structure! About {tf_token/di*100:.2f}% of the text!")
      show_line(struc_token,struc_tf,text_list,high_sim,copy_paste,j)
    else:
      print(f"\tSimilarity score is {plag_score*100:.2f}%! Congratulations, you may upload your work :)!")

    print("\n---------------------------------------------------------------------------------------------------------------------------\n")

In [13]:
def Plagiarism_Checker(story_data):
  """Clean the texts in `story_data` and run the similarity report on them."""
  main_code(story_data, text_list(story_data))

In [14]:
# Fetch the story sheet from the default URL and run the checker on it.
Plagiarism_Checker(request())

file 10 chapter Ikatan vs file 1 chapter Takdir Pertemuan
	Similarity score is 0.00%! Congratulations, you may upload your work :)!

---------------------------------------------------------------------------------------------------------------------------

file 10 chapter Ikatan vs file 1 chapter Ikatan
		Among which, there are 9 line with 99% similarity! About 52.94% of the text!
		Among which, there are 10 line with 99% similar structure! About 58.82% of the text!
		Among which, there are 6 line with either 70% more similarity or similar structure! About 35.29% of the text!

Line with 99% similarity:
	 1. mereka melacak seorang wanita bernama sarah yang merupakan mantan kekasih korban vs mereka melacak seorang wanita bernama sarah yang merupakan mantan kekasih korban
	 2. sarah menceritakan bahwa ia dan korban pernah berselisih vs sarah menceritakan bahwa ia dan korban pernah berselisih
	 3. korban telah berselingkuh dengan wanita lain dan sarah merasa sangat sakit hati vs korban te