In [17]:
#all imported packages
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to C:\Users\Vipul
[nltk_data]     Kapoor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Vipul
[nltk_data]     Kapoor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vipul Kapoor\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Vipul
[nltk_data]     Kapoor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Vipul
[nltk_data]     Kapoor\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


In [30]:
#all the functions and declarations

#object creation for stemming
# One shared PorterStemmer instance for the whole notebook; concept_extraction()
# calls porter.stem() on every noun it keeps.
porter = PorterStemmer()

#to check for nouns
def is_noun(pos):
    """Return True when a POS tag starts with 'NN'.

    Only the first two characters of ``pos`` are compared, so any tag that
    begins with 'NN' (for example 'NN', 'NNS', 'NNP') is accepted.
    """
    return pos[:2] == 'NN'

###################################################pre-processing steps#################################################


#sentence_tokenization
def sen_tokenize(text1):
    """Read a whole text stream and split its contents into sentences.

    Parameters
    ----------
    text1 : file-like object
        Anything exposing ``read()``; the full contents are consumed.
        The stream is not closed here -- that stays the caller's job.

    Returns
    -------
    list
        The sentences returned by ``nltk.tokenize.sent_tokenize``.
    """
    return nltk.tokenize.sent_tokenize(text1.read())

#word_tokenization
def wor_tokenize(sen_list):
    """Tokenise every sentence into words.

    Parameters
    ----------
    sen_list : iterable of str
        Sentences, e.g. the output of ``sen_tokenize``.

    Returns
    -------
    list of list
        One token list per input sentence, in the same order, each produced
        by ``nltk.tokenize.word_tokenize``.
    """
    return [nltk.tokenize.word_tokenize(sentence) for sentence in sen_list]


#to remove the stop words      
def stopword_removal(a_list, stop_words=None):
    """Drop stop words from every tokenised sentence.

    Parameters
    ----------
    a_list : iterable of iterable of str
        One token sequence per sentence.
    stop_words : iterable of str, optional
        Words to discard. When omitted, ``stopwords.words()`` is used, i.e.
        the NLTK stop-word corpus without a language filter, exactly as
        before. Pass e.g. ``stopwords.words('english')`` to restrict it.

    Returns
    -------
    list of list of str
        The sentences in their original order with stop words removed.

    Notes
    -----
    Matching is an exact string comparison, so it is case-sensitive.
    """
    if stop_words is None:
        stop_words = stopwords.words()
    # Build the lookup once per call. The previous version re-read the corpus
    # and did a linear list search for every single token.
    stop_set = set(stop_words)
    return [[word for word in terms if word not in stop_set] for terms in a_list]


#to remove the punctuation
def remove_punctuation(a_list):
    """Keep only the purely alphanumeric tokens of each sentence.

    A token survives when ``token.isalnum()`` is true, so bare punctuation
    marks are dropped -- and so is any token that contains a
    non-alphanumeric character (for instance a hyphenated word).

    Parameters
    ----------
    a_list : iterable of iterable of str
        One token sequence per sentence.

    Returns
    -------
    list of list of str
        Filtered token lists, one per input sentence, order preserved.
    """
    return [[token for token in sentence if token.isalnum()] for sentence in a_list]


#concept extraction: to extract the important features like nouns in our case
def concept_extraction(a_list):
    """Extract the noun "concepts" of every sentence.

    For each sentence the tokens are POS-tagged with ``nltk.pos_tag`` and only
    those whose tag satisfies ``is_noun`` are kept. Every kept noun adds up to
    two entries to the sentence's concept list:

    * its Porter stem (``porter.stem``), and
    * the name of the first lemma of its first WordNet synset, so that an
      alternative wording of the same idea can still match.

    Parameters
    ----------
    a_list : iterable of list of str
        One token list per sentence.

    Returns
    -------
    list of list of str
        One concept list per input sentence, in the same order.
    """
    b_list = []

    for terms in a_list:
        #extracting nouns
        nouns = [word for (word, pos) in nltk.pos_tag(terms) if is_noun(pos)]
        concepts = []
        for word in nouns:
            #stemming
            concepts.append(porter.stem(word))
            #adding a synonym so that alternative words can be caught
            synsets = wordnet.synsets(word)
            # WordNet has no entry for many nouns (names, rare or misspelt
            # words); indexing synsets[0] unconditionally raised IndexError.
            if synsets:
                concepts.append(synsets[0].lemmas()[0].name())
        b_list.append(concepts)
    return b_list

#creating topic signature node
def get_topic_signature(b_list):
    """Merge per-sentence concept lists into one list of distinct concepts.

    Parameters
    ----------
    b_list : iterable of iterable
        Concept lists, e.g. the output of ``concept_extraction``.

    Returns
    -------
    list
        Every distinct concept exactly once. The list is built from a set,
        so its order is not meaningful.
    """
    merged = set()
    for concepts in b_list:
        merged.update(concepts)
    return list(merged)

In [18]:
######################processing original document#############################

# Open the source inside a context manager so the handle is closed as soon as
# the text has been read; the bare open() used before was never closed.
with open("original.txt","r") as text1:
    print("Original Document")

    orig_sentences=sen_tokenize(text1)

print("After Sentence Tokenization")
print(orig_sentences)
print()

# Each stage gets its own name, so re-running a later step never works on
# already-transformed data.
orig_tokens=wor_tokenize(orig_sentences)
print("After Word Tokenization")
print(orig_tokens)
print()

orig_no_stopwords=stopword_removal(orig_tokens)
print("After removing stop words")
print(orig_no_stopwords)
print()

orig_alnum=remove_punctuation(orig_no_stopwords)
print("After removing punctuations")
print(orig_alnum)
print()

# b_list keeps its original final meaning: the per-sentence concept lists.
b_list=concept_extraction(orig_alnum)
print("The concepts list extracted from original document")
print(b_list)
print()

ts_1=get_topic_signature(b_list)
print("Topic signature of original document")
print(ts_1)

Original Document
After Sentence Tokenization
['The legal system is made up of civil courts, criminal courts and specialty courts such as family law courts and bankruptcy court.', 'Each court has its own jurisdiction, which refers to the cases that the court is allowed to hear.', 'In some instances, a case can only be heard in one type of court.', 'For example, a bankruptcy case must be heard in a bankruptcy court.', 'In other instances, there may be several potential courts with jurisdiction.', 'For example, a federal criminal court and a state criminal court would each have jurisdiction over a crime that is a federal drug offense but that is also an offense on the state level.']

After Word Tokenization
[['The', 'legal', 'system', 'is', 'made', 'up', 'of', 'civil', 'courts', ',', 'criminal', 'courts', 'and', 'specialty', 'courts', 'such', 'as', 'family', 'law', 'courts', 'and', 'bankruptcy', 'court', '.'], ['Each', 'court', 'has', 'its', 'own', 'jurisdiction', ',', 'which', 'refers',

In [25]:
######################processing plagiarised document#############################


# Open the source inside a context manager so the handle is closed as soon as
# the text has been read; the bare open() used before was never closed.
with open("plagiarised text sample.txt","r") as text2:
    print("Suspected Document")
    susp_sentences=sen_tokenize(text2)

print("After Sentence Tokenization")
print(susp_sentences)
print()

# Each stage gets its own name, so re-running a later step never works on
# already-transformed data.
susp_tokens=wor_tokenize(susp_sentences)
print("After Word Tokenization")
print(susp_tokens)
print()

susp_no_stopwords=stopword_removal(susp_tokens)
print("After removing stop words")
print(susp_no_stopwords)
print()

susp_alnum=remove_punctuation(susp_no_stopwords)
print("After removing punctuations")
print(susp_alnum)
print()

# c_list keeps its original final meaning (per-sentence concept lists);
# the localization cell further down reads it.
c_list=concept_extraction(susp_alnum)
print("The concepts list extracted from suspected document")
print(c_list)
print()

ts_2=get_topic_signature(c_list)
print("Topic signature of suspected document")
print(ts_2)

Suspected Document
After Sentence Tokenization
['The legal system is comprised of criminal and civil courts and specialty courts like bankruptcy and family law courts.', 'Every one of the courts is vested with its own jurisdiction.', 'Jurisdiction means the types of cases each court is permitted to rule on.', 'Sometimes, only one type of court can hear a particular case.', 'For instance, bankruptcy cases an be ruled on only in bankruptcy court.', 'In other situations, it is possible for more than one court to have jurisdiction.', 'For instance, both a state and federal criminal court could have authority over a criminal case that is illegal under federal and state drug laws.']

After Word Tokenization
[['The', 'legal', 'system', 'is', 'comprised', 'of', 'criminal', 'and', 'civil', 'courts', 'and', 'specialty', 'courts', 'like', 'bankruptcy', 'and', 'family', 'law', 'courts', '.'], ['Every', 'one', 'of', 'the', 'courts', 'is', 'vested', 'with', 'its', 'own', 'jurisdiction', '.'], ['Juri

In [26]:
# Similarity of the two topic signatures: shared concepts divided by all
# distinct concepts, as a percentage.
common_concepts = set(ts_1) & set(ts_2)
all_concepts = set(ts_1) | set(ts_2)

# If neither document produced any concept the union is empty; report 0
# instead of failing with ZeroDivisionError.
plag_percentage = len(common_concepts)/len(all_concepts)*100 if all_concepts else 0.0
print("percentage of plagiarism found",plag_percentage)
print()
print("The copied concepts are")
print(list(common_concepts))

percentage of plagiarism found 53.57142857142857

The copied concepts are
['system', 'case', 'law', 'bankruptcy', 'jurisdict', 'state', 'specialti', 'court', 'famili', 'forte', 'instanc', 'bankruptci', 'legal_power', 'type', 'family']


In [27]:
#plagiarism tolerance: a sentence scoring above this percentage is reported as copied
tolerance = 30

#concepts present in both topic signatures
original_concepts = set(ts_1)
suspected_concepts = set(ts_2)
copied_content = list(original_concepts & suspected_concepts)

#function to find which sentences overlap the copied concepts by more than the tolerance
def localization(sen_list, copied=None, threshold=None):
    """Report which sentences overlap the copied concepts above a threshold.

    For every sentence the score is the number of concepts it shares with
    ``copied`` divided by the number of distinct concepts in the two
    together, times 100. Each score is printed in sentence order.

    Parameters
    ----------
    sen_list : iterable of iterable
        One concept list per sentence.
    copied : iterable, optional
        The copied concepts to compare against. Defaults to the
        notebook-level ``copied_content``.
    threshold : number, optional
        A sentence is reported when its score is strictly greater than this
        value. Defaults to the notebook-level ``tolerance``.

    Returns
    -------
    list of int
        1-based positions of the sentences whose score exceeds the threshold.
    """
    # Fall back to the notebook globals so existing one-argument calls behave
    # as before.
    if copied is None:
        copied = copied_content
    if threshold is None:
        threshold = tolerance

    copied_set = set(copied)  # loop-invariant, built once
    copied_sen = []
    print("plagiarism percentage of each sentence from start")
    # enumerate() yields the true position. The earlier sen_list.index(...)
    # lookup returned the first equal sentence, so a repeated sentence was
    # reported under the wrong number.
    for position, lists in enumerate(sen_list, start=1):
        sentence_set = set(lists)
        combined = copied_set | sentence_set
        # An empty sentence compared with empty copied content has no
        # concepts at all; score it 0 rather than dividing by zero.
        plag_percentage1 = len(copied_set & sentence_set)/len(combined)*100 if combined else 0.0
        print(" ",plag_percentage1)
        if plag_percentage1 > threshold:
            copied_sen.append(position)
    return copied_sen

In [29]:
# c_list holds the suspected document's per-sentence concept lists (assigned
# by concept_extraction in the suspected-document cell). localization() prints
# each sentence's overlap score and returns the 1-based numbers of the
# sentences scoring above `tolerance`.
copied_sen2 = localization(c_list)
print(copied_sen2,"sentences are copied with high accuracy in document")

plagiarism percentage of each sentence from start
  60.0
  20.0
  31.25
  13.333333333333334
  33.33333333333333
  17.647058823529413
  31.25
[1, 3, 5, 7] sentences are copied with high accuracy in suspected document
