This repository has been archived by the owner on Feb 26, 2021. It is now read-only.
forked from jgera/plagiarism
-
Notifications
You must be signed in to change notification settings - Fork 0
/
plagiarism.py
131 lines (104 loc) · 4.17 KB
/
plagiarism.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
from io import StringIO, BytesIO
import sys, re, string, logging
from collections import defaultdict
from datetime import datetime
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import SnowballStemmer
import nltk.data
from network import download
from ssk import SSK
def timestr():
    """Return the current local wall-clock time formatted as ``HH:MM:SS``."""
    now = datetime.now()
    return f"{now:%H:%M:%S}"
def decodepdf(fp, debug = False):
    """Extract the text of a PDF document.

    Args:
        fp: Open file-like object positioned at the start of the PDF data;
            it is handed unchanged to pdfminer's ``process_pdf``.
        debug: When true, print begin/end timestamps around the conversion.

    Returns:
        Everything the pdfminer ``TextConverter`` wrote, as one string.

    Raises:
        Whatever ``process_pdf`` raises; logging is re-enabled before the
        exception propagates.
    """
    with StringIO() as outfp:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, outfp)
        # Mute log records of severity WARNING and below for the duration of
        # the conversion (presumably to hide pdfminer's own warnings).
        logging.disable(logging.WARNING)
        try:
            if debug: print("processing pdf begin ({0})".format(timestr()))
            process_pdf(rsrcmgr, device, fp)
            if debug: print("processing pdf ended ({0})".format(timestr()))
        finally:
            # logging.disable() is process-wide; undo it even when the
            # conversion fails, otherwise every later warning is lost.
            logging.disable(logging.NOTSET)
        return outfp.getvalue()
def decodetxt(fp):
    """Return whatever is still unread in the open file-like object *fp*."""
    contents = fp.read()
    return contents
def readfile(file, debug = False):
    """Return the text of a local ``.txt`` or ``.pdf`` file, best effort.

    Args:
        file: Path of the file; the extension selects the decoder.
        debug: Forwarded to ``decodepdf``; when true, a failure is also
            reported on standard output instead of being dropped silently.

    Returns:
        The file's text, or ``None`` when the extension is not supported or
        the file could not be read or decoded.
    """
    try:
        if file.endswith('.txt'):
            with open(file, 'r') as fp:
                return fp.read()
        if file.endswith('.pdf'):
            with open(file, 'rb') as fp:
                return decodepdf(fp, debug=debug)
    except Exception as err:
        # Deliberately best effort: ordinary failures yield None. Catching
        # Exception (not everything) lets KeyboardInterrupt and SystemExit
        # propagate to the caller.
        if debug: print("failed to read {0}: {1}".format(file, err))
    return None
def downloadfile(url, debug = False):
    """Download *url* and return its text, best effort.

    Args:
        url: Address passed to ``download``; its ``.txt`` / ``.pdf`` suffix
            selects the decoder.
        debug: Forwarded to ``decodepdf``; when true, a decoding failure is
            also reported on standard output.

    Returns:
        The decoded text (``.txt`` payloads are decoded as UTF-8), or
        ``None`` when the suffix is not supported or decoding failed.

    Raises:
        Whatever ``download`` raises; only the decoding step is best effort.
    """
    data = download(url)
    try:
        if url.endswith('.txt'):
            return data.decode('utf-8')
        if url.endswith('.pdf'):
            return decodepdf(BytesIO(data), debug=debug)
    except Exception as err:
        # Deliberately best effort: ordinary failures yield None. Catching
        # Exception (not everything) lets KeyboardInterrupt and SystemExit
        # propagate to the caller.
        if debug: print("failed to decode {0}: {1}".format(url, err))
    return None
# Memo of getwords() results, keyed by (text, tuple(langs)). It is never
# pruned, so every distinct text processed stays referenced.
words_cache = {}

# Matches any ASCII punctuation character, ASCII digit or en dash (U+2013).
_PUNCT_RE = re.compile('[%s0-9\u2013]' % re.escape(string.punctuation))

def getwords(text, langs=("english", "russian"), debug = False):
    """Turn *text* into a list of normalised word stems.

    Processing steps, in order: Treebank tokenisation, removal of tokens of
    length two or less, removal of tokens containing punctuation, digits or
    an en dash, lower-casing, removal of the NLTK stopwords of every language
    in *langs*, and finally stemming with the Snowball stemmer of each
    language in turn (every stemmer is applied to every word).

    Args:
        text: Text to process; it is converted with ``str()`` before
            tokenising. ``None`` is treated as "no text".
        langs: Iterable of NLTK language names.
        debug: When true, print begin/end timestamps for every step.

    Returns:
        The list of stems. The same list object is stored in ``words_cache``
        and returned again on later calls with the same arguments, so
        callers should not modify it in place.
    """
    if text is None:
        # readfile()/downloadfile() return None on failure; str(None) would
        # otherwise be tokenised as the literal word "None".
        return []
    langs = tuple(langs)
    key = (text, langs)
    if key in words_cache:
        if debug: print("found words in cache")
        return words_cache[key]
    if debug: print("tokenize begin ({0})".format(timestr()))
    words = TreebankWordTokenizer().tokenize(str(text))
    if debug: print("tokenize ended ({0})".format(timestr()))
    if debug: print("del short words begin ({0})".format(timestr()))
    words = [word for word in words if len(word) > 2]
    if debug: print("del short words ended ({0})".format(timestr()))
    if debug: print("punctuation begin ({0})".format(timestr()))
    words = [word for word in words if not _PUNCT_RE.search(word)]
    if debug: print("punctuation ended ({0})".format(timestr()))
    if debug: print("stopwords begin ({0})".format(timestr()))
    words = [word.lower() for word in words]
    # One set for all languages: constant-time membership tests and a single
    # pass instead of one list scan per word per language.
    stops = set()
    for lang in langs:
        stops.update(stopwords.words(lang))
    words = [word for word in words if word not in stops]
    if debug: print("stopwords ended ({0})".format(timestr()))
    if debug: print("stemming begin ({0})".format(timestr()))
    stemmers = [SnowballStemmer(lang) for lang in langs]
    for stemmer in stemmers:
        words = [stemmer.stem(word) for word in words]
    if debug: print("stemming ended ({0})".format(timestr()))
    words_cache[key] = words
    return words
def getkeywords(text, langs=["english", "russian"], num = 10, debug = False):
    """Return the *num* most frequent word stems of *text*.

    The text is normalised with ``getwords``; the stems are then ranked by
    descending number of occurrences and only the stems themselves (without
    their counts) are returned.
    """
    tally = defaultdict(int)
    for stem in getwords(text, langs, debug=debug):
        tally[stem] += 1
    if debug: print("sorting begin ({0})".format(timestr()))
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    top = ranked[:num]
    if debug: print("sorting ended ({0})".format(timestr()))
    return [stem for stem, _ in top]
def evaluate(text1, text2, langs = ("english", "russian"), debug = False,
             block_sz = 100, threshold = 0.1, subseq_len = 3):
    """Find pairs of word blocks from two texts that score as similar.

    Both texts are normalised with ``getwords`` and cut into consecutive
    blocks of at most *block_sz* stems. Every block of the first text is
    compared with every block of the second one via ``SSK(s, t).solve(...)``.

    NOTE(review): ``SSK`` comes from the project's ``ssk`` module, which is
    not visible here; presumably it is a string-subsequence-kernel similarity
    score. Confirm its value range before tuning *threshold*.

    Args:
        text1: First text.
        text2: Second text.
        langs: Iterable of NLTK language names, forwarded to ``getwords``.
        debug: Forwarded to ``getwords``.
        block_sz: Number of stems per block; must be a positive integer
            (it is used as a ``range`` step).
        threshold: Only pairs scoring strictly above this value are kept.
        subseq_len: Argument passed to ``SSK.solve``.

    Returns:
        A list of ``(score, block_from_text1, block_from_text2)`` tuples in
        the order the pairs were compared.
    """
    words1 = getwords(text1, langs=langs, debug=debug)
    words2 = getwords(text2, langs=langs, debug=debug)
    text1s = [words1[i:i + block_sz] for i in range(0, len(words1), block_sz)]
    text2s = [words2[i:i + block_sz] for i in range(0, len(words2), block_sz)]
    blocks = []
    for s in text1s:
        for t in text2s:
            res = SSK(s, t).solve(subseq_len)
            if res > threshold:
                blocks.append((res, s, t))
    return blocks