In [1]:
import re
import pandas as pd
import numpy as np
from binascii import crc32

In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from pymystem3 import Mystem
# Single shared lemmatizer instance; preprocess_text() calls mystem.lemmatize().
mystem = Mystem()

# NLTK's Russian stop-word list, extended with 'это'.
russian_stopwords = stopwords.words("russian") + ['это']

# Hand-written list of Ukrainian stop words.
ukrainian_stopwords = [
    'з', 'й', 'що', 'та', 'із', 'але', 'цей', 'коли', 'як', 'чого',
    'хоча', 'нам', 'якось', 'чи', 'це', 'від', 'їх', 'інших', 'ти', 'він',
    'вона', 'воно', 'ми', 'ви', 'вони', 'якого', 'яких', 'яким', 'є', 'чому',
    'чим', 'де', 'десь', 'собі', 'свій',
]

# Combined list (Russian entries first, then Ukrainian) used for token filtering.
stop_words = [*russian_stopwords, *ukrainian_stopwords]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oleg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Show the combined stop-word list as one comma-separated line.
# NOTE(review): this cell's execution count (25) is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
print(','.join(stop_words))

и,в,во,не,что,он,на,я,с,со,как,а,то,все,она,так,его,но,да,ты,к,у,же,вы,за,бы,по,только,ее,мне,было,вот,от,меня,еще,нет,о,из,ему,теперь,когда,даже,ну,вдруг,ли,если,уже,или,ни,быть,был,него,до,вас,нибудь,опять,уж,вам,ведь,там,потом,себя,ничего,ей,может,они,тут,где,есть,надо,ней,для,мы,тебя,их,чем,была,сам,чтоб,без,будто,чего,раз,тоже,себе,под,будет,ж,тогда,кто,этот,того,потому,этого,какой,совсем,ним,здесь,этом,один,почти,мой,тем,чтобы,нее,сейчас,были,куда,зачем,всех,никогда,можно,при,наконец,два,об,другой,хоть,после,над,больше,тот,через,эти,нас,про,всего,них,какая,много,разве,три,эту,моя,впрочем,хорошо,свою,этой,перед,иногда,лучше,чуть,том,нельзя,такой,им,более,всегда,конечно,всю,между,это,з,й,що,та,із,але,цей,коли,як,чого,хоча,нам,якось,чи,це,від,їх,інших,ти,він,вона,воно,ми,ви,вони,якого,яких,яким,є,чому,чим,де,десь,собі,свій


In [3]:
def preprocess_text(text):
    """Lemmatize ``text`` and strip stop words from it.

    The text is lower-cased and passed to the module-level ``mystem``
    lemmatizer. Every returned token that is exactly a single space, or that
    appears in ``stop_words``, is dropped; all other tokens (including any
    non-word tokens the lemmatizer emits) are kept in order.

    Args:
        text: input string.

    Returns:
        The surviving tokens joined with single spaces.
    """
    kept = []
    for lemma in mystem.lemmatize(text.lower()):
        # Skip stop words and the bare-space separator tokens.
        if lemma in stop_words or lemma == " ":
            continue
        kept.append(lemma)
    return " ".join(kept)

In [4]:
# Base names (without directory or extension) of the texts to compare.
# Two corpora ("gogol", "nl"), three fragments each, in three variants
# ("ukr", "rus", "auto"); the order here fixes the row/column order of the
# similarity tables.
filenames = [
    # gogol corpus
    'gogol_1_ukr',
    'gogol_2_ukr',
    'gogol_3_ukr',
    'gogol_1_rus',
    'gogol_2_rus',
    'gogol_3_rus',
    'gogol_1_auto',
    'gogol_2_auto',
    'gogol_3_auto',
    # nl corpus
    'nl_1_ukr',
    'nl_2_ukr',
    'nl_3_ukr',
    'nl_1_rus',
    'nl_2_rus',
    'nl_3_rus',
    'nl_1_auto',
    'nl_2_auto',
    'nl_3_auto',
]

In [5]:
def read_file(filename):
    """Return the full contents of ``files/<filename>.txt`` as a string.

    Args:
        filename: base name of the file, without directory or extension.

    Returns:
        The file's text, decoded as UTF-8.
    """
    path = 'files/' + filename + '.txt'
    with open(path, 'r', encoding='utf8') as handle:
        return handle.read()

In [6]:
def clean_data(data):
    """Normalize raw text before it is split into sentences and shingles.

    Steps, in order:
      1. lower-case the text;
      2. delete the characters ``' " : ; , + -`` (so hyphenated words are
         fused into one word);
      3. turn ``?`` and ``!`` into ``.`` so that every sentence ends the
         same way;
      4. collapse every run of whitespace (including newlines) into a
         single space.

    Args:
        data: input text.

    Returns:
        The normalized text.
    """
    data = data.lower()
    # Raw strings: "\+" and "\s" are regex escapes, not Python string escapes.
    # Non-raw literals with these sequences trigger a SyntaxWarning on
    # Python 3.12+.
    data = re.sub(r'''['":;,\+-]''', '', data)
    data = re.sub(r'[?!]', '.', data)
    data = re.sub(r'\s+', ' ', data)
    return data

In [7]:
def get_shingles(data, shingle_length=3, sort=False, preprocess=False):
    """Split text into word shingles, sentence by sentence.

    The text is normalized with ``clean_data`` (and optionally
    ``preprocess_text``), split into sentences on ``.``, and each non-empty
    sentence is turned into overlapping windows of ``shingle_length`` words.
    Shingles never span a sentence boundary.

    Args:
        data: raw input text.
        shingle_length: number of words per shingle.
        sort: if true, the words of each sentence are sorted before windows
            are taken.
        preprocess: if true, ``preprocess_text`` is applied after cleaning.

    Returns:
        List of shingles (space-joined word groups) in order of appearance.
        A sentence with at most ``shingle_length`` words contributes itself
        as a single shingle.
    """
    text = clean_data(data)
    if preprocess:
        text = preprocess_text(text)

    shingles = []
    for chunk in text.split('.'):
        sentence = chunk.strip()
        if sentence == '':
            continue

        words = sentence.split()
        if sort:
            words.sort()

        if len(words) <= shingle_length:
            # Too short for a sliding window: keep the whole sentence.
            shingles.append(' '.join(words))
        else:
            window_count = len(words) - shingle_length + 1
            shingles.extend(
                ' '.join(words[start:start + shingle_length])
                for start in range(window_count)
            )
    return shingles

In [8]:
# Largest 32-bit unsigned value (crc32 results fit in this range).
# Not referenced by the other visible cells.
max_shingle_id = (1 << 32) - 1
# Modulus for the hash family below; equals 2**32 + 15, i.e. just above the
# crc32 range.
prime = 4294967311
# Coefficients of the 84 hash functions h_i(x) = (a[i] * x + b[i]) % prime:
# a[i] = (i + 1)**2 and b[i] = (i + 1)**3.
a = [n * n for n in range(1, 85)]
b = [n * n * n for n in range(1, 85)]

In [9]:
def custom_hash(i, h):
    """Apply the ``i``-th hash function of the family to the value ``h``.

    Computes ``(a[i] * h + b[i]) % prime`` using the module-level
    coefficient tables ``a`` and ``b`` and the modulus ``prime``.

    Args:
        i: index of the hash function (index into ``a`` and ``b``).
        h: integer value to hash.

    Returns:
        The hashed integer, in ``range(prime)``.
    """
    scaled = a[i] * h
    shifted = scaled + b[i]
    return shifted % prime

In [10]:
def get_signature(shingles):
    """Build the MinHash signature of a collection of shingles.

    Every shingle is reduced to its CRC-32 checksum. For each hash function
    index ``i`` the signature component is the minimum of
    ``custom_hash(i, checksum)`` over all shingles.

    Args:
        shingles: iterable of shingle strings.

    Returns:
        List of integers with one component per entry of the coefficient
        table ``a`` (84 in this notebook).

    Raises:
        ValueError: if ``shingles`` is empty, since a minimum over no
            shingles is undefined.
    """
    hashes = [crc32(shingle.encode()) for shingle in shingles]
    if not hashes:
        # Same exception type min() would raise, but with a readable message.
        raise ValueError('cannot build a signature from an empty list of shingles')

    # One component per hash function; len(a) keeps the signature length in
    # step with the coefficient tables instead of repeating the literal 84.
    return [
        min(custom_hash(i, h) for h in hashes)
        for i in range(len(a))
    ]

In [11]:
def compare_signatures(sig1, sig2):
    """Return the fraction of matching components in two signatures.

    Only the first 84 positions of each signature are compared.

    Args:
        sig1: first signature (indexable, at least 84 items).
        sig2: second signature (indexable, at least 84 items).

    Returns:
        Number of positions where the signatures agree, divided by 84;
        a float between 0 and 1.
    """
    matches = sum(1 for pos in range(84) if sig1[pos] == sig2[pos])
    return matches / 84

In [12]:
def compare(f1, f2, sort=False, preprocess=False):
    """Estimate the similarity of two text files via their signatures.

    Both files are read with ``read_file``, shingled with ``get_shingles``,
    turned into signatures with ``get_signature``, and the signatures are
    scored with ``compare_signatures``.

    Args:
        f1: base name of the first file (as accepted by ``read_file``).
        f2: base name of the second file.
        sort: forwarded to ``get_shingles``.
        preprocess: forwarded to ``get_shingles``.

    Returns:
        The value returned by ``compare_signatures`` for the two signatures.
    """
    texts = [read_file(name) for name in (f1, f2)]
    shingle_lists = [
        get_shingles(text, sort=sort, preprocess=preprocess) for text in texts
    ]
    first_sig, second_sig = [get_signature(item) for item in shingle_lists]
    return compare_signatures(first_sig, second_sig)

In [13]:
def main(sort=False, preprocess=False):
    """Build the pairwise similarity table for every file in ``filenames``.

    ``compare`` is evaluated once for each pair on or above the diagonal and
    the score is mirrored into the lower triangle, so the table is symmetric.

    Args:
        sort: forwarded to ``compare``.
        preprocess: forwarded to ``compare``.

    Returns:
        A DataFrame indexed and labelled by ``filenames`` holding the
        ``compare`` score of each pair.
    """
    table = pd.DataFrame(index=filenames, columns=filenames)
    n_rows, n_cols = table.shape
    for row in range(n_rows):
        for col in range(row, n_cols):
            score = compare(
                table.columns[row],
                table.columns[col],
                sort=sort,
                preprocess=preprocess,
            )
            table.iat[row, col] = score
            table.iat[col, row] = score
    return table

In [14]:
# Baseline run: default settings (no per-sentence word sorting, no lemmatization).
result = main()
# Bare expression so the notebook renders the table.
result

Unnamed: 0,gogol_1_ukr,gogol_2_ukr,gogol_3_ukr,gogol_1_rus,gogol_2_rus,gogol_3_rus,gogol_1_auto,gogol_2_auto,gogol_3_auto,nl_1_ukr,nl_2_ukr,nl_3_ukr,nl_1_rus,nl_2_rus,nl_3_rus,nl_1_auto,nl_2_auto,nl_3_auto
gogol_1_ukr,1.0,0.0238095,0.0119048,0.0,0.0,0.0,0.0119048,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_ukr,0.0238095,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_3_ukr,0.0119048,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_1_rus,0.0,0.0,0.0,1.0,0.0119048,0.0,0.0952381,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_rus,0.0,0.0,0.0,0.0119048,1.0,0.0,0.0,0.0952381,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_3_rus,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0595238,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_1_auto,0.0119048,0.0,0.0,0.0952381,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_auto,0.0,0.0,0.0,0.0,0.0952381,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_3_auto,0.0,0.0,0.0,0.0,0.0,0.0595238,0.0,0.0,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
nl_1_ukr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0119048,0.0,0.0,0.0833333,0.0,0.0


In [15]:
# Save the baseline similarity table next to the notebook.
result.to_csv('result.csv')

In [16]:
# Second run: words inside each sentence are sorted before shingling.
result_sort = main(sort=True)
# Bare expression so the notebook renders the table.
result_sort

Unnamed: 0,gogol_1_ukr,gogol_2_ukr,gogol_3_ukr,gogol_1_rus,gogol_2_rus,gogol_3_rus,gogol_1_auto,gogol_2_auto,gogol_3_auto,nl_1_ukr,nl_2_ukr,nl_3_ukr,nl_1_rus,nl_2_rus,nl_3_rus,nl_1_auto,nl_2_auto,nl_3_auto
gogol_1_ukr,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_ukr,0,1.0,0.0119048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_3_ukr,0,0.0119048,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_1_rus,0,0.0,0.0,1.0,0.0,0.0,0.0595238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_rus,0,0.0,0.0,0.0,1.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0119048,0.0,0.0,0.0,0.0,0.0
gogol_3_rus,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_1_auto,0,0.0,0.0,0.0595238,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_auto,0,0.0,0.0,0.0,0.047619,0.0,0.0,1.0,0.0119048,0.0,0.0,0.0,0.0119048,0.0119048,0.0,0.0119048,0.0119048,0.0
gogol_3_auto,0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0119048,1.0,0.0,0.0,0.0,0.0119048,0.0119048,0.0,0.0119048,0.0119048,0.0
nl_1_ukr,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0119048,0.0,0.0,0.047619,0.0,0.0


In [17]:
# Save the sorted-words similarity table.
result_sort.to_csv('result_sort.csv')

In [18]:
# Third run: text goes through preprocess_text() before shingling.
result_preprocess = main(preprocess=True)
# Bare expression so the notebook renders the table.
result_preprocess

Unnamed: 0,gogol_1_ukr,gogol_2_ukr,gogol_3_ukr,gogol_1_rus,gogol_2_rus,gogol_3_rus,gogol_1_auto,gogol_2_auto,gogol_3_auto,nl_1_ukr,nl_2_ukr,nl_3_ukr,nl_1_rus,nl_2_rus,nl_3_rus,nl_1_auto,nl_2_auto,nl_3_auto
gogol_1_ukr,1.0,0.0,0,0.0238095,0.0,0.0,0.0119048,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_ukr,0.0,1.0,0,0.0,0.0119048,0.0,0.0,0.0119048,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_3_ukr,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_1_rus,0.0238095,0.0,0,1.0,0.0,0.0,0.0952381,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_rus,0.0,0.0119048,0,0.0,1.0,0.0,0.0,0.0833333,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_3_rus,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0952381,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_1_auto,0.0119048,0.0,0,0.0952381,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_2_auto,0.0,0.0119048,0,0.0,0.0833333,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
gogol_3_auto,0.0,0.0,0,0.0,0.0,0.0952381,0.0,0.0,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
nl_1_ukr,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0119048,0.0,0.0


In [19]:
# Save the preprocessed-text similarity table.
result_preprocess.to_csv('result_preprocess.csv')