In [97]:
from ipynb.fs.defs.WordBag import count_percents
from ipynb.fs.full.WordBag import create_bag_of_words_from_file
from ipynb.fs.defs.WordBag import combine_word_bags

In [98]:
import re
import math
import spacy

In [99]:
"""
funkcja liczy miarę fragmentu względem całości
jest to suma spierwiastkowanych odległości
"""
def calculate_measure(fragment_txt, whole_txt, is_fragment, with_stop_words, lemma):
    """Return the distance measure of a text fragment relative to a whole text.

    Both inputs are turned into word bags with ``create_bag_of_words_from_file``
    and then passed through ``count_percents`` (called only for its effect on
    the bag; its return value is not used). The measure is the sum, over every
    word of the fragment bag, of the square root of the absolute difference
    between the word's value in the fragment and in the whole text. A word
    missing from the whole text contributes the square root of its fragment
    value. Words that occur only in the whole text do not contribute.

    Args:
        fragment_txt: Source of the fragment, forwarded to
            ``create_bag_of_words_from_file`` (presumably a file path).
        whole_txt: Source of the reference text, forwarded the same way.
        is_fragment: Forwarded to ``create_bag_of_words_from_file`` for the
            fragment only.
        with_stop_words: Forwarded for the fragment only.
        lemma: Forwarded for the fragment only.

    Returns:
        The accumulated measure; ``0`` when the fragment bag is empty.
    """
    fragment_bag = create_bag_of_words_from_file(fragment_txt, is_fragment, with_stop_words, lemma)
    # NOTE(review): the reference bag is built with the helper's default
    # stop-word/lemma settings, not the ones used for the fragment --
    # confirm that this asymmetry is intended.
    whole_bag = create_bag_of_words_from_file(whole_txt, False)
    count_percents(fragment_bag)
    count_percents(whole_bag)

    total = 0
    for word, share in fragment_bag.items():
        gap = abs(share - whole_bag[word]) if word in whole_bag else share
        total += math.sqrt(gap)
    return total

"""
funkcja liczy miarę fragmentu względem połączonych lektur autora
jest to suma spierwiastkowanych odległości
"""
def calculate_measures(fragment_txt, combined, is_fragment, with_stop_words, lemma):
    """Return the distance measure of a fragment against each author's bag.

    The fragment is turned into a word bag with
    ``create_bag_of_words_from_file`` and passed through ``count_percents``.
    For every author in ``combined`` the measure is the sum, over every word
    of the fragment bag, of the square root of the absolute difference between
    the word's value in the fragment and in the author's bag. A word missing
    from the author's bag contributes the square root of its fragment value.

    Args:
        fragment_txt: Source of the fragment, forwarded to
            ``create_bag_of_words_from_file`` (presumably a file path).
        combined: Mapping of author -> word bag. ``count_percents`` is called
            on each of these bags, so if that helper works in place the
            caller's bags are modified -- TODO confirm.
        is_fragment: Forwarded to ``create_bag_of_words_from_file``.
        with_stop_words: Forwarded to ``create_bag_of_words_from_file``.
        lemma: Forwarded to ``create_bag_of_words_from_file``.

    Returns:
        A dict mapping each author to its measure, in the iteration order of
        ``combined``.
    """
    fragment_bag = create_bag_of_words_from_file(fragment_txt, is_fragment, with_stop_words, lemma)
    count_percents(fragment_bag)

    measures = {}
    for author in combined:
        author_bag = combined[author]
        count_percents(author_bag)
        # Start from any value already stored for this author (0 otherwise).
        running = measures.get(author, 0)
        for word, share in fragment_bag.items():
            gap = abs(share - author_bag[word]) if word in author_bag else share
            running += math.sqrt(gap)
        measures[author] = running
    return measures

"""
przykład użycia
"""
# calculate_measure('fragment.txt', 'testFile1.txt', True, True, True)

# bag1 = create_bag_of_words_from_file('testFile1.txt', False)
# bag2 = create_bag_of_words_from_file('testFile2.txt', False)
# word_bags = []
# word_bags.append(bag1)
# word_bags.append(bag2)
# combined = combine_word_bags(word_bags)

# calculate_measures('fragment.txt', combined, True, True, True)

'\nprzykład użycia\n'

In [100]:
"""
funkcja zwraca najbardziej prawdopodobnych autorów, czyli takich, przy których miara jest najmniejsza w kolejności rosnącej
"""
def find_possible_authors(fragment_txt, combined, is_fragment, with_stop_words, lemma):
    """Return authors ordered from the smallest measure to the largest.

    All arguments are forwarded unchanged to ``calculate_measures``. A smaller
    measure means the fragment is closer to that author's combined texts, so
    the first key of the result is the most likely author.

    Returns:
        A dict mapping author -> measure whose insertion order is ascending
        by measure (ties keep the order produced by ``calculate_measures``).
    """
    measures = calculate_measures(fragment_txt, combined, is_fragment, with_stop_words, lemma)
    # sorted() is stable, so ordering the keys by their value yields the same
    # sequence as ordering the (author, measure) pairs by measure.
    ranked_authors = sorted(measures, key=measures.get)
    return {author: measures[author] for author in ranked_authors}

"""
funkcja zwraca najbardziej prawdopodobnych autorów z procentową szansą
v1 to wersja z dużymi % (nie sumuje się do 100)
v2 to wersja sumująca się do 100
"""
def find_possible_authors_percents_v1(fragment_txt, combined, is_fragment, with_stop_words, lemma):
    """Return authors with a percentage score; scores do not sum to 100.

    All arguments are forwarded unchanged to ``calculate_measures``. Each
    author's score is ``(total - measure) / total * 100``, where ``total`` is
    the sum of all authors' measures, so a smaller measure gives a larger
    score.

    Returns:
        A dict mapping author -> score, ordered by ascending measure (that is,
        from the most likely author to the least likely one). Empty when
        ``calculate_measures`` returns no authors.

    Raises:
        ZeroDivisionError: If there is at least one author and the measures
            sum to zero.
    """
    measures = calculate_measures(fragment_txt, combined, is_fragment, with_stop_words, lemma)
    ranked = sorted(measures.items(), key=lambda item: item[1])

    # Accumulate in ascending order with a plain loop so the floating-point
    # result does not depend on how the builtin sum() rounds.
    total = 0
    for _, measure in ranked:
        total += measure

    return {author: (total - measure) / total * 100 for author, measure in ranked}

def find_possible_authors_percents_v2(fragment_txt, combined, is_fragment, with_stop_words, lemma):
    """Return authors with a percentage score; scores sum to 100.

    All arguments are forwarded unchanged to ``calculate_measures``. Each
    author is weighted by the inverse of its measure and the weights are
    normalised so that they add up to 100.

    A measure of exactly zero means the fragment matches that author's bag
    perfectly. Its inverse weight would be infinite (previously this raised
    ``ZeroDivisionError``), so in that case the perfect matches share the
    100% equally and every other author gets 0.

    Returns:
        A dict mapping author -> percentage, ordered by ascending measure
        (that is, from the most likely author to the least likely one). Empty
        when ``calculate_measures`` returns no authors.
    """
    measures = calculate_measures(fragment_txt, combined, is_fragment, with_stop_words, lemma)
    ranked = sorted(measures.items(), key=lambda item: item[1])

    # Perfect matches cannot be inverted; treat them as the limit of the
    # inverse-distance weighting instead of dividing by zero.
    perfect_matches = sum(1 for _, measure in ranked if measure == 0)
    if perfect_matches:
        share = 100 / perfect_matches
        return {author: share if measure == 0 else 0.0 for author, measure in ranked}

    inverse_total = 0
    for _, measure in ranked:
        inverse_total += 1 / measure

    return {author: 1 / measure / inverse_total * 100 for author, measure in ranked}

"""
przykład użycia
"""
# bag1 = create_bag_of_words_from_file('testFile1.txt', False)
# bag2 = create_bag_of_words_from_file('testFile2.txt', False)
# bag3 = create_bag_of_words_from_file('testFile3.txt', False)
# word_bags = []
# word_bags.append(bag1)
# word_bags.append(bag2)
# word_bags.append(bag3)
# combined = combine_word_bags(word_bags)

# find_possible_authors('fragment.txt', combined, True, True, True)
# find_possible_authors_percents_v1('fragment.txt', combined, True, True, True)
# find_possible_authors_percents_v2('fragment.txt', combined, True, True, True)

'\nprzykład użycia\n'