In [100]:
## All Imports
import glob
import random
from bs4 import BeautifulSoup

In [166]:
## Parameters
N_SAMPLES = 5
N_ESTIMATORS = 10
TRAIN_SIZE = 0.75
HTML_PATH = "*/*.html"

In [131]:
## All Functions
def text_from_html(html):
    soup = BeautifulSoup(html)

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text

def split_phrases_setfy(text):
    return set(text.lower().replace('\n', '.').replace('¶', '.').replace('. ', '.').split('.'))


def get_similarity(html1, html2, corpus_intersect):
    
    clean_sets = html1-corpus_intersect
    clean_sets1 = html2-corpus_intersect
    similarity = len(clean_sets.intersection(clean_sets1))/len(clean_sets.union(clean_sets1))
    return similarity

def read_html(h):
    with open(h) as f:
        html = f.read()
    return html

def html_text_from_path(html_path):
    h = read_html(html_path)
    h = text_from_html(h)
    h = split_phrases_setfy(h)
    return h

def get_corpus_intersect(all_html, n_samples):
    html_path_sample = random.sample(all_html, n_samples)
    html_sources = [read_html(h) for h in html_path_sample]
    html_sources = [text_from_html(h) for h in html_sources]
    html_sets = [split_phrases_setfy(h) for h in html_sources]
    corpus_intersect = set.intersection(*html_sets)
    return corpus_intersect

def intersect_model(all_html, n_samples, n_estimators):
    all_models = []
    for n in range(n_estimators):
        all_models.append(get_corpus_intersect(all_html, n_samples))

    model = set.union(*all_models)
    return model

In [167]:
# Get filepaths of htmls
all_html = glob.glob(HTML_PATH)
#all_html = [h for h in all_html if 'PT' not in h]

In [142]:
# Train test split
train_samples = int(TRAIN_SIZE*len(all_html))
train_html = all_html[:train_samples]
test_html = all_html[train_samples:]

In [144]:
# Create set of common sentences among those documents
model = intersect_model(train_html, N_SAMPLES, N_ESTIMATORS)

In [161]:
# Compare documents in the test set
for i, h1 in enumerate(test_html[:-1]):
    for j in range(i+1, len(test_html)):
        print('='*80)
        print(f'Index {i} vs idx {j}')
        h2 = test_html[j]
        h_text_1 = html_text_from_path(h1)
        h_text_2 = html_text_from_path(h2)
        similarity = get_similarity(h_text_1, h_text_2, model)
        print(f'Similarity is {similarity}')

Index 0 vs idx 1
Similarity is 0.6734006734006734
Index 0 vs idx 2
Similarity is 0.10594315245478036
Index 0 vs idx 3
Similarity is 0.06443914081145585
Index 0 vs idx 4
Similarity is 0.08533333333333333
Index 0 vs idx 5
Similarity is 0.06179775280898876
Index 0 vs idx 6
Similarity is 0.09976798143851508
Index 0 vs idx 7
Similarity is 0.0979020979020979
Index 0 vs idx 8
Similarity is 0.10250569476082004
Index 0 vs idx 9
Similarity is 0.08423913043478261
Index 0 vs idx 10
Similarity is 0.05152671755725191
Index 0 vs idx 11
Similarity is 0.055865921787709494
Index 0 vs idx 12
Similarity is 0.08235294117647059
Index 0 vs idx 13
Similarity is 0.07526881720430108
Index 0 vs idx 14
Similarity is 0.059278350515463915
Index 0 vs idx 15
Similarity is 0.05747126436781609
Index 0 vs idx 16
Similarity is 0.0945273631840796
Index 0 vs idx 17
Similarity is 0.058931860036832415
Index 0 vs idx 18
Similarity is 0.09965034965034965
Index 0 vs idx 19
Similarity is 0.058673469387755105
Index 0 vs idx 20
Si