Here I want to update the frontier so that we can continue crawling from a better frontier with an improved priority function.

## Filtering Frontier

We already have a lot of commons.wikimedia and uni-tuebingen pages, so we want to filter these URLs out of the frontier now.

In [67]:
import os
import random
import csv
import heapq
from urllib.parse import urlparse, urljoin, unquote_plus


In [83]:
def read_saved_frontier(filename: str) -> list:
    """Load a saved frontier CSV into a list of ``(int, int, str)`` tuples.

    Each non-empty row must have at least three columns: the first two are
    parsed as integers and the third is kept as a string (the URL).

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        One ``(int, int, str)`` tuple per non-empty row, in file order.

    Raises:
        ValueError: If one of the first two columns is not an integer.
        IndexError: If a non-empty row has fewer than three columns.
    """
    with open(filename, newline='', encoding='utf-8') as csvfile:
        return [
            (int(row[0]), int(row[1]), row[2])
            for row in csv.reader(csvfile)
            # csv.reader yields [] for blank lines (e.g. a trailing empty
            # line); skip those instead of failing on row[0].
            if row
        ]


def read_saved_visited(filename: str) -> list:
    """Return every cell of the CSV file *filename* as one flat list.

    Cells are returned in reading order (row by row, left to right); the
    row structure itself is discarded.
    """
    with open(filename, newline='', encoding='utf-8') as csvfile:
        return [cell for cells in csv.reader(csvfile) for cell in cells]


def save_set_to_csv(filename: str, visited: set) -> None:
    """Write every element of *visited* to *filename*, one per CSV row.

    The file is overwritten. Rows are written in the set's iteration order,
    which is arbitrary.

    Args:
        filename: Destination path.
        visited: Collection of strings (URLs or domains) to persist.
    """
    # newline='' is what the csv module requires so that no extra blank
    # lines appear on platforms that translate '\n'; utf-8 matches the
    # encoding used by the reader helpers in this notebook.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([url] for url in visited)

In [3]:
# Load the saved crawl frontier; each entry is an (int, int, str) tuple whose
# last field is the URL (the first two are used as priority and depth below).
current_frontier = read_saved_frontier('frontier.csv')
# Maps netloc -> number of frontier URLs on that netloc.
frontier_domains = {}

def domain_counter(url, dictionary):
    """Increment the counter stored in *dictionary* for the netloc of *url*.

    The key is ``urlparse(url).netloc`` exactly as parsed, so a port stays
    part of the key and a string without a network location counts under ''.
    The mapping is updated in place; nothing is returned.
    """
    netloc = urlparse(url).netloc
    if netloc in dictionary:
        dictionary[netloc] += 1
    else:
        dictionary[netloc] = 1


# Count how many frontier URLs belong to each domain.
for _priority, _depth, frontier_url in current_frontier:
    domain_counter(frontier_url, frontier_domains)


In [4]:
# The saved-pages file is read with the same (int, int, str) row layout as
# the frontier, so the frontier loader is reused here.
current_saved_pages = read_saved_frontier('saved_pages.csv')

# Maps netloc -> number of already saved pages on that netloc.
saved_domains = {}
for _first, _second, page_url in current_saved_pages:
    domain_counter(page_url, saved_domains)


In [5]:
# (domain, count) pairs ordered from the most to the least saved domain.
# Sorting ascending on the negated count keeps ties in their original order,
# exactly like a stable descending sort.
sorted_saved_pages_domains = sorted(
    saved_domains.items(),
    key=lambda pair: -pair[1],
)

print(f'Amount of saved domains: {len(sorted_saved_pages_domains)}\n')


# Optional: dump the full ranking
# for domain, count in sorted_saved_pages_domains:
#     print(f"{domain}: {count}")

Amount of saved domains: 244



In [6]:
# Based on the ranking above we can now filter domains from the frontier that
# are already very common in the saved pages.
# print(sorted_saved_pages_domains[:25])

# Total number of frontier entries that belong to the 25 most-saved domains.
n_to_remove = 0

# Slicing (instead of indexing with range(25)) also works when fewer than
# 25 domains have been saved so far.
for domain, saved_count in sorted_saved_pages_domains[:25]:
    print(f'is in saved pages: {saved_count}')
    if domain in frontier_domains:
        print(f'Domain in frontier: {domain} | {frontier_domains[domain]}')
        n_to_remove += frontier_domains[domain]
    else:
        print(f'Domain not in frontier: {domain}')
    print('\n')

print(n_to_remove)

is in saved pages: 4671
Domain in frontier: uni-tuebingen.de | 72849


is in saved pages: 1338
Domain in frontier: commons.wikimedia.org | 3250


is in saved pages: 520
Domain in frontier: www.iwm-tuebingen.de | 6713


is in saved pages: 362
Domain in frontier: www.uni-tuebingen.de | 17734


is in saved pages: 147
Domain in frontier: www.neuroschool-tuebingen.de | 449


is in saved pages: 129
Domain in frontier: www.unimuseum.uni-tuebingen.de | 891


is in saved pages: 127
Domain in frontier: en.wikipedia.org | 1466


is in saved pages: 126
Domain in frontier: allevents.in | 602


is in saved pages: 103
Domain in frontier: www.komoot.com | 583


is in saved pages: 100
Domain in frontier: www.dai-tuebingen.de | 515


is in saved pages: 94
Domain in frontier: www.booking.com | 88


is in saved pages: 91
Domain in frontier: www.medizin.uni-tuebingen.de | 1815


is in saved pages: 83
Domain in frontier: www.is.tuebingen.mpg.de | 3560


is in saved pages: 79
Domain in frontier: www.tuebinge

In [7]:
# The (up to) 25 most-saved domains, most frequent first. Kept as a list
# because later cells print it in this order; slicing avoids an IndexError
# when fewer than 25 domains exist.
domains_to_remove_from_frontier = [
    domain for domain, _count in sorted_saved_pages_domains[:25]
]

# Set copy for O(1) membership tests while scanning the whole frontier.
_domains_to_remove_lookup = set(domains_to_remove_from_frontier)

# Keep only the entries whose netloc is not one of the over-represented
# domains. The comparison is on the exact netloc, so a host given with an
# explicit port (e.g. 'host:443') is treated as a different domain.
updated_frontier = [
    entry
    for entry in current_frontier
    if urlparse(entry[2]).netloc not in _domains_to_remove_lookup
]


In [8]:
# List the domains that are filtered out of the frontier, most-saved first.
for domain in domains_to_remove_from_frontier:
    print(domain)

uni-tuebingen.de
commons.wikimedia.org
www.iwm-tuebingen.de
www.uni-tuebingen.de
www.neuroschool-tuebingen.de
www.unimuseum.uni-tuebingen.de
en.wikipedia.org
allevents.in
www.komoot.com
www.dai-tuebingen.de
www.booking.com
www.medizin.uni-tuebingen.de
www.is.tuebingen.mpg.de
www.tuebingen.mpg.de
www.timeanddate.com
tuebingen.ai
tuebingenresearchcampus.com
db.cs.uni-tuebingen.de
www.kyb.tuebingen.mpg.de
jazz-concerts.com
imprs-mmfd.tuebingen.mpg.de
hiiker.app
www.krone-tuebingen.de
wanderlog.com
portal.mlcloud.uni-tuebingen.de


In [9]:
# Sanity check: the number of entries actually dropped should equal the
# count predicted from the per-domain frontier totals (n_to_remove).
print(f'removed {len(current_frontier) - len(updated_frontier)} entries from frontier')
print(n_to_remove)

removed 114777 entries from frontier
114777


In [10]:
# Preview the first ten entries that survived the filtering.
for entry in updated_frontier[:10]:
    print(entry)

(-18, 3, 'https://www.jura.uni-tuebingen.de/einrichtungen/ifk/listserv')
(-18, 3, 'https://www.jstor.org/action/doBasicSearch?Query=%22Alter+Botanischer+Garten+T%C3%BCbingen%22&acc=on&wc=on')
(-18, 3, 'https://www.jstor.org/action/doBasicSearch?Query=%22Faculty+of+Roman-Catholic+Theology%2C+University+of+T%C3%BCbingen%22&acc=on&wc=on')
(-18, 3, 'https://www.jura.uni-tuebingen.de/einrichtungen/ifk/www')
(-18, 3, 'https://www.jura.uni-tuebingen.de/professoren_und_dozenten/vonbernstorff/')
(-18, 3, 'https://www.kreis-tuebingen.de/Startseite.html')
(-18, 3, 'https://www.mnf.uni-tuebingen.de/fachbereiche/geowissenschaften/sammlungen/mineralogische-sammlung.html')
(-18, 3, 'https://www.kreis-tuebingen.de/bauen_+verkehr/verkehr/kfz#anker11296032')
(-18, 3, 'https://www.iwm-tuebingen.de:443/www/de/probanden/index.html')
(-18, 3, 'https://www.krebskranke-kinder-tuebingen.de/wir-helfen/familie/elternhaus-familienhaus')


In [11]:
# Size of the frontier after filtering.
print(len(updated_frontier))

26296


In [12]:
# Recount the domains, this time on the filtered frontier.
updated_frontier_domains = {}
for _priority, _depth, frontier_url in updated_frontier:
    domain_counter(frontier_url, updated_frontier_domains)

# (domain, count) pairs, most frequent first; ties keep insertion order.
sorted_updated_frontier_domains = sorted(
    updated_frontier_domains.items(),
    key=lambda pair: -pair[1],
)

for domain_and_count in sorted_updated_frontier_domains:
    print(domain_and_count)


('web.archive.org', 4625)
('www.hih-tuebingen.de', 1447)
('www.cmfi.uni-tuebingen.de', 1120)
('www.eventbrite.com', 1092)
('www.fml.tuebingen.mpg.de', 736)
('doi.org', 721)
('www.youtube.com', 668)
('www.my-stuwe.de', 639)
('whichmuseum.com', 553)
('www.eye-tuebingen.de', 518)
('alma.uni-tuebingen.de', 500)
('womeninmath.net', 453)
('www.wg-gesucht.de', 432)
('www.ecogsci.cs.uni-tuebingen.de', 395)
('fit.uni-tuebingen.de', 296)
('ps.cs.uni-tuebingen.de', 293)
('bsky.app', 257)
('www.math.uni-tuebingen.de', 245)
('www.linkedin.com', 222)
('alma.uni-tuebingen.de:443', 195)
('www.compsens.uni-tuebingen.de', 186)
('www.eventbrite.de', 183)
('www.researchgate.net', 179)
('youtube.com', 171)
('dx.doi.org', 169)
('wisskomm.social', 169)
('www.bio.mpg.de', 155)
('listserv.uni-tuebingen.de', 148)
('timms.uni-tuebingen.de', 147)
('www.onegreenplanet.org', 147)
('commons.m.wikimedia.org', 143)
('www.facebook.com', 130)
('www.openstreetmap.org', 127)
('is.tuebingen.mpg.de', 126)
('xn--baw-joa.soci

# Recalculate priority scores

In [13]:
import unicodedata

# Lower-case spellings of the city name that count as a match.
TUEBINGENS = ['tübingen', 'tubingen', 'tuebingen']


def contains_tuebingen(text: str) -> bool:
    """Return True if *text* mentions Tübingen in any known spelling.

    The check is case-insensitive. The text is NFC-normalised first so that
    a decomposed "ü" (``u`` followed by U+0308, as can result from decoding
    ``%CC%88`` in a URL) matches as well.

    Args:
        text: Text or already unquoted URL to inspect.

    Returns:
        True when one of ``TUEBINGENS`` occurs in the text; False otherwise,
        including when *text* is not a string (an error line is printed in
        that case, keeping the check best-effort).
    """
    try:
        # Normalise and lower-case once instead of once per spelling.
        haystack = unicodedata.normalize('NFC', text).lower()
    except (AttributeError, TypeError) as e:
        # Raised for non-string input such as None or bytes.
        print(f'[ERROR] checking if Tuebingen in page: {e}')
        return False

    return any(tue in haystack for tue in TUEBINGENS)

In [14]:
# All URLs that have already been crawled, de-duplicated.
visited = set(read_saved_visited('visited.csv'))
print(len(visited))

# Distinct netlocs among the visited URLs; the set discards duplicates itself.
visited_domains = {urlparse(visited_url).netloc for visited_url in visited}
print(len(visited_domains))

53814
908


In [15]:
# Spot-check one visited domain. Set order is arbitrary, so index 49 is just
# a sample; this requires at least 50 visited domains.
print(list(visited_domains)[49])

# Membership probe for a single known host.
test = 'www.uni-tuebingen.de' #'www.physiologie2.uni-tuebingen.de'
print(test in visited_domains)

fr.wikipedia.org
True


In [None]:
def calc_priority_score_updated(
    url: str,
    depth: int,
    visited_domains: set,
    current_domain: str,
    max_depth: int = 10
) -> int:
    """Compute the crawl priority score of a URL (higher = crawl earlier).

    The score is the sum of a depth term, a bonus when the URL mentions
    Tübingen, a bonus when its domain has not been visited yet, and a small
    random term. All bonuses shrink linearly with ``depth / max_depth``.

    Args:
        url: The (possibly percent-encoded) URL to score.
        depth: Link distance of the URL, as stored in the frontier entry.
        visited_domains: Domains that have already been crawled.
        current_domain: Domain of *url*, compared against *visited_domains*.
        max_depth: Depth beyond which URLs are rejected.

    Returns:
        The integer score, or -1000 when ``depth`` exceeds ``max_depth``.
    """
    if depth > max_depth:
        print('ERROR: max depth overstepped')
        return -1000  # very low priority -> will never be crawled

    # Linear decay factor shared by all bonuses: 1 at depth 0, 0 at
    # max_depth. A non-positive max_depth gets factor 0 instead of raising
    # ZeroDivisionError (or producing a negative randint bound).
    if max_depth > 0:
        decay = 1 - (depth / max_depth)
    else:
        decay = 0.0

    # 1. Closer pages get higher score
    score = max(0, 12 - depth)

    # 2. URL contains Tübingen -> decays with depth
    if contains_tuebingen(unquote_plus(url)):
        score += max(0, int(4 * decay))

    # 3. New domain bonus to motivate domain diversity -> decays with depth
    if current_domain not in visited_domains:
        score += int(5 * decay)

    # 4. Random exploration to further motivate diversity -> decays with depth
    score += random.randint(0, int(2 * decay))

    return score


In [77]:
# Number of entries that are about to be re-scored.
print(len(updated_frontier))

26296


In [78]:
# Re-score every remaining frontier entry with the updated priority function.
new_prioritized_frontier = []

for _old_priority, depth, url in updated_frontier:
    rescored = calc_priority_score_updated(
        url=url,
        depth=depth,
        visited_domains=visited_domains,
        current_domain=urlparse(url).netloc,
    )

    # The score is stored negated, so a higher score becomes a smaller
    # first tuple element; depth and URL are carried over unchanged.
    new_prioritized_frontier.append((-rescored, depth, url))
    

In [79]:
# The re-scored frontier must have the same length as the filtered one;
# also show the first re-scored entry.
print(len(new_prioritized_frontier))
print(new_prioritized_frontier[0])

26296
(-12, 3, 'https://www.jura.uni-tuebingen.de/einrichtungen/ifk/listserv')


In [80]:
# Confirm the container is a plain list (heapq.heapify operates on lists).
print(type(new_prioritized_frontier))

<class 'list'>


In [81]:
# Sorted copy for inspection: ordered by the stored (negated) priority only,
# so entries with equal priority keep their frontier order.
sorted_new_prioritized_frontier = list(new_prioritized_frontier)
sorted_new_prioritized_frontier.sort(key=lambda item: item[0])

for entry in sorted_new_prioritized_frontier:
    print(entry)


(-16, 1, 'https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en')
(-16, 1, 'https://itunes.apple.com/us/app/food-monster-5000+-recipes./id1052988561')
(-16, 1, 'http://')
(-16, 1, 'https://www.gamberorosso.it/contattaci#registrazionetribunale')
(-16, 1, 'http://www.jamclub.de/')
(-16, 1, 'http://www.kloster-bebenhausen.de/en/home/')
(-16, 1, 'https://www.reutlingen.de/naturkundemuseum')
(-16, 1, 'https://hi.wikipedia.org/wiki/%E0%A4%9F%E0%A5%8D%E0%A4%AF%E0%A5%81%E0%A4%AC%E0%A4%BF%E0%A4%A8%E0%A5%8D%E0%A4%97%E0%A4%A8')
(-16, 1, 'https://kk.wikipedia.org/wiki/%D0%A2%D1%8E%D0%B1%D0%B8%D0%BD%D0%B3%D0%B5%D0%BD')
(-16, 1, 'https://www.teinacher.de/')
(-16, 1, 'http://schokowerkstatt.ritter-sport.de/?lang=en')
(-16, 1, 'https://www.awin1.com/cread.php?awinmid=11018&awinaffid=507025&ued=https%3A%2F%2Fwww.viator.com%2F')
(-16, 1, 'https://www.avantlink.com/click.php?tt=cl&merchant_id=e295c418-295a-447c-b265-734e25f82503&website_id=9ee28d0b-9d

In [82]:
def save_frontier(filename: str, frontier: list) -> None:
    """Write the frontier entries to *filename* as CSV, one entry per row.

    The file is overwritten and rows are written in the list's current order.

    Args:
        filename: Destination path.
        frontier: Sequence of entries; each entry is written as one row
            with one column per element.
    """
    # newline='' is what the csv module requires so that no extra blank
    # lines appear on platforms that translate '\n'; utf-8 matches the
    # encoding read_saved_frontier uses when loading the file again.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(frontier)


# Rearrange the list in place into a min-heap (smallest tuple, i.e. the
# highest score, at index 0) and persist it in that heap order.
heapq.heapify(new_prioritized_frontier)
save_frontier('new_prioritized_frontier.csv', new_prioritized_frontier)


In [85]:
# Show the visited domains and persist them, one domain per CSV row.
print(visited_domains)
save_set_to_csv('visited_domains.csv', visited_domains)

{'www.neckartalradweg-bw.de', 'www.nip.uni-tuebingen.de', 'ja.weather-forecast.com', 'zu.wikipedia.org', 'www.klinischeanatomie-tuebingen.de', 'placesofgermany.de', 'www.physik.uni-tuebingen.de', 'www.thefork.com', 'idw-online.de', 'tuebingen.rockyourlife.de', 'exchange.uni-tuebingen.de', 'institute-tue.ellis.eu', 'infoe-tuebingen.de', 'www.echo24.de', 'www.bundeswahlleiterin.de', 'webmail.uni-tuebingen.de', 'cvml.tuebingen.ai', 'vep.wikipedia.org', 'feedbook.website', 'www.sino.uni-tuebingen.de', 'veggie-box-tuebingen.de', 'zea.wikipedia.org', 'www.wsi.uni-tuebingen.de', 'www.ns-akteure-in-tuebingen.de', 'jv.wikipedia.org', 'no.wikipedia.org', 'movein-uni-tuebingen.moveonnet.eu', 'idb.ub.uni-tuebingen.de', 'www.institutsgeschichten.khi.uni-tuebingen.de', 'www.jura.uni-tuebingen.de', 'www.qype.com', 'zh.weather-forecast.com', 'www.cvjm-tuebingen.de', 'tomsuehr.com', 'www.stay22.com', 'www.bgu-tuebingen.de', 'allevents.in', 'www.evolaemp.uni-tuebingen.de', 'www.tat.physik.uni-tuebingen.