In [None]:
import pandas as pd

In [None]:
# Read the phrase list: semicolon-separated, no header row, so the columns
# get integer labels (0, 1, ...) that later cells index by.
data = pd.read_csv(
    'dates.csv',
    sep=';',
    header=None,
)
data

In [None]:
# Extra hand-written phrases appended to the frame, one new row per phrase.
# Re-running this cell appends the same phrases again.
# NOTE(review): a one-element row only fits a single-column frame, yet later
# cells read data.iloc[0][1] -- confirm the actual column layout of dates.csv.
extra_phrases = [
    "od 21 lutego aż do dnia 29 lutego",
    "21 luty 22 luty 23 luty",
    "od dnia 21 lutego do 29 lutego",
    "lipca i sierpnia rok 2025",
]
for phrase in extra_phrases:
    # len(data) is the next free integer label, so each assignment adds a row.
    data.loc[len(data)] = [phrase]
data

In [None]:
import spacy


In [None]:
# Load the `pl_core_news_lg` spaCy pipeline once; the cells below reuse `nlp`
# for tagging, entity recognition and as the vocabulary source for the matchers.
nlp = spacy.load("pl_core_news_lg")



In [None]:
# For every phrase in column 0: print the text, then one line per token
# (text, POS tag, entity type) and one line per recognised entity.
for phrase in data[0]:
    doc = nlp(phrase)
    print("--- " + doc.text)
    for token in doc:
        print(token.text, token.pos_, token.ent_type_)
    for entity in doc.ents:
        print(
            "++ ",
            entity.text,
            "+++",
            entity.start_char,
            entity.end_char,
            entity.label_,
            entity.start,
        )


In [None]:
# Month-name stems shared by every pattern below.
# spaCy applies REGEX predicates with re.search, so the expression is anchored
# at both ends: without the anchors any token merely *containing* a stem
# (e.g. "komar", "salut") would be accepted. `\w*` (instead of `[a-z]*`) lets
# the inflected ending contain Polish diacritics, as in "września".
# Nouns that merely start with a stem still match; POS/ENT_TYPE narrow that down.
MONTH_REGEX = r"^(?:stycz|lut|mar|kwie|maj|czerw|lip|sierp|wrze|październik|listopad|grud)\w*$"

# Token-level Matcher patterns for date ranges / enumerations.
patterns = [
    # "<day>? <month> (-|do|i) <day>? <month>": both month names are required,
    # the leading day numbers are optional; the first month must be tagged as a date entity.
    [
        {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "?"},
        {"POS": "NOUN", "ENT_TYPE": "date", "LOWER": {"REGEX": MONTH_REGEX}},
        {"LOWER": {"IN": ["-", "do", "i"]}},
        {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "?"},
        {"POS": "NOUN", "LOWER": {"REGEX": MONTH_REGEX}},
    ],
    # "<day> <month>? (-|do|i) <day> <month>?": both day numbers are required,
    # the month names are optional; the first day must be tagged as a date entity.
    [
        {"POS": {"IN": ["ADJ", "NUM"]}, "ENT_TYPE": "date"},
        {"POS": "NOUN", "OP": "?", "LOWER": {"REGEX": MONTH_REGEX}},
        {"LOWER": {"IN": ["-", "do", "i"]}},
        {"POS": {"IN": ["ADJ", "NUM"]}},
        {"POS": "NOUN", "OP": "?", "LOWER": {"REGEX": MONTH_REGEX}},
    ],
]

# Single date mention: an optional preceding date-entity token followed by a month noun.
date_patterns = [
    [
        {"ENT_TYPE": "date", "OP": "?"},
        {"POS": "NOUN", "ENT_TYPE": "date", "LOWER": {"REGEX": MONTH_REGEX}},
    ],
]

In [None]:
from spacy.matcher import Matcher

# Apply the single-date patterns to every phrase in column 0 and print each
# matched span under the phrase it came from.
matcher = Matcher(nlp.vocab)
matcher.add("DatePhrase", date_patterns)
for phrase in data[0]:
    doc = nlp(phrase)
    print("--- " + doc.text)
    # Each match is (match_id, start_token, end_token); only the span is shown.
    for _, span_start, span_end in matcher(doc):
        print(doc[span_start:span_end])


In [None]:
from spacy.matcher import Matcher

# Apply the date-range patterns to every phrase in column 0 and print each
# matched span under the phrase it came from.
matcher = Matcher(nlp.vocab)
matcher.add("DatePhrase", patterns)
for phrase in data[0]:
    doc = nlp(phrase)
    print("--- " + doc.text)
    # Each match is (match_id, start_token, end_token); only the span is shown.
    for _, span_start, span_end in matcher(doc):
        print(doc[span_start:span_end])


In [None]:
from transformers import GPT2Tokenizer

In [None]:
# Tokenise column 1 of the first row with the 'gpt2' tokenizer and print
# every token id decoded on its own line.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
enc = tokenizer(data.iloc[0][1])

token_ids = enc['input_ids']
for token_id in token_ids:
    print(tokenizer.decode(token_id))

In [None]:
from transformers import BertTokenizer

In [None]:
# Tokenise column 1 of the first row with the 'bert-base-uncased' tokenizer
# and print every token id decoded on its own line.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer(data.iloc[0][1])

token_ids = enc['input_ids']
for token_id in token_ids:
    print(tokenizer.decode(token_id))


In [None]:
from transformers import AutoTokenizer


In [None]:
# Tokenise column 1 of the first row with the 'flax-community/papuGaPT2'
# tokenizer and print every token id decoded on its own line.
tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')
enc = tokenizer(data.iloc[0][1])

token_ids = enc['input_ids']
for token_id in token_ids:
    print(tokenizer.decode(token_id))


In [None]:
# Tokenise column 1 of the first row with the
# 'dkleczek/bert-base-polish-uncased-v1' tokenizer, show the raw encoding,
# then print every token id decoded on its own line.
tokenizer = AutoTokenizer.from_pretrained('dkleczek/bert-base-polish-uncased-v1')
enc = tokenizer(data.iloc[0][1])
print(enc)

token_ids = enc['input_ids']
for token_id in token_ids:
    print(tokenizer.decode(token_id))


In [None]:
# Tokenise column 1 of the first row with the 'allegro/herbert-base-cased'
# tokenizer and print every token id decoded on its own line.
tokenizer = AutoTokenizer.from_pretrained('allegro/herbert-base-cased')
enc = tokenizer(data.iloc[0][1])

token_ids = enc['input_ids']
for token_id in token_ids:
    print(tokenizer.decode(token_id))


In [None]:
from transformers import XLMTokenizer

In [None]:
# Tokenise column 1 of the first row with the
# 'allegro/herbert-klej-cased-tokenizer-v1' tokenizer (loaded through
# XLMTokenizer) and print every token id decoded on its own line.
tokenizer = XLMTokenizer.from_pretrained('allegro/herbert-klej-cased-tokenizer-v1')
enc = tokenizer(data.iloc[0][1])

token_ids = enc['input_ids']
for token_id in token_ids:
    print(tokenizer.decode(token_id))


In [None]:
from creating_trip.point_of_interest.poi_from_osm_selectors import selectors
from OSMPythonTools.overpass import Overpass
from OSMPythonTools.nominatim import Nominatim

In [None]:
# Resolve a region through Nominatim and keep its first result; the
# following cell uses `overpass` and `region_name` to build the POI query.
overpass = Overpass()
nominatim = Nominatim()
region = nominatim.query("Europa, Italy")

# toJSON() may hand back a list of candidates; only the first one is used.
region_data = region.toJSON()
region_data = region_data[0] if isinstance(region_data, list) else region_data

region_name = region_data.get("name")
print(region_data)
# Last expression: the latitude of the result, displayed as a float.
float(region_data.get("lat"))

In [None]:
# Build one Overpass query: select the area by name, union one `nwr` clause
# per (key, value) selector inside it, then output the elements and their members.
query_str = "".join([
    f'area["name"="{region_name}"]->.searchArea;(',
    *(f'nwr["{selector[0]}"="{selector[1]}"](area.searchArea);' for selector in selectors),
    ');out body;>;out skel;',
])
res = overpass.query(query_str)
res.toJSON()


In [None]:
from creating_trip.point_of_interest.poi_from_osm_selectors import selectors
from OSMPythonTools.overpass import Overpass
from OSMPythonTools.nominatim import Nominatim
from models.mongo_utils import MongoUtils
from creating_trip.point_of_interest.mappings_for_OSM import determine_kinds

In [None]:
def save_pois(region: str, country: str):
    """Fetch points of interest for ``region`` from OpenStreetMap and store them in MongoDB.

    The region is resolved with Nominatim, every ``(key, value)`` pair in
    ``selectors`` is queried through Overpass inside the area carrying the
    resolved name, and one document per usable element is inserted into the
    attractions collection named ``"<country>-<region>"`` (both lower-cased).

    An element is skipped when it has no tags, no ``name`` tag, no type or id,
    when ``determine_kinds`` returns nothing for its tags, or when no
    coordinates can be found for it.

    Args:
        region: Region name passed to Nominatim; also used in the collection name.
        country: Country name; only used in the collection name.

    Raises:
        IndexError: If Nominatim returns an empty result list for ``region``.
    """
    overpass = Overpass()
    nominatim = Nominatim()

    region_data = nominatim.query(region).toJSON()
    if isinstance(region_data, list):
        if not region_data:
            # Same exception type the bare ``[0]`` used to raise, but with a usable message.
            raise IndexError(f"Nominatim returned no result for region {region!r}")
        region_data = region_data[0]

    region_name = region_data.get("name")
    # NOTE(review): the area is selected by its name tag only -- presumably
    # same-named areas elsewhere are searched too; confirm this is acceptable.
    selector_clauses = "".join(
        f'nwr["{selector[0]}"="{selector[1]}"](area.searchArea);' for selector in selectors
    )
    query_str = f'area["name"="{region_name}"]->.searchArea;({selector_clauses});out body;>;out skel;'
    res = overpass.query(query_str)

    db_connection = MongoUtils()
    collection = db_connection.get_collection_attractions(f"{country.lower()}-{region.lower()}")
    # ``or []`` keeps the loop safe if the response carries no "elements" key.
    for element in res.toJSON().get("elements") or []:
        tags = element.get("tags")
        osm_type = element.get("type")
        osm_id = element.get("id")
        if tags is None or tags.get("name") is None or osm_type is None or osm_id is None:
            continue

        # Cheap tag-based filter first, so elements that would be discarded
        # anyway never trigger the per-element Nominatim request below.
        # NOTE(review): assumes determine_kinds only inspects ``tags`` -- confirm.
        kinds = determine_kinds(tags)
        if len(kinds) == 0:
            continue

        osm_ref = f"{osm_type}/{osm_id}"
        lon = element.get("lon")
        lat = element.get("lat")
        if lon is None or lat is None:
            # The element carries no coordinates of its own: look it up by OSM reference.
            lookup = nominatim.query(osm_ref, lookup=True).toJSON()
            if isinstance(lookup, list):
                if len(lookup) == 0:
                    continue
                lookup = lookup[0]
            lon = lookup.get("lon")
            lat = lookup.get("lat")
            if lon is None or lat is None:
                continue
        lon = float(lon)
        lat = float(lat)

        collection.insert_one({
            # First letter of the element type, upper-cased, followed by the id.
            "xid": f"{osm_type[0].upper()}{osm_id}",
            "name": tags.get("name"),
            "osm": osm_ref,
            "wikidata": tags.get("wikidata"),
            "wikipedia": tags.get("wikipedia"),
            "point": {
                "lon": lon,
                "lat": lat
            },
            "opening_hours": tags.get("opening_hours"),
            "url": tags.get("website"),
            "kinds": kinds
        })


In [None]:
# Import the points of interest for Gdańsk; save_pois writes them to the
# attractions collection named "poland-gdańsk" (lower-cased country-region).
save_pois("Gdańsk", "Poland")

In [None]:
from humanized_opening_hours import OHParser
import re

In [None]:
# One line of the rendered week description: "<day>: HH:MM - HH:MM" or "<day>: closed".
# Groups: 1 = day name, 2 = opening time, 3 = closing time, 4 = the literal "closed".
# Raw string so that \w, \s and \d reach the regex engine without triggering
# Python's invalid-escape-sequence warning.
day_regex = re.compile(r"^(\w+):\s*(?:(\d\d:\d\d)\s*-\s*(\d\d:\d\d)|(closed))$")

opening_hours = "We-Sa 09:30-15:00; Tu 12:00-18:30"
parser = OHParser(opening_hours)
description = parser.render().plaintext_week_description()
week = description.split("\n")
print(week)
for day in week:
    match = day_regex.match(day)
    if match is None:
        # The line does not fit the single-range pattern (possibly a blank line
        # or a day with several periods -- not verified); report it instead of
        # failing with AttributeError on None.
        print(f"unparsed: {day!r}")
        continue
    # Opening time of the day, or None when the day is closed.
    print(match.group(2))
