# Location names

Some location names are long, sometimes very long, and often contain redundant
information such as the region, country code and coordinates which can easily
be obtained from the related models. Long names also take up a lot of screen 
space, which creates a lot of visual clutter:

![Examples of long location names](./images/long-location-names.png "Some examples of long location names")

This notebook analyses the names to see if there are any options to filter them
on loading to remove the redundant information.

## History

2025-05-01 Initial version with statistics on location names

In [None]:
from django.db import connection
from django.db.models.functions import Length
from django.template.loader import render_to_string

from IPython.display import display, HTML

import matplotlib.pyplot as plt

from data.models import Location

In [None]:
# The fifty locations with the shortest names, rendered as an HTML table.
by_length = Location.objects.annotate(length=Length("name"))
shortest = by_length.order_by("length")[:50]

table = render_to_string(
    "notebooks/location-names.html",
    {"locations": shortest},
)
display(HTML(table))

In [None]:
# The fifty locations with the longest names, rendered as an HTML table.
by_length = Location.objects.annotate(length=Length("name"))
longest = by_length.order_by("-length")[:50]

table = render_to_string(
    "notebooks/location-names.html",
    {"locations": longest},
)
display(HTML(table))

In [None]:
# Distribution of name lengths, counted in bins five characters wide
# (1-5, 6-10, etc.). The binning is done by the database: assuming integer
# division, lengths 1-5 get index 0, lengths 6-10 get index 1, and so on.
with connection.cursor() as cursor:
    cursor.execute(
        "select count(*), ((length(name) - 1) / 5)::int as quantile from data_location group by quantile"
    )

    # Each row is (count, bin index). Sort on the bin index so the bars
    # are in order of increasing name length.
    result = sorted(cursor.fetchall(), key=lambda t: t[1])

# Number of locations in each bin.
data: list[int] = [count for count, _ in result]

# Each bar is positioned at the upper bound of its bin (5, 10, 15, ...),
# so the x values are integers rather than strings.
labels: list[int] = [(index + 1) * 5 for _, index in result]

fig, ax = plt.subplots()
fig.set_figwidth(15)
ax.bar(labels, data, width=4)
ax.set_xlabel("Name length ranges, e.g. 1-5, 6-10, etc.")
ax.set_ylabel("Number of locations")
ax.set_title('Distribution of location name lengths')
plt.show();

In [None]:
# Location names, particularly private or one-time locations, end with the
# coordinates. Count how many hotspot and private (non-hotspot) locations
# have such a name, and how many do not.

# One or two digits (latitude) or one to three digits (longitude), a period
# or comma as the decimal point, then 2-6 decimal places.
latitude = r"-?\d{1,2}[.,]\d{2,6}"
longitude = r"-?\d{1,3}[.,]\d{2,6}"

# The pair, optionally wrapped in round brackets, at the end of the name.
coordinates = r"\(?%s, ?%s\)?$" % (latitude, longitude)

# The labels are in the same order as the values below, and are used for
# both the printed summary and the pie chart.
labels = ["Hotspots with", "Hotspots without", "Private with", "Private without"]

values = [
    Location.objects.filter(hotspot=True, name__regex=coordinates).count(),
    Location.objects.filter(hotspot=True).exclude(name__regex=coordinates).count(),
    Location.objects.filter(hotspot=False, name__regex=coordinates).count(),
    Location.objects.filter(hotspot=False).exclude(name__regex=coordinates).count(),
]

for label, value in zip(labels, values):
    print(f"{label}:", value)

fig, ax = plt.subplots()
# The trailing semicolon stops the notebook echoing the return value.
ax.pie(values, labels=labels, autopct='%1.f%%');

In [None]:
# Of the location names that end with coordinates, how many have the
# country code directly in front of the coordinates?
from data.models import Location

# A period or comma as the decimal point, followed by 3-5 decimal places.
latitude_regex = r"-?\d{1,2}[.,]\d{3,5}"
longitude_regex = r"-?\d{1,3}[.,]\d{3,5}"

# The coordinates, in round brackets, at the end of the name.
coordinates_regex = r"\(%s, ?%s\)$" % (latitude_regex, longitude_regex)

# How many locations match this pattern?
with_coordinates = Location.objects.filter(name__regex=coordinates_regex)
total = with_coordinates.count()

print("Locations ending in coordinates:", total)

# The same, but preceded by a comma, a space, the country code and a space.
country_code = "PT"
country_coordinates_regex = r", %s %s$" % (country_code, coordinates_regex)

# How many locations match this pattern?
with_country = Location.objects.filter(name__regex=country_coordinates_regex)
total = with_country.count()

print("Locations ending with country code and coordinates:", total)
print()

# What are the differences? Print every name that ends with coordinates
# but does not have the country code in front of them.
names = {location.name for location in with_country}

for location in with_coordinates:
    if location.name not in names:
        print(location.name)

# The majority of the differences are names where the country code has the
# name of the region as a suffix. For example:
#
# PT-Lisboa
# PT-Centro
# PT-Azores
#
# Note that the suffix may be either the English name (Azores) or the
# Portuguese name (Norte), and both forms occur, e.g. PT-Lisbon and PT-Lisboa.
# Based on the other part of the name, this pattern appears to occur most
# often when people from other countries are visiting. For example "Lissabon"
# is the German and Dutch name for the district or city. The region name can
# also contain accented characters which are not Portuguese, e.g. PT-Madère.


In [None]:
# Do location names ending with coordinates, also contain the country code and the name of the region?
from data.models import Location

# A period or comma as the decimal point, followed by 3-5 decimal places.
latitude_regex = r"-?\d{1,2}[.,]\d{3,5}"
longitude_regex = r"-?\d{1,3}[.,]\d{3,5}"

# The coordinates, in round brackets, at the end of the name.
coordinates_regex = r"\(%s, ?%s\)$" % (latitude_regex, longitude_regex)

# How many locations match this pattern?
with_coordinates = Location.objects.filter(name__regex=coordinates_regex)
total = with_coordinates.count()

print("Locations ending in coordinates:", total)

# The country code, optionally followed by a hyphen and the region name,
# then a space and the coordinates.
country_code = "PT"
region_suffix = r"-\w+"
country_region_regex = r"%s(%s)? %s$" % (country_code, region_suffix, coordinates_regex)

# How many locations match this pattern?
with_region = Location.objects.filter(name__regex=country_region_regex)
total = with_region.count()

print("Locations ending with country code, optional region, and coordinates:", total)
print()

# What is the difference? Print every name that ends with coordinates but
# does not have the country code, with or without region, in front of them.
names = {location.name for location in with_region}

for location in with_coordinates:
    if location.name not in names:
        print(location.name)

In [None]:
# The differences are:
# - the name of the country is used instead of the country code, e.g. Portugal instead of PT
# - the name is from a different country, e.g. Viana do Castelo, ES !!!
# - the entire name only contains the coordinates, e.g. (37,616, -7,959)
# - there is no space between the end of the name and the coordinates, e.g. PT - Entre Selvagem Grande e Pequena(30.1060,-15.9545)
# - there are two spaces before the coordinates, e.g. Parada  (41,264, -6,888)
# - there is a hyphen before the coordinates, e.g. Localidade Desconhecida - (38,306, -8,245)
#
# The observer editing the name, before saving the location, explains most of these.
# Since the variations are potentially endless, any cleanup should stick to the basics
# then the remainder can be edited manually.

In [None]:
# From the above analysis, how can locations be cleaned up?
#
# 1. Remove the ", Country Code-[Region] Coordinates"
# 2. Assume commas are path separators - remove the last element
# 3. Remove any descriptors, e.g. "(acesso condicionado)"
# 4. Truncate names longer than 80 characters
# 5. Success???
#
# Step 2 only removes the last element if the count equals three, as some of
# the really long names contain commas giving the list of freguesias.
# Truncating the string will take care of those.

In [35]:
# How well do the following "rules" work?

# Write the results to a file because the number of locations 
# is large and scrolling in notebooks is completely broken.

import re

from data.models import Location

# Latitude: optional minus sign, one or two digits (intended for +/- 90
# degrees), a comma or period for the decimal point, followed by 2-5
# decimal places.
latitude_regex = r"-?\d{1,2}[.,]\d{2,5}"

# Longitude: optional minus sign, one to three digits (intended for +/- 180
# degrees), a comma or period for the decimal point, followed by 2-5
# decimal places.
longitude_regex = r"-?\d{1,3}[.,]\d{2,5}"

# Latitude and longitude, separated by a comma and an optional space,
# optionally surrounded by round brackets, and preceded by an optional
# comma and a space. The initial part of the name is captured in a
# group for the next step. The word boundary stops the greedy .* from
# swallowing the optional comma, which would otherwise be left at the
# end of the captured group.
#
# The brackets are matched with a plain optional \( and \) so that the
# leading part of the name is the only capture group and nothing but a
# bracket is accepted around the coordinates.
#
# NOTE: because of the word boundary, a name where the character in front
# of the separator is not a letter, digit or underscore, for example
# "Name - (38,306, -8,245)", does not match and is left as it is.
coordinates_regex = r"^(.*)\b,? \(?%s, ?%s\)?$" % (latitude_regex, longitude_regex)

# Country code (hard-wired to PT), preceded by a comma and a space.
# The initial part of the name is captured in a group for the next step.
country_regex = r"^(.*), PT$"

# Country code (hard-wired to PT), followed by the region name,
# which might be in any language. The initial part of the name
# is captured in a group for the next step. The capture is lazy and
# followed by an optional comma, so that both "Name PT-Region" and
# "Name, PT-Region" are reduced to "Name" without a trailing comma.
region_regex = r"^(.*?),? PT-\w+$"


def remove_coordinates(name: str) -> str:
    """Return name without the coordinates at the end.

    A name that does not match coordinates_regex is returned unchanged.
    """
    # The pattern is anchored at both ends, so the substitution either
    # replaces the whole name with the captured leading part or does nothing.
    return re.sub(coordinates_regex, r"\1", name)


def remove_country(name: str) -> str:
    """Return name without the country code at the end.

    A name that does not match country_regex is returned unchanged.
    """
    # The pattern is anchored at both ends, so the substitution either
    # replaces the whole name with the captured leading part or does nothing.
    return re.sub(country_regex, r"\1", name)


def remove_region(name: str) -> str:
    """Return name without the country code and region name at the end.

    A name that does not match region_regex is returned unchanged.
    """
    # The pattern is anchored at both ends, so the substitution either
    # replaces the whole name with the captured leading part or does nothing.
    return re.sub(region_regex, r"\1", name)


def remove_state(name: str) -> str:
    """Drop the last of three comma-separated elements from name.

    Only names with exactly two commas, i.e. three elements, are changed.
    Any other name is returned as it is.
    """
    if name.count(",") != 2:
        return name
    # Everything in front of the last comma.
    return name.rpartition(",")[0]


def truncate(name: str) -> str:
    """Limit name to 80 characters.

    A name of up to 80 characters is returned unchanged. A longer name is
    cut to its first 77 characters followed by "...", so the result,
    including the ellipsis, is exactly 80 characters long.
    """
    if len(name) > 80:
        # 77 + 3 for the ellipsis keeps the result within the limit.
        name = name[:77] + "..."
    return name


def remove_access(name: str) -> str:
    """Return name without a trailing " (acesso condicionado)" descriptor.

    Only the descriptor at the very end of the name is removed. The same
    text anywhere else in the name is kept, and a name that does not end
    with the descriptor is returned unchanged.
    """
    return name.removesuffix(" (acesso condicionado)")


def generate_byname(name) -> str:
    """Return a cleaned-up version of name, or "" if nothing changed.

    The clean-up steps are applied one after the other, each working on
    the result of the previous one. Truncation is done last so it is
    applied to the name after everything else has been removed.
    """
    steps = (
        remove_coordinates,
        remove_country,
        remove_region,
        remove_state,
        remove_access,
        truncate,
    )

    cleaned = name
    for step in steps:
        cleaned = step(cleaned)

    # An empty string signals that the name did not need any cleaning.
    if cleaned == name:
        return ""
    return cleaned


# Write the original names to one file and the names as displayed to
# another, one per line and in the same order, so the two files can be
# compared side by side.
#
# The encoding is given explicitly since the names contain accented
# characters and the platform default encoding might not be able to
# represent them.
#
# NOTE(review): both files are opened in append mode, so running this cell
# again adds a second copy of every name - confirm that is intended.
with open('results/locations-original.txt', 'a', encoding="utf-8") as fpo:
    with open('results/locations-cleaned.txt', 'a', encoding="utf-8") as fpc:

        for location in Location.objects.all():
            # The byname is only set on the object in memory, it is not
            # saved here. Presumably display_name() returns the byname
            # when it is set - verify against the Location model.
            location.byname = generate_byname(location.name)

            fpo.write("%s\n" % location.name)
            fpc.write("%s\n" % location.display_name())

print("Location bynames generated")

Location bynames generated


In [None]:
# How many locations contain the city, county or region name repeated, e.g. ..., Lisboa, Lisboa, PT

import re

from data.models import Location

# The whole byname must be a single word, followed by a comma and a space,
# followed by the same word again (the \1 back-reference).
pattern = r"^(\w+), \1$"

# NOTE(review): this filters on the byname stored in the database, so it
# relies on the bynames having been saved - confirm where that happens.
queryset = Location.objects.filter(byname__regex=pattern)

print(f"Repeated: {queryset.count()}\n")

for location in queryset:
    print(location.byname)