In [1]:
!pip install geopy unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [2]:
import pandas as pd
import os
import re
import time
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import unidecode

Merging the LLM locations and rules locations

In [4]:
def agg_pages(series):
    return ','.join(sorted({str(x) for x in series if pd.notna(x)}))

def agg_mentions(series):
    return " || ".join(sorted({str(x).strip() for x in series if pd.notna(x)}))

def get_base_key(fname):
    # for match
    return (fname.lower()
            .replace('_locations.csv', '')
            .replace('_locations_llm_clean.csv', '')
            .replace('.csv', '')
            .replace(' ', '')
           )

In [5]:
def merge_rule_llm_pair(rules_csv, llm_csv, out_csv):
    import pandas as pd

    # RULES
    df_rules = pd.read_csv(rules_csv)
    if not {'filename', 'page', 'mention', 'location'}.issubset(df_rules.columns):
        print(f"Skipping {rules_csv}: missing required columns")
        return
    # Label as rules
    df_rules = df_rules[['filename', 'page', 'mention', 'location']].copy()
    df_rules["source"] = "rules"

    # LLM
    df_llm = pd.read_csv(llm_csv)
    if not {'filename', 'page', 'mention', 'location'}.issubset(df_llm.columns):
        print(f"Skipping {llm_csv}: missing required columns")
        return
    df_llm = df_llm[['filename', 'page', 'mention', 'location']].copy()
    df_llm["source"] = "llm"

    # Combine
    df_all = pd.concat([df_rules, df_llm], ignore_index=True)

    # Group and Merge Sources
    df_merged = (
        df_all
        .groupby(['filename', 'location'])
        .agg({
            'page': agg_pages,
            'mention': agg_mentions,
            'source': lambda s: ','.join(sorted(set(s))),
        })
        .reset_index()
        .rename(columns={'page': 'pages', 'mention': 'mentions'})
    )


    df_merged.to_csv(out_csv, index=False)
    print(f"Saved merged file: {out_csv}")
    return df_merged

In [6]:
def merge_all_rule_llm_files(rules_folder, llm_folder, out_folder):
    os.makedirs(out_folder, exist_ok=True)
    rules_files = {get_base_key(f): f for f in os.listdir(rules_folder) if f.endswith("_locations.csv")}
    llm_files = {get_base_key(f): f for f in os.listdir(llm_folder) if "_llm_clean.csv" in f.lower()}
    matched = rules_files.keys() & llm_files.keys()
    print(f"RULES base keys: {list(rules_files.keys())}")
    print(f"LLM base keys: {list(llm_files.keys())}")
    print(f"Found {len(matched)} matching base keys to merge.")
    for key in matched:
        rules_path = os.path.join(rules_folder, rules_files[key])
        llm_path = os.path.join(llm_folder, llm_files[key])
        out_path = os.path.join(out_folder, f"{key}_merged.csv")
        merge_rule_llm_pair(rules_path, llm_path, out_path)

In [25]:
merge_all_rule_llm_files("/content/drive/MyDrive/Data4_rules_csv", "/content/drive/MyDrive/Data_LLM_CSV_clean", "/content/drive/MyDrive/Data4_merged2")


RULES base keys: ['2013_funyufunyu', '2015_masurel_phd', '2013_peters', '2014_msc_yossi', '2009_bontlenkuna_0605886p_honoursreport', '2010_mohale_gisinterpretationofneburkinafaso', '2007_tshibubudze_themarkoyefault_2007', '2013_ramabulana_sadiolahillpetrology', '2010_matsheka_irvinfinalthesis', '2011_peters_eastmarkoye_2011', '2008_matabane_fe3', '2012_simoko_petrology,geochemistryandstructureofthepissilabatholithandthesaabazonegneiss', '2015_lebrun_siguiri', '2011_woolfe_thestratigraphyandmetamorphicfaciesofthekemb']
LLM base keys: ['2010_mohale_gisinterpretationofneburkinafaso', '2012_simoko_petrology,geochemistryandstructureofthepissilabatholithandthesaabazonegneiss', '2009_bontlenkuna_0605886p_honoursreport', '2007_tshibubudze_themarkoyefault_2007', '2010_matsheka_irvinfinalthesis', '2014_msc_yossi', '2015_masurel_phd', '2013_ramabulana_sadiolahillpetrology', '2015_lebrun_siguiri', '2008_matabane_fe3', '2013_funyufunyu', '2013_peters', '2011_woolfe_thestratigraphyandmetamorphicfaci

Cleaning the merged files:

Removing authors due to in text citation

Removing generic location names e.g. study area, basin

Categorizing location by type: GPS,PLACE and Approximate


In [7]:
def is_citation(mention, loc):
    # Looks for patterns like "Ledru et al.", "(Taylor, 1990)", etc.
    return bool(re.search(rf"\b{re.escape(loc)}\b.*et al\.?", mention, re.IGNORECASE)) or \
           bool(re.search(rf"\({re.escape(loc)}, \d{{4}}\)", mention))


In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")
def is_person_entity(loc):
    doc = nlp(loc)
    return any(ent.label_ == "PERSON" for ent in doc.ents)


In [9]:
def is_author_like(loc, mention):
    loc_clean = loc.strip()
    if is_citation(mention, loc_clean):
        return True
    if is_person_entity(loc_clean):
        return True
    return False


In [10]:
def filter_out_authors(df):
    # Expects 'location' and 'mentions' columns
    mask = df.apply(lambda row: not is_author_like(row['location'], row['mentions']), axis=1)
    return df[mask].reset_index(drop=True)

In [11]:
GENERIC_LOCATION_WORDS = [
    "study area", "area of study", "area of investigation", "study region", "study site",
    "study location", "area", "region", "zone", "locality", "localities",
    "the area", "the region", "investigation area", "site", "sites"
]

def remove_generic_location_words(location):
    """
    Removes generic location words/phrases (case-insensitive) from start or end of a location string.
    """
    if not isinstance(location, str):
        return location
    s = location.strip()
    # Build a regex to match generic words at start or end (with optional spaces/punctuation)
    pattern = r"^(%s)\b[\s,:\-]*|[\s,:\-]*(%s)$" % (
        "|".join(map(re.escape, GENERIC_LOCATION_WORDS)),
        "|".join(map(re.escape, GENERIC_LOCATION_WORDS))
    )
    s = re.sub(pattern, "", s, flags=re.IGNORECASE)
    # Remove double spaces, commas, etc.
    s = re.sub(r"\s+", " ", s)
    s = s.strip(",.:-; ")
    return s


In [12]:
def is_gps(s):
    # Simple patterns for decimal/sexagesimal degrees or UTM
    s = str(s).strip()
    # Simple decimal degree/UTM/Easting-Northing patterns:
    if re.match(r"^\d{5,}[-; ]\s*\d{5,}$", s):  # e.g., 0184836-1587581 or 1534836; 0216256
        return True
    # Matches "N 14 36 37 8 E 00 00 12 1" etc.
    if re.search(r"[NSEW]\s*\d+", s) and re.search(r"[EW]\s*\d+", s):
        return True
    # Matches "GPS" in string (sometimes extracted as "GPS 14 35 05 7 W 00 00 05 4")
    if s.upper().startswith("GPS"):
        return True
    return False


def is_approximate(loc):
    if pd.isna(loc): return False
    loc = str(loc).strip()
    # Match "NE of Burkina Faso", "southern part of Ghana", etc.
    direction = r"(?:N|S|E|W|NE|NW|SE|SW|North|South|East|West|Northern|Southern|Eastern|Western)"
    if re.match(rf"^{direction}(\s+part)?\s+of\s+.+", loc, flags=re.IGNORECASE):
        return True
    if re.match(rf"^.+\b({direction})\b", loc, flags=re.IGNORECASE):
        return True
    return False

def is_geounit(s):
    geo_unit_keywords = [
        "belt", "craton", "basin", "shear zone", "fault", "batholith", "terrane", "inlier",
        "province", "region", "mine", "group", "supergroup", "pluton", "complex", "gneiss", "channel", "graben","domain"
    ]
    s = s.lower().strip()
    # Don't count plain country names as geounit
    return any(kw in s for kw in geo_unit_keywords)

def classify_location_type(loc):
    loc = str(loc)
    if is_gps(loc):
        return "GPS"
    elif is_approximate(loc):
        return "APPROXIMATE"
    elif is_geounit(loc):
        return "GEOLOGICAL_UNIT"
    elif len(loc) < 3:
        return "UNGEOCODED"
    else:
        return "PLACE"


While geocoding the approximate locations, the base area is extracted and treated as the main location but the geocode will indicate it will be approximate location

In [13]:
def extract_base_place(location):
    """
    Attempt to extract the most likely base location from a regional phrase.
    - Strips out directional modifiers and keeps the likely toponyms.
    - Works best for phrases like "eastern part of X", "near Y", etc.
    """
    # 1. Try 'of ...' or 'in ...'
    match = re.search(r'of ([A-Za-z \-\’\'éèêàôîïç]+)$', location)
    if match:
        return match.group(1).strip()
    match = re.search(r'in ([A-Za-z \-\’\'éèêàôîïç]+)$', location)
    if match:
        return match.group(1).strip()
    # 2. Remove phrases like 'part of', 'region of', 'border with'
    cleaned = re.sub(r'^(north|south|east|west|northern|southern|eastern|western|central|upper|lower|border|area|part|region|near|around|about|along)\s+of\s+', '', location, flags=re.I)
    cleaned = re.sub(r'^(north|south|east|west|northern|southern|eastern|western|central|upper|lower|border|area|part|region|near|around|about|along)\s+', '', cleaned, flags=re.I)
    cleaned = cleaned.strip(",.;:()[] ")

    # 3. If there are coordinates, skip
    if re.search(r'\d{4,}', cleaned):
        return location.strip()

    # 4. Return the last capitalized chunk (if any)
    matches = re.findall(r'\b([A-Z][a-zA-Z\'\-éèêàôîïç]+(?: [A-Z][a-zA-Z\'\-éèêàôîïç]+)*)\b', cleaned)
    if matches:
        return matches[-1]

    return cleaned.strip()

Geocoding using GeoPy geocoders

Output will add on: Latitude, Longitude, Location address and geocode type(approximate)

If a location is not found then no-geocode result will be output

In [23]:
geolocator = Nominatim(user_agent="geo_coding_example")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)  # Respect OSM's rate limits!

# Geocode function with handling for failures
def geopy_geocode(loc):
    if not loc or loc.lower() in {"gps", "na", "no-geocode-result"}:
        return pd.Series(["no-geocode-result"] * 4)
    # Try full location first
    try:
        result = geocode(loc, language='en', addressdetails=True, timeout=10)
        if result:
            return pd.Series([result.latitude, result.longitude, result.address, "APPROXIMATE"])
    except Exception as e:
        print(f"Error geocoding '{loc}': {e}")
        # You can sleep here for rate limits if needed
        time.sleep(1)

    # Try base place extraction as fallback
    base = extract_base_place(loc)
    if base and base.lower() != loc.lower():
        try:
            result = geocode(base, language='en', addressdetails=True, timeout=10)
            if result:
                return pd.Series([result.latitude, result.longitude, result.address, "APPROXIMATE"])
        except Exception as e:
            print(f"Error geocoding fallback '{base}': {e}")
            time.sleep(1)

    return pd.Series(["no-geocode-result"] * 4)



In [15]:
def clean_location_csv_folder(input_folder, output_folder):
    import os
    os.makedirs(output_folder, exist_ok=True)
    for fname in os.listdir(input_folder):
        if not fname.lower().endswith(".csv"):
            continue
        print(f"Processing: {fname}")
        df = pd.read_csv(os.path.join(input_folder, fname))
        original_len = len(df)
        # Clean location column
        if 'location' in df.columns:
            df['location'] = df['location'].map(remove_generic_location_words)
        else:
            print(f"Skipping {fname}: no 'location' column")
            continue

        # Drop empty after cleaning
        df_cleaned = df[df['location'].astype(str).str.strip().astype(bool)].reset_index(drop=True)

        # Remove authors - check for 'location' and 'mentions'
        if {'location', 'mentions'}.issubset(df_cleaned.columns):
            df_cleaned = filter_out_authors(df_cleaned)

        # Classify location type
        if 'location' in df_cleaned.columns:
            df_cleaned['location_type'] = df_cleaned['location'].map(classify_location_type)
        else:
            df_cleaned['location_type'] = 'UNGEOCODED' # Or some other default

        print(f"Kept {len(df_cleaned)}/{original_len} locations after filtering.")
        # Save
        outpath = os.path.join(output_folder, fname.replace(".csv", "_final.csv"))
        df_cleaned.to_csv(outpath, index=False)
        print(f"Saved cleaned: {outpath}")

In [40]:
clean_location_csv_folder("/content/drive/MyDrive/Data4_merged2", "/content/drive/MyDrive/Data_merged_clean")

Processing: 2013_peters_merged.csv
Kept 223/288 locations after filtering.
Saved cleaned: /content/drive/MyDrive/Data_merged_clean/2013_peters_merged_final.csv
Processing: 2014_msc_yossi_merged.csv
Kept 214/269 locations after filtering.
Saved cleaned: /content/drive/MyDrive/Data_merged_clean/2014_msc_yossi_merged_final.csv
Processing: 2007_tshibubudze_themarkoyefault_2007_merged.csv
Kept 76/107 locations after filtering.
Saved cleaned: /content/drive/MyDrive/Data_merged_clean/2007_tshibubudze_themarkoyefault_2007_merged_final.csv
Processing: 2012_simoko_petrology,geochemistryandstructureofthepissilabatholithandthesaabazonegneiss_merged.csv
Kept 34/45 locations after filtering.
Saved cleaned: /content/drive/MyDrive/Data_merged_clean/2012_simoko_petrology,geochemistryandstructureofthepissilabatholithandthesaabazonegneiss_merged_final.csv
Processing: 2011_woolfe_thestratigraphyandmetamorphicfaciesofthekemb_merged.csv
Kept 44/58 locations after filtering.
Saved cleaned: /content/drive/MyD

In [26]:
def geocode_location_csv_folder(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    for fname in os.listdir(input_folder):
        if not fname.lower().endswith(".csv"):
            continue
        print(f"Geocoding: {fname}")
        df = pd.read_csv(os.path.join(input_folder, fname))
        if 'location' not in df.columns:
            print(f"SKIP (no location column): {fname}")
            continue
        # Clean generic location words (optional)
        df['location'] = df['location'].map(remove_generic_location_words)
        # Apply geopy_geocode, safe handling!
        geo_df = df['location'].apply(lambda loc: pd.Series(geopy_geocode(loc)))
        geo_df.columns = ['geocode_lat', 'geocode_lon', 'geocode_str', 'geocode_type']
        df = pd.concat([df, geo_df], axis=1)
        out_csv = os.path.join(output_folder, fname.replace(".csv", "_geocoded.csv"))
        df.to_csv(out_csv, index=False)
        print(f"Saved geocoded file: {out_csv}")


geocode_location_csv_folder("/content/drive/MyDrive/Data_merged_clean", "/content/drive/MyDrive/Data_geocoded")

Geocoding: 2013_peters_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2013_peters_merged_final_geocoded.csv
Geocoding: 2014_msc_yossi_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2014_msc_yossi_merged_final_geocoded.csv
Geocoding: 2007_tshibubudze_themarkoyefault_2007_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2007_tshibubudze_themarkoyefault_2007_merged_final_geocoded.csv
Geocoding: 2012_simoko_petrology,geochemistryandstructureofthepissilabatholithandthesaabazonegneiss_merged_final.csv


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/geocoders/base.py", line 368, in _call_geocoder
    result = self.adapter.get_json(url, timeout=timeout, headers=req_headers)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/adapters.py", line 472, in get_json
    resp = self._request(url, timeout=timeout, headers=headers)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/adapters.py", line 500, in _request
    raise AdapterHTTPError(
geopy.adapters.AdapterHTTPError: Non-successful status code 503

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 136, in _retries_gen
    yield i  # Run the function.
    ^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geop

Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2012_simoko_petrology,geochemistryandstructureofthepissilabatholithandthesaabazonegneiss_merged_final_geocoded.csv
Geocoding: 2011_woolfe_thestratigraphyandmetamorphicfaciesofthekemb_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2011_woolfe_thestratigraphyandmetamorphicfaciesofthekemb_merged_final_geocoded.csv
Geocoding: 2011_peters_eastmarkoye_2011_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2011_peters_eastmarkoye_2011_merged_final_geocoded.csv
Geocoding: 2015_masurel_phd_merged_final.csv


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/geocoders/base.py", line 368, in _call_geocoder
    result = self.adapter.get_json(url, timeout=timeout, headers=req_headers)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/adapters.py", line 472, in get_json
    resp = self._request(url, timeout=timeout, headers=headers)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/adapters.py", line 500, in _request
    raise AdapterHTTPError(
geopy.adapters.AdapterHTTPError: Non-successful status code 503

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 136, in _retries_gen
    yield i  # Run the function.
    ^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geop

Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2015_masurel_phd_merged_final_geocoded.csv
Geocoding: 2010_matsheka_irvinfinalthesis_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2010_matsheka_irvinfinalthesis_merged_final_geocoded.csv
Geocoding: 2009_bontlenkuna_0605886p_honoursreport_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2009_bontlenkuna_0605886p_honoursreport_merged_final_geocoded.csv
Geocoding: 2010_mohale_gisinterpretationofneburkinafaso_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2010_mohale_gisinterpretationofneburkinafaso_merged_final_geocoded.csv
Geocoding: 2015_lebrun_siguiri_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2015_lebrun_siguiri_merged_final_geocoded.csv
Geocoding: 2013_funyufunyu_merged_final.csv
Saved geocoded file: /content/drive/MyDrive/Data_geocoded/2013_funyufunyu_merged_final_geocoded.csv
Geocoding: 2008_matabane_fe3_merged

Problems from geocode duplicate places that are repeated e.g. Sadiola goldfield, Sadiola, Sadiola mine etc how to collapse plus retain mentions and pages

Mapping visualization for checking but needs improvement for better interface what user can input etc

should show location, mentions and page in the text

In [16]:
import folium


df = pd.read_csv("/content/drive/MyDrive/Data4_merged_clean_geocoded/2007_tshibubudze_themarkoyefault_2007_merged_final_geocoded.csv")


df = df[df['geocode_lat'].apply(lambda x: str(x).replace('.','',1).isdigit())]
df['geocode_lat'] = df['geocode_lat'].astype(float)
df['geocode_lon'] = df['geocode_lon'].astype(float)


center_lat = df['geocode_lat'].mean()
center_lon = df['geocode_lon'].mean()

m = folium.Map(location=[center_lat, center_lon], zoom_start=6)

for i, row in df.iterrows():

    popup_text = (
        f"<b>Location:</b> {row.get('location_clean', '')}<br>"
        f"<b>Context:</b> {row.get('mentions', row.get('mention', ''))}<br>"
        f"<b>Geocode:</b> {row.get('geocode_str', '')}<br>"
        f"<b>Pages:</b> {row.get('pages', '')}"
    )
    folium.Marker(
        location=[row['geocode_lat'], row['geocode_lon']],
        popup=folium.Popup(popup_text, max_width=350),
        tooltip=row.get('location_clean', '')
    ).add_to(m)

# Save map to HTML and display
m.save("locations_map.html")
m