In [1]:
!pip install geotext

Collecting geotext
  Downloading geotext-0.4.0-py2.py3-none-any.whl.metadata (2.5 kB)
Downloading geotext-0.4.0-py2.py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geotext
Successfully installed geotext-0.4.0


In [2]:
import re
import io
import csv
import os
import pandas as pd
from typing import List, Tuple
from collections import defaultdict
import spacy
from geotext import GeoText

In [3]:
def text_to_sentence_paragraphs(text, sentences_per_paragraph=6):
    # Basic sentence splitter
    sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    paragraphs = []
    for i in range(0, len(sentences), sentences_per_paragraph):
        para = " ".join(sentences[i:i+sentences_per_paragraph])
        paragraphs.append(para)
    return paragraphs

def txt_file_to_page_paragraphs(txt_path, sentences_per_paragraph=6):
    """
    Reads a .txt file with "--- Page N ---" markers,
    returns a list of dicts: {"filename", "page", "paragraph"}
    """
    filename = os.path.basename(txt_path)
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Split by page marker
    page_blocks = re.split(r'\n?--- Page (\d+) ---\n', text)
    out = []
    # page_blocks[0] is any text before the first page (often empty)
    for i in range(1, len(page_blocks), 2):
        page_num = int(page_blocks[i])
        page_text = page_blocks[i+1]
        paragraphs = text_to_sentence_paragraphs(page_text, sentences_per_paragraph)
        for para in paragraphs:
            if para.strip():
                out.append({
                    "filename": filename,
                    "page": page_num,
                    "paragraph": para.strip()
                })
    return out

def folder_txts_to_page_paragraphs(folder_path, sentences_per_paragraph=6):
    """
    For all .txt files in folder, returns a list of dicts:
    filename, page, paragraph
    """
    all_paragraphs = []
    for fname in os.listdir(folder_path):
        if fname.lower().endswith(".txt"):
            file_path = os.path.join(folder_path, fname)
            out = txt_file_to_page_paragraphs(file_path, sentences_per_paragraph)
            all_paragraphs.extend(out)
    return all_paragraphs

In [4]:
COMPASS = {"N","S","E","W","NE","NW","SE","SW"}
DENY_SINGLE = {
    "al.", "et", "al", "date", "university", "université", "ministere", "orpailleur", "brgm",
    "thièblemont", "yacouba"
}
DENY_TITLES = {"university", "declaration", "preamble", "chapter", "figure", "table"}
DENY_PHRASE_PREFIXES = (
    "geologists at", "in this region", "various geological features",
    "between the towns", "the region", "the desert", "the capital city"
)

def looks_like_location_token(t: str) -> bool:
    s = t.strip()
    if not s: return False
    if len(s) <= 2: return False
    if s.upper() in COMPASS: return False
    if s.lower() in DENY_SINGLE: return False
    if s.endswith("."): return False
    if re.fullmatch(r"[^\w]+", s): return False
    return True

def looks_like_location_phrase(t: str) -> bool:
    s = re.sub(r"\s+", " ", t.strip())
    if not looks_like_location_token(s):
        return False
    low = s.lower()
    if low in DENY_TITLES: return False
    if any(low.startswith(pref) for pref in DENY_PHRASE_PREFIXES): return False
    if s.isupper() and " " not in s and not re.search(r"[-’']", s):
        return False
    words = s.split()
    if len(words) >= 2:
        titleish = sum(w[:1].isupper() for w in words) >= 1
        if not titleish: return False
    return True


Extractor 1: Using Spacy

Loads spaCy’s small english model and extracts entities with labels GPE, LOC, FAC (cities/countries/regions + many sites like mines).

In [5]:
def spacy_extract_locations(paragraphs):
    nlp = spacy.load("en_core_web_sm")
    out = []
    for item in paragraphs:
        filename = item['filename']
        page = item['page']
        para = item['paragraph']
        doc = nlp(para)
        for ent in doc.ents:
            if ent.label_ in {"GPE", "LOC", "FAC"}:
                loc_text = ent.text.strip()
                if looks_like_location_token(loc_text):
                    out.append({
                        "filename": filename,
                        "page": page,
                        "mention": para,
                        "location": loc_text,
                        "label": ent.label_,
                        "source": "spacy"
                    })
    return out


Extractor 2: Using Geotext

In [6]:
def geotext_extract_locations(paragraphs):
    """
    For each paragraph dict (filename, page, paragraph),
    extract city/country mentions using GeoText.
    Output: list of dicts with filename, page, mention (the paragraph), location, and label ("GEO").
    """
    out = []
    for item in paragraphs:
        filename = item['filename']
        page = item['page']
        para = item['paragraph']
        places = GeoText(para)
        # Combine all found locations
        locs = set(list(places.cities) + list(places.countries))
        for loc in locs:
            if looks_like_location_token(loc):   # Use your custom filter!
                out.append({
                    "filename": filename,
                    "page": page,
                    "mention": para,
                    "location": loc,
                    "label": "GEO",
                    "source": "geotext"
                })
    return out


Extractor 3: REGEX

In [8]:
def regex_geo_spans(text: str) -> List[Tuple[int, int, str, str, str]]:
    """
    Find geographic phrases using regex patterns.

    Matches three main cases:
      1. Proper name(s) followed by a geo suffix (e.g. "Ashanti Belt", "West African Craton")
      2. Directional phrase + proper name (e.g. "West Africa", "Eastern Highlands")
      3. Proper name(s) + geo unit (e.g. "Essakane Mine", "Tarkwa Basin", "Pilbara Region")

    Returns a list of spans: (start_index, end_index, source, label, text)
    """

    # case-insensitive suffixes that define a geo phrase
    geo_suffixes = (
        r"(?i:"  # (?i:) = case-insensitive group
        r"Belt|Greenstone Belt|Craton|Basin|Shear Zone|Fault|Goldfield|Range|Desert|River|"
        r"Lake|Sea|Ocean|Gulf|Province|State|Region|Valley|Plateau|Peninsula|Archipelago|"
        r"Canyon|Strait|Channel|Highlands|Lowlands|Orogeny|Greenstone"
        r")"
    )

    # pattern 1: ProperName (+ ProperName …) + geo suffix
    pattern_suffix = re.compile(
        rf"""\b(?:[A-Z][\w’'-]*(?:\s+[A-Z][\w’'-]*){{0,5}})\s+{geo_suffixes}\b"""
    )

    # pattern 2: Direction word + ProperName (e.g. "West Africa")
    pattern_direction = re.compile(
        r"""\b(?:North|South|East|West|Northern|Southern|Eastern|Western)\s+"""
        r"""[A-Z][\w’'-]+(?:\s+[A-Z][\w’'-]+){0,3}\b"""
    )

    # pattern 3: ProperName + geo unit like Mine/Region/City
    pattern_unit = re.compile(
        r"""\b(?:[A-Z][\w’'-]+(?:\s+[A-Z][\w’'-]+){0,4})\s+"""
        r"""(?i:Mine|Mines|Goldmine|Goldfield|District|Province|Region|County|City|Town|Village)s?\b"""
    )

    # quick quality check: phrase must look like a proper title
    def looks_titlecase(phrase: str) -> bool:
        phrase = re.sub(r"\s+", " ", phrase.strip())
        tokens = phrase.split()
        if not tokens:
            return False
        # single token: must start uppercase
        if len(tokens) == 1:
            return tokens[0][0].isupper()
        # multiword: at least half tokens start uppercase
        uppercase_count = sum(tok[0].isupper() for tok in tokens)
        return uppercase_count >= max(2, len(tokens) // 2)

    spans = []
    for pattern in (pattern_suffix, pattern_direction, pattern_unit):
        for match in pattern.finditer(text):
            candidate = text[match.start():match.end()]
            if looks_titlecase(candidate) and looks_like_location_phrase(candidate):
                spans.append((match.start(), match.end(), "regex", "GEO", candidate))

    return spans


In [9]:
def regex_geo_extract_paragraphs(paragraphs):
    """
    For each paragraph dict (filename, page, paragraph),
    extract regex-matched locations.
    Output: list of dicts: filename, page, mention, location, label ("regex-GEO")
    """
    results = []
    for item in paragraphs:
        filename = item['filename']
        page = item['page']
        para = item['paragraph']
        spans = regex_geo_spans(para)
        for start, end, src, label, loc in spans:
            results.append({
                "filename": filename,
                "page": page,
                "mention": para,
                "location": loc,
                "label": "regex-GEO",
                "source": "regex"
            })
    return results


Cleaning, Deduplicating and Normalizing results from results of merging all 3 extractors.

In [10]:
import re
import csv
import io

LEADING_DROP = (
    r"the\b", r"preamble\b", r"conclusion\b", r"at\b"
)

# Compass modifiers like “NE-trending”, “NW-striking”, etc. to strip (case-insensitive)
LEADING_COMPASS = r"(?:\b[NS][EW]?\s*-(?:trending|striking)\b\s*)+"

# If any of these substrings appear, reject the candidate (case-insensitive)
REJECT_SUBSTRINGS = (
    "declaration", "journal", "research ", "figure", "table",
    "universite", "university", "ministere", "ministry",
    "geologists at", "capital city", "between the towns",
    "various geological features", "in this region"
)

# Single-word denials (exact, case-insensitive)
REJECT_SINGLE = {"harmattan", "north-east", "earth", "subcrop"}

# If present as a standalone token (case-sensitive), reject (common surnames/initials)
REJECT_TOKENS = {"Rogers", "Davis", "K.A.A"}

# Geo suffixes we consider meaningful for regex candidates
GEO_SUFFIX_WORDS = {
    "Belt","Greenstone Belt","Craton","Basin","Shear Zone","Fault","Goldfield",
    "Range","Desert","River","Lake","Sea","Ocean","Gulf","Province","State",
    "Region","Valley","Plateau","Peninsula","Archipelago","Canyon","Strait",
    "Channel","Highlands","Lowlands","Orogeny","Greenstone","Mine","Mines",
    "Goldmine","District","County","City","Town","Village"
}

def _titlecase_if_allcaps(s: str) -> str:
    # Keep acronyms with digits/short bits as-is; otherwise Title Case ALL-CAPS for display
    if s.isupper() and len(s) > 3 and not re.search(r"\d", s):
        return " ".join(w.capitalize() if len(w) > 1 else w for w in s.split())
    return s

def _strip_leading_noise(s: str) -> str:
    t = s.strip()
    # remove compass “NE-trending …” style sequences first
    t = re.sub(rf"^{LEADING_COMPASS}", "", t, flags=re.IGNORECASE)
    # strip determiners/section words like The/Preamble/Conclusion/At (possibly repeated)
    changed = True
    while changed:
        changed = False
        for key in LEADING_DROP:
            new_t = re.sub(rf"^{key}\s+", "", t, flags=re.IGNORECASE)
            if new_t != t:
                t = new_t
                changed = True
    return t.strip()

def _has_geo_suffix(s: str) -> bool:
    # true if ends with one of our geo suffix tokens (case-insensitive)
    for suf in GEO_SUFFIX_WORDS:
        if re.search(rf"\b{re.escape(suf)}\b$", s, flags=re.IGNORECASE):
            return True
    return False

def _looks_like_geo(s: str, source: str, label: str) -> bool:
    low = s.lower()
    if any(sub in low for sub in REJECT_SUBSTRINGS):
        return False
    toks = s.split()
    if len(toks) == 1 and low in REJECT_SINGLE:
        return False
    if any(tok in REJECT_TOKENS for tok in toks):
        return False
    # For regex candidates, strongly prefer a geo suffix (“… Fault”, “… Basin”, “… Region”, …)
    if source == "regex":
        return _has_geo_suffix(s)
    # For spaCy/GeoText, allow country/city/GPE tokens even without suffix
    # but require Title-ish casing (not all-lowercase).
    if s[0].islower():
        return False
    return True

def _norm_for_dedup_paragraph(loc: str) -> str:
    s = re.sub(r"\s+", " ", loc.strip())
    s = _strip_leading_noise(s)
    return s.lower()

def _clean_location_display_paragraph(s: str) -> str:
    s = re.sub(r"\s+", " ", s.strip())
    s = _strip_leading_noise(s)
    s = _titlecase_if_allcaps(s)
    return s

In [11]:
def dedup_paragraph_location_results_from_lists(*results_lists):
    """
    Accepts any number of result lists (e.g. spacy, geotext, regex), flattens, dedups, and returns
    list of dicts: filename, page, mention, location, label, source.
    """
    # Flatten all results into one list
    all_results = []
    for lst in results_lists:
        all_results.extend(lst)

    seen = set()
    rows = []
    for d in all_results:
        fname = d['filename']
        page = d['page']
        para = d['mention']
        loc = d['location']
        label = d['label']
        source = d.get('source', 'unknown')
        loc_disp = _clean_location_display_paragraph(loc)
        if not _looks_like_geo(loc_disp, source, label):
            continue
        key = (fname, _norm_for_dedup_paragraph(loc_disp))
        if key in seen:
            continue
        seen.add(key)
        rows.append({
            "filename": fname,
            "page": page,
            "mention": para,
            "location": loc_disp,
            "label": label,
            "source": source
        })
    return rows

In [12]:
def process_txt_folder_to_csvs(folder_path, out_csv_folder, sentences_per_paragraph=6):
    os.makedirs(out_csv_folder, exist_ok=True)
    all_paragraphs = folder_txts_to_page_paragraphs(folder_path, sentences_per_paragraph)
    # Group paragraphs by filename
    file_to_paragraphs = defaultdict(list)
    for p in all_paragraphs:
        file_to_paragraphs[p["filename"]].append(p)
    for fname, paragraphs in file_to_paragraphs.items():
        print(f"Processing {fname}...")
        spacy_results = spacy_extract_locations(paragraphs)
        geotext_results = geotext_extract_locations(paragraphs)
        regex_results = regex_geo_extract_paragraphs(paragraphs)
        deduped = dedup_paragraph_location_results_from_lists(
            spacy_results, geotext_results, regex_results
        )
        out_csv = os.path.join(out_csv_folder, fname.replace(".txt", "_locations.csv"))
        pd.DataFrame(deduped).to_csv(out_csv, index=False)
        print(f"Saved to {out_csv}, {len(deduped)} unique locations.")


In [14]:
process_txt_folder_to_csvs("/content/drive/MyDrive/Data_txt", "/content/drive/MyDrive/Data4_rules_csv")

Processing 2013_FUNYUFUNYU.txt...
Saved to /content/drive/MyDrive/Data4_rules_csv/2013_FUNYUFUNYU_locations.csv, 200 unique locations.
Processing 2015_Masurel_phd.txt...
Saved to /content/drive/MyDrive/Data4_rules_csv/2015_Masurel_phd_locations.csv, 319 unique locations.
Processing 2013_Peters.txt...
Saved to /content/drive/MyDrive/Data4_rules_csv/2013_Peters_locations.csv, 125 unique locations.
Processing 2014_MSc_YOSSI.txt...
Saved to /content/drive/MyDrive/Data4_rules_csv/2014_MSc_YOSSI_locations.csv, 109 unique locations.
Processing 2009_Bontle Nkuna_0605886P_Honours Report.txt...
Saved to /content/drive/MyDrive/Data4_rules_csv/2009_Bontle Nkuna_0605886P_Honours Report_locations.csv, 37 unique locations.
Processing 2010_Mohale_GIS interpretation of NE Burkina Faso.txt...
Saved to /content/drive/MyDrive/Data4_rules_csv/2010_Mohale_GIS interpretation of NE Burkina Faso_locations.csv, 51 unique locations.
Processing 2007_Tshibubudze_THE MARKOYE FAULT_2007.txt...
Saved to /content/driv