# Named Entity Recognition (NER) for LULCC Texts
Extract geographic regions, organizations, and custom land use and land cover change (LULCC) terms from abstracts.


## Import Libraries


In [45]:
import spacy
from spacy.language import Language
from spacy.matcher import Matcher, PhraseMatcher
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
import pycountry

In [None]:
# Shared spaCy pipeline (small English model). The cells below use it to parse
# texts and to build the phrase-matcher patterns.
nlp = spacy.load("en_core_web_sm")



# Land-cover vocabulary fed to the "LAND_COVER" phrase matcher.
# Entries are lowercase phrases; the grouping comments are for readers only.
LAND_COVER_TERMS = [
    # forests
    "forest",
    "woodland",
    "rainforest",
    "boreal forest",
    "temperate forest",
    "tropical forest",
    "mangrove",
    "taiga",
    "jungle",
    "cloud forest",
    # agriculture
    "cropland",
    "farmland",
    "agricultural land",
    "arable land",
    "pasture",
    "grazing land",
    "rice paddy",
    "irrigated land",
    "plantation",
    "vineyard",
    "orchard",
    "olive grove",
    # grass / shrub
    "grassland",
    "savanna",
    "steppe",
    "prairie",
    "shrubland",
    "bushland",
    "tundra",
    # wetlands
    "wetland",
    "marsh",
    "swamp",
    "bog",
    "fen",
    "peatland",
    # built-up and industrial
    "urban area",
    "built-up area",
    "industrial area",
    "transportation area",
    "residential area",
    "commercial area",
    "mining area",
    # bare / frozen
    "bare ground",
    "desert",
    "sand dune",
    "rocky area",
    "glacier",
    "permafrost",
    # water
    "water body",
    "lake",
    "river",
    "reservoir",
    "lagoon",
]

# Change-process vocabulary fed to the "CHANGE" phrase matcher.
# Entries are base-form verbs or short verb phrases, all lowercase.
CHANGE_VERBS = [
    # magnitude
    "increase",
    "decrease",
    "expand",
    "shrink",
    "grow",
    "decline",
    "rise",
    "fall",
    # conversion between classes
    "convert",
    "transform",
    "replace",
    "turn into",
    "change to",
    "shift to",
    "transition to",
    "give way to",
    "make way for",
    # condition
    "degrade",
    "restore",
    "reclaim",
    "rehabilitate",
    "regenerate",
    "deteriorate",
    "improve",
    "enhance",
    "damage",
    "fragment",
    "fragmentize",
    # land-use actions
    "clear",
    "log",
    "cultivate",
    "irrigate",
    "abandon",
    "afforest",
    "reforest",
    "deforest",
    "urbanize",
    "develop",
    "mine",
    "drain",
    "flood",
]

# One name per record in pycountry.countries (the record's `name` attribute).
# NOTE(review): these are presumably official-style names, so everyday
# spellings used in papers may not be covered -- confirm against the corpus.
COUNTRIES = [country.name for country in pycountry.countries]
# Supra-national region names matched alongside COUNTRIES by the geo matcher.
GLOBAL_REGIONS = [
    # Africa
    "Sub-Saharan Africa",
    "North Africa",
    "East Africa",
    "West Africa",
    "Central Africa",
    "Southern Africa",
    "Sahel",
    "Sahara",
    # Asia
    "Middle East",
    "Western Asia",
    "Central Asia",
    "South Asia",
    "Southeast Asia",
    "East Asia",
    # Americas
    "North America",
    "Central America",
    "Caribbean",
    "South America",
    "Amazon",
    "Andes",
    "Patagonia",
    # Europe
    "Europe",
    "Western Europe",
    "Eastern Europe",
    "Northern Europe",
    "Southern Europe",
    "Balkans",
    "Scandinavia",
    # Polar regions and Oceania
    "Arctic",
    "Antarctica",
    "Oceania",
    "Australasia",
    "Melanesia",
    "Micronesia",
    "Polynesia",
]


In [47]:

# Phrase matchers for the domain vocabularies.
#
# attr="LOWER" compares lowercased token text, so matching is case-insensitive
# and the pattern Docs only need to be tokenized. The phrases are therefore
# passed through the tokenizer alone instead of the full pipeline: the matches
# are the same, the setup is faster, and spaCy no longer has reason to warn
# about needlessly parsed pattern Docs.
# NOTE(review): because the comparison is on token text, inflected forms such
# as "increased" are presumably not hit by the base-form CHANGE_VERBS -- confirm
# whether that is intended.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("LAND_COVER", list(nlp.tokenizer.pipe(LAND_COVER_TERMS)))
matcher.add("CHANGE", list(nlp.tokenizer.pipe(CHANGE_VERBS)))

# Separate matcher for place names so geography hits can be handled on their own.
geo_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
geo_matcher.add("GEO", list(nlp.tokenizer.pipe(COUNTRIES + GLOBAL_REGIONS)))


# Step 3: CUSTOM NLP PIPELINE


In [None]:

def extract_rules(text):
    """Extract LULC findings ("rules") from one text.

    The text is parsed with the module-level ``nlp`` pipeline. Three kinds of
    findings are collected, in this order:

    * ``percentage_change`` -- a number-like token whose three-token window
      (the token itself plus the next two) contains a ``%`` character;
    * ``land_cover`` and ``change_process`` -- phrases hit by ``matcher``.

    Geographies are the GPE/LOC entities of the parsed text plus every
    ``geo_matcher`` hit; when none is found the single placeholder
    ``"Global"`` is used. Each finding is emitted once per geography, so the
    result has (findings x geographies) entries.

    Args:
        text: The raw text to analyse.

    Returns:
        A list of dicts with the keys ``type``, ``term``, ``value``,
        ``geography`` and ``context``. ``context`` is the text of a token
        window reaching up to five tokens before the finding.
    """
    doc = nlp(text)
    doc_len = len(doc)

    # Where: model entities first, then vocabulary hits.
    places = {ent.text.strip() for ent in doc.ents if ent.label_ in ("GPE", "LOC")}
    places.update(doc[lo:hi].text.strip() for _, lo, hi in geo_matcher(doc))
    if not places:
        places = {"Global"}

    found = []

    def record(kind, term, value, context):
        # The same finding is repeated for every geography seen in the text.
        found.extend(
            {
                'type': kind,
                'term': term,
                'value': value,
                'geography': place,
                'context': context,
            }
            for place in places
        )

    # What changed by how much: numbers with a nearby percent sign.
    # NOTE(review): the window also accepts a "%" two tokens after the number,
    # in which case the reported value pairs this number with a percent sign
    # that may belong to another token -- confirm this is acceptable.
    for tok in doc:
        if not tok.like_num:
            continue
        if "%" not in doc[tok.i:tok.i + 3].text:
            continue
        window = doc[max(0, tok.i - 5):min(doc_len, tok.i + 5)].text
        record('percentage_change', '', f"{tok.text}%", window)

    # Which cover classes / processes are mentioned.
    for match_id, lo, hi in matcher(doc):
        is_land_cover = nlp.vocab.strings[match_id] == "LAND_COVER"
        window = doc[max(0, lo - 5):min(doc_len, hi + 5)].text
        record(
            'land_cover' if is_land_cover else 'change_process',
            doc[lo:hi].text,
            '',
            window,
        )

    return found


def process_row(row):
    """Run ``extract_rules`` over the text columns of one record.

    The columns ``title``, ``abstract`` and ``sections`` are processed in that
    order; a column that is absent or null is skipped. Every extracted rule is
    tagged with the record's ``authors`` value (empty string when absent), the
    column it came from and the record's ``name``.

    Args:
        row: A record supporting ``.get``, item access and ``.name`` (the
            callers in this file pass rows from ``DataFrame.iterrows``).

    Returns:
        The list of tagged rule dicts for the record; empty when no column
        produced a rule.
    """
    collected = []
    for column in ('title', 'abstract', 'sections'):
        if pd.isna(row.get(column)):
            continue
        found = extract_rules(str(row[column]))
        for rule in found:
            rule['authors'] = row.get('authors', '')
            rule['source_column'] = column
            rule['original_row'] = row.name
        collected += found
    return collected


In [49]:
# ================== CSV PROCESSING ==================

def process_csv(input_path, output_path):
    """Extract rules from every row of a CSV and write them to another CSV.

    Each row of ``input_path`` is handed to ``process_row``; the resulting
    rules are written to ``output_path`` (without the index) and a short
    summary plus the first ten rules are printed.

    Args:
        input_path: CSV file to read.
        output_path: CSV file to write; it always starts with the header
            ``original_row, authors, source_column, type, term, value,
            geography, context``.
    """
    df = pd.read_csv(input_path)

    all_rules = []
    for _, row in df.iterrows():
        all_rules.extend(process_row(row))

    cols = ['original_row', 'authors', 'source_column', 'type', 'term', 'value', 'geography', 'context']
    # Always impose the full column list. When no rule is extracted the frame
    # has no columns at all; filtering `cols` by what is present would then
    # write a header-less file that pd.read_csv cannot load later.
    results = pd.DataFrame(all_rules).reindex(columns=cols)
    results.to_csv(output_path, index=False)
    print(f"✅ Processed {len(df)} rows — Extracted {len(results)} rules")
    print(results.head(10).to_string())

In [50]:

# Entry point: extract rules from the full extracted-text CSV and write them
# to a CSV in the current working directory.
if __name__ == "__main__":
    process_csv(
        "/Users/rehamjamal/Desktop/ARENA 2025/extracted_text/extracted_data_full.csv",
        "final_lulc_rules_with_geography.csv",
    )


✅ Processed 118 rows — Extracted 97806 rules
   original_row                                       authors source_column               type           term   value geography                                                 context
0             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract  percentage_change                 12.77%   Thimphu       observed a significant increase (12.77%) in built
1             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract  percentage_change                 52.88%   Thimphu                      up area from 2002 (52.88%) to 2018
2             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract  percentage_change                  65.5%   Thimphu                            %) to 2018 (65.5%), followed
3             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract  percentage_change                 15.25%   Thimphu  forest cover declined drastically (15.25%) followed by
4             0  Sonam Wang; Lamchi

In [51]:
import pandas as pd
import plotly.express as px

# Read back the rules written by the extraction step.
df = pd.read_csv("final_lulc_rules_with_geography.csv")

# Number of rules per distinct geography value.
geo_counts = (
    df.groupby("geography")
    .size()
    .reset_index(name="count")
)

# World map coloured by rule count.
# NOTE(review): locations are interpreted as country names, but the geography
# column also holds other values (extract_rules emits cities such as "Thimphu"
# and the placeholder "Global"); those presumably are not drawn -- confirm.
choropleth_options = dict(
    locations="geography",
    locationmode="country names",
    color="count",
    title="LULC Mentions by Country/Region",
    color_continuous_scale="YlGnBu",
)
fig = px.choropleth(geo_counts, **choropleth_options)
fig.show()


In [None]:
import pandas as pd
import networkx as nx
from pathlib import Path
from pyvis.network import Network
import webbrowser

# 1. Load the extracted rules
df = pd.read_csv('/Users/rehamjamal/Desktop/ARENA 2025/notebook/final_lulc_rules_with_geography.csv')

# 2. Transform the rules into edge records
print("\n=== Input Data Sample ===")
print(df.head())

network_data = []
for _, row in df.iterrows():
    kind = row['type']
    # Percentage rows carry their payload in 'value', the other rows in 'term'.
    payload = row['value'] if kind == 'percentage_change' else row['term']
    if pd.isna(payload):
        # A missing payload would otherwise become a NaN node in the graph.
        continue
    payload = str(payload)

    if kind == 'change_process':
        network_data.append({
            'source': payload,
            'target': 'LAND_COVER_CHANGE',
            'label': payload
        })
    elif kind == 'land_cover':
        network_data.append({
            'source': 'LAND_COVER',
            'target': payload,
            'label': payload
        })
    elif kind == 'percentage_change':
        network_data.append({
            'source': 'QUANT_CHANGE',
            'target': payload,
            'label': payload
        })

# 3. Build the directed graph
G = nx.DiGraph()
for record in network_data:
    # Label each node with its own identifier. Reusing the record's label for
    # both endpoints would make the hub nodes (LAND_COVER, LAND_COVER_CHANGE,
    # QUANT_CHANGE) display whichever term happened to be processed last.
    for endpoint in (record['source'], record['target']):
        G.add_node(endpoint, label=str(endpoint))
    G.add_edge(record['source'], record['target'])

print(f"\nNodes: {G.nodes()}\nEdges: {G.edges()}")

# 4. Interactive visualization
nt = Network(
    height="750px",
    width="100%",
    directed=True,
    notebook=False,
    cdn_resources='remote',
    select_menu=True,
    filter_menu=True
)

# Force physics configuration
nt.set_options("""
{
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.01,
      "springLength": 100
    },
    "minVelocity": 0.75,
    "solver": "forceAtlas2Based"
  }
}
""")

nt.from_nx(G)
output_path = 'lulc_network.html'
nt.write_html(output_path)

# 5. Verification steps
print("\n=== Verification ===")
print(f"HTML file generated: {output_path}")
print(f"Node count: {len(G.nodes())}")
print(f"Edge count: {len(G.edges())}")

# Open the result in the default browser. An absolute file:// URI is used so
# the browser does not depend on the working directory; webbrowser.open
# reports failure through its return value, so that is checked as well as the
# module's own exception type.
html_uri = Path(output_path).resolve().as_uri()
try:
    opened = webbrowser.open(html_uri, new=2)
except webbrowser.Error:
    opened = False

if opened:
    print("Opened in browser. If blank, try:")
    print("1. Right-click -> 'Open with' another browser")
    print("2. Check developer console (F12) for errors")
else:
    print(f"Manually open: {html_uri}")


=== Input Data Sample ===
   original_row                                       authors source_column  \
0             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract   
1             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract   
2             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract   
3             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract   
4             0  Sonam Wang; Lamchin Munkhnasan; Woo-Kyun Lee      abstract   

                type term   value geography  \
0  percentage_change  NaN  12.77%   Thimphu   
1  percentage_change  NaN  52.88%   Thimphu   
2  percentage_change  NaN   65.5%   Thimphu   
3  percentage_change  NaN  15.25%   Thimphu   
4  percentage_change  NaN   1.01%   Thimphu   

                                             context  
0  observed a significant increase (12.77%) in built  
1                 up area from 2002 (52.88%) to 2018  
2                       %) to 2018 (65.5%), follow

In [2]:
# Full Clean and Complete NER Pipeline for LULCC Domain

import spacy
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy.util import filter_spans
import pandas as pd
import re
from tqdm import tqdm
import pycountry

# -----------------------------
# 1. Load spaCy Model & Add EntityRuler
# -----------------------------
# Load the small English pipeline and insert a rule-based entity matcher.
# The ruler is placed ahead of the statistical "ner" component.
# NOTE(review): presumably so that rule-based entities are set first and the
# statistical NER works around them -- confirm against the spaCy docs.
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")

# -----------------------------
# 2. Clean Text Function (merges title, abstract, sections)
# -----------------------------
def clean_text(row):
    """Merge a record's text columns into one cleaned string.

    The non-null values of ``title``, ``abstract`` and ``sections`` (those
    that exist in ``row``) are joined with spaces, then:

    1. ``{...}`` groups are replaced by a space;
    2. leftover ``'#text':``, ``'@xmlns':`` and ``'key':`` fragments are
       dropped;
    3. every run of characters other than ASCII letters, digits, space,
       ``.``, ``%``, ``°`` and ``²`` is replaced by a space;
    4. whitespace is collapsed and the result is stripped.

    ``²`` is kept so that area units such as ``km²`` survive cleaning; the
    SURFACE_UNIT entity pattern defined in this file looks for that exact
    token and could never match if the superscript were stripped.

    Args:
        row: A mapping-like record (dict or pandas Series).

    Returns:
        The cleaned text; an empty string when no column has content.
    """
    parts = [
        str(row[col])
        for col in ("title", "abstract", "sections")
        if col in row and pd.notnull(row[col])
    ]
    text = " ".join(parts)
    text = re.sub(r"\{.*?\}", " ", text)
    text = re.sub(r"'#text':", '', text)
    text = re.sub(r"'@xmlns':", '', text)
    text = re.sub(r"'\w+':\s*", ' ', text)
    text = re.sub(r"[^a-zA-Z0-9 .%°²]+", ' ', text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# -----------------------------
# 3. Load LULC Vocabularies from CSV
# -----------------------------
def _read_word_column(csv_path):
    """Return the distinct non-null values of the ``word`` column of a CSV."""
    words = pd.read_csv(csv_path)["word"]
    return words.dropna().unique().tolist()


# Land use / land cover terms and land-cover process terms, one vocabulary per file.
lulc_voc = _read_word_column("/Users/rehamjamal/Desktop/ARENA 2025/data/LULC.csv")
lulc_process = _read_word_column("/Users/rehamjamal/Desktop/ARENA 2025/data/LCprocess.csv")

# -----------------------------
# 4. Define GEO Vocabulary (Countries + Regions)
# -----------------------------
# Place-name vocabulary: pycountry's country names plus a short list of
# continents and sub-regions, de-duplicated through a set (so the order of
# geo_vocab is not meaningful).
COUNTRIES = [entry.name for entry in pycountry.countries]
GLOBAL_REGIONS = [
    "Africa",
    "Sub-Saharan Africa",
    "North America",
    "South America",
    "Asia",
    "Southeast Asia",
    "Europe",
    "Middle East",
    "Oceania",
    "Western Africa",
    "Eastern Africa",
    "Northern Africa",
]
geo_vocab = list({*COUNTRIES, *GLOBAL_REGIONS})

# -----------------------------
# 5. Pattern Helpers
# -----------------------------
def format_patterns(vocab_list, label):
    """Build case-insensitive EntityRuler patterns for a vocabulary.

    Each entry is lowercased and split on whitespace; every resulting word
    becomes one ``{"LOWER": word}`` token spec. Lowercasing matters: ``LOWER``
    is compared against the lowercased token text, so a capitalized entry
    such as ``"France"`` would otherwise never match.

    Args:
        vocab_list: Iterable of words or phrases.
        label: Entity label assigned to every pattern.

    Returns:
        A list of ``{"label": ..., "pattern": [...]}`` dicts: all single-word
        patterns first, then all multi-word patterns, each group in input
        order. Entries without any word (empty or whitespace-only) are
        skipped.
    """
    # NOTE(review): whitespace splitting presumably does not mirror the
    # tokenizer for entries containing punctuation (e.g. "Sub-Saharan
    # Africa"); such entries may still fail to match -- confirm.
    single_word, multi_word = [], []
    for entry in vocab_list:
        words = entry.lower().split()
        if not words:
            continue
        pattern = {"label": label, "pattern": [{"LOWER": word} for word in words]}
        if len(words) == 1:
            single_word.append(pattern)
        else:
            multi_word.append(pattern)
    return single_word + multi_word

def create_pattern_sw(list_of_words, label):
    """Build EntityRuler patterns for the single-word entries of a vocabulary.

    Multi-word entries are ignored. Every remaining word yields two patterns,
    one on the exact token text and one on the token lemma.

    Args:
        list_of_words: Iterable of words or phrases.
        label: Entity label assigned to every pattern.

    Returns:
        A list with all ``TEXT`` patterns first, followed by all ``LEMMA``
        patterns, each group in input order.
    """
    singles = [word for word in list_of_words if len(word.split()) == 1]
    patterns = []
    for attribute in ("TEXT", "LEMMA"):
        patterns.extend(
            {"label": label, "pattern": [{attribute: word}]} for word in singles
        )
    return patterns

# -----------------------------
# 6. Add Core Patterns to the EntityRuler
# -----------------------------
# Hand-written patterns for quantities, coordinates and change words.
#
# Every REGEX is anchored with ^...$: spaCy applies REGEX predicates with
# search semantics, so an unanchored "gain" also hits "against", "loss" hits
# "glossary", "\d{1,3}" hits "2018" and "[NSEW]" hits any token containing one
# of those capitals.
base_patterns = [
    # A number followed by an area unit, e.g. "120 ha".
    {"label": "SURFACE_UNIT", "pattern": [
        {"TEXT": {"REGEX": r"^\d+(?:[.,]\d+)*$"}},
        {"LOWER": {"IN": ["km²", "ha", "hectares", "acres", "kilometers"]}}
    ]},
    # A coordinate range such as "27 ° N – 28 ° N". The dash token is optional
    # because clean_text in this file replaces dashes with spaces before the
    # text reaches the pipeline.
    {"label": "COORDINATES", "pattern": [
        {"TEXT": {"REGEX": r"^\d{1,3}(?:\.\d+)?$"}}, {"TEXT": "°"},
        {"TEXT": {"REGEX": r"^[NSEW]$"}}, {"TEXT": {"IN": ["–", "-"]}, "OP": "?"},
        {"TEXT": {"REGEX": r"^\d{1,3}(?:\.\d+)?$"}}, {"TEXT": "°"},
        {"TEXT": {"REGEX": r"^[NSEW]$"}}
    ]},
    # Change words by lemma ...
    *[{"label": "CHANGE", "pattern": [{"LEMMA": lemma}]} for lemma in ["increase", "decrease", "loss", "gain", "expand"]],
    # ... and by surface form, listing the common inflections explicitly so the
    # match does not depend on the lemmatizer.
    {"label": "CHANGE", "pattern": [{"LOWER": {"REGEX": (
        r"^(?:increas(?:e|es|ed|ing)|decreas(?:e|es|ed|ing)|loss(?:es)?"
        r"|gain(?:s|ed|ing)?|expansions?|expand(?:s|ed|ing)?)$"
    )}}]}
]

# Register everything with the EntityRuler in one call: the hand-written
# patterns first, then the place names, then the single-word LULC and
# LULC-process vocabularies.
ruler_patterns = [
    *base_patterns,
    *format_patterns(geo_vocab, "GEO"),
    *create_pattern_sw(lulc_voc, "LULC"),
    *create_pattern_sw(lulc_process, "LULC_process"),
]
ruler.add_patterns(ruler_patterns)

# -----------------------------
# 7. NER Function (includes multiword and lemmatized matching)
# -----------------------------
# Expanded multi-word phrase lists, keyed by the vocabulary contents, so the
# vocabulary is lemmatized once instead of on every extract_entities call.
_MULTIWORD_PHRASE_CACHE = {}


def _multiword_phrases(vocab):
    """Return the multi-word entries of *vocab* followed by their lemmatized forms."""
    key = tuple(vocab)
    phrases = _MULTIWORD_PHRASE_CACHE.get(key)
    if phrases is None:
        surface = [w for w in vocab if len(w.split()) > 1]
        lemmatized = [" ".join(token.lemma_ for token in nlp(w)) for w in surface]
        # A phrase is often identical to its lemmatized form; drop repeats
        # while keeping first-seen order.
        phrases = list(dict.fromkeys(surface + lemmatized))
        _MULTIWORD_PHRASE_CACHE[key] = phrases
    return phrases


def _phrase_spans(doc, phrases, label):
    """Return labelled spans for every token-aligned occurrence of *phrases* in *doc*."""
    spans = []
    for phrase in phrases:
        for match in re.finditer(re.escape(phrase), doc.text):
            # char_span yields None when the match does not start and end on
            # token boundaries, which discards hits inside longer words.
            span = doc.char_span(*match.span())
            if span is not None:
                spans.append(Span(doc, span.start, span.end, label=label))
    return spans


def extract_entities(text):
    """Run the pipeline on *text* and return its entities as a DataFrame.

    On top of the entities produced by ``nlp`` (statistical NER plus the
    EntityRuler patterns), multi-word entries of ``lulc_process`` and
    ``lulc_voc`` are located by literal, case-sensitive search of the text, in
    both their original and lemmatized spelling, and added with the labels
    ``LULC_process`` and ``LULC``. Overlapping candidates are resolved with
    ``filter_spans``.

    Args:
        text: The (cleaned) document text.

    Returns:
        A DataFrame with one row per final entity and the columns ``value``
        (entity text) and ``entity_label``.
    """
    doc = nlp(text)

    # Candidate order is kept as pipeline entities, then process phrases, then
    # LULC phrases.
    candidates = list(doc.ents)
    candidates.extend(_phrase_spans(doc, _multiword_phrases(lulc_process), "LULC_process"))
    candidates.extend(_phrase_spans(doc, _multiword_phrases(lulc_voc), "LULC"))

    # Final span filtering
    doc.ents = filter_spans(candidates)
    rows = [(ent.text, ent.label_) for ent in doc.ents]
    return pd.DataFrame(rows, columns=["value", "entity_label"])

# -----------------------------
# 8. Apply Pipeline to Full CSV
# -----------------------------
df_docs = pd.read_csv("/Users/rehamjamal/Desktop/ARENA 2025/extracted_text/extracted_data_full.csv")
df_docs["cleaned_text"] = df_docs.apply(clean_text, axis=1)

# Collect the per-document frames and concatenate once at the end; calling
# pd.concat inside the loop copies the accumulated frame on every iteration.
entity_frames = []
for idx, row in tqdm(df_docs.iterrows(), total=len(df_docs)):
    ents = extract_entities(row["cleaned_text"])
    ents["doc_id"] = idx  # row index of the source document
    entity_frames.append(ents)

if entity_frames:
    all_entities = pd.concat(entity_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the expected columns so the
    # filtering step that follows still works on an empty input.
    all_entities = pd.DataFrame(columns=["value", "entity_label", "doc_id"])

# -----------------------------
# 9. Save Land Use and Land Cover Entities
# -----------------------------
# Keep only the land use / land cover labels, drop exact duplicate rows, sort
# by entity text and write the result.
lulc_entities = all_entities[all_entities["entity_label"].isin(["LULC", "LULC_process"])]
lulc_entities_path = "/Users/rehamjamal/Desktop/ARENA 2025/notebook/lulc_entities.csv"
lulc_entities.drop_duplicates().sort_values(by="value").to_csv(lulc_entities_path, index=False)

# Report the path that was actually written (the earlier message named a
# different directory).
print(f"✅ All done! Extracted entities saved to {lulc_entities_path}")


100%|██████████| 118/118 [01:43<00:00,  1.14it/s]

✅ All done! Extracted entities saved to data/lulc_entities.csv



