## Data Extraction and Feature Engineering

This script prepares raw BBC news text → structured dataset with extracted countries, cities, and nationalities for downstream analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import spacy


import pycountry

import geonamescache



In [None]:
# load in dataset
df_bbc = pd.read_csv('dataset/bbc_news.csv')
df_bbc.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [4]:
# drop title duplicates
df_bbc = df_bbc.drop_duplicates(subset='title')

In [5]:
df_bbc.shape

(39653, 5)

In [7]:
df_bbc.columns

Index(['title', 'pubDate', 'guid', 'link', 'description'], dtype='object')

In [8]:
df_bbc.tail()

Unnamed: 0,title,pubDate,guid,link,description
42110,Highlights: Wales make history in Dublin,"Tue, 03 Dec 2024 23:07:27 GMT",https://www.bbc.com/sport/football/videos/ckg1...,https://www.bbc.com/sport/football/videos/ckg1...,Watch highlights as Wales win 2-1 in Dublin fo...
42111,Gang jailed over £200m of cocaine in banana boxes,"Tue, 03 Dec 2024 17:48:54 GMT",https://www.bbc.com/news/articles/c3e8pvg284no#9,https://www.bbc.com/news/articles/c3e8pvg284no,More than two tonnes of the Class A drug was s...
42112,Scottish Budget presents huge challenges for SNP,"Tue, 03 Dec 2024 23:00:17 GMT",https://www.bbc.com/news/articles/ckg1m1j2grpo#9,https://www.bbc.com/news/articles/ckg1m1j2grpo,Finance Secretary Shona Robison is preparing t...
42113,Celebrations as Wales make history qualifying ...,"Wed, 04 Dec 2024 00:05:52 GMT",https://www.bbc.com/news/articles/c791nq8nl73o#9,https://www.bbc.com/news/articles/c791nq8nl73o,Wales defeated the Republic of Ireland 2-1 mak...
42114,School tells Muslim girls it’s ‘not safe’ for ...,"Wed, 04 Dec 2024 00:03:58 GMT",https://www.bbc.com/news/articles/cx2wk0k4597o#9,https://www.bbc.com/news/articles/cx2wk0k4597o,Three children have not been back to the Belfa...


In [None]:
# combine for title and text values for feature
df_bbc['text'] = df_bbc['title'] + ' ' + df_bbc['description']

In [10]:
df_bbc.head()

Unnamed: 0,title,pubDate,guid,link,description,text
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...,Ukraine: Angry Zelensky vows to punish Russian...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as...",War in Ukraine: Taking cover in a town under a...
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...,Ukraine war 'catastrophic for global food' One...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...,Manchester Arena bombing: Saffie Roussos's par...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...,Ukraine conflict: Oil price soars to highest l...


In [None]:
# determine length of word
df_bbc['word_in_description'] = df_bbc['text'].str.split().str.len()

In [None]:
# df_bbc['word_in_description']

0        24
1        28
2        23
3        26
4        26
         ..
42110    30
42111    30
42112    22
42113    23
42114    29
Name: word_in_description, Length: 39653, dtype: int64

In [13]:
df_bbc.head(2)

Unnamed: 0,title,pubDate,guid,link,description,text,word_in_description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...,Ukraine: Angry Zelensky vows to punish Russian...,24
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as...",War in Ukraine: Taking cover in a town under a...,28


# Training Pipelines 

In [None]:
# NLP Processing 
nlp = spacy.load("en_core_web_sm")

from spacy.lang.en.stop_words import STOP_WORDS

stopwords = list(STOP_WORDS)

In [15]:
# test = df_bbc['text'][31]
# doc = nlp(test)
# print(doc.text)

# for token in doc:
#     print(token.text, token.pos_)

In [16]:
test = df_bbc['text'][0]
doc = nlp(test)
print(doc.text)

for ent in doc.ents:
    if ent.label_ == "NORP":
        print(ent.text, "->", ent.label_)

Ukraine: Angry Zelensky vows to punish Russian atrocities The Ukrainian president says the country will not forgive or forget those who murder its civilians.
Russian -> NORP
Ukrainian -> NORP


In [17]:
# def preprocessing(text):
#     stopwords = set(STOP_WORDS)  # build once for O(1) lookup
#     exclude_pos = {"PUNCT", "SYM", "SPACE"}  # predefine excluded POS
#     doc = nlp(text)
#     clean_tokens = [
#         token.lemma_.lower()
#         for token in doc
#         if token.text.lower() not in stopwords and token.pos_ not in exclude_pos
#     ]
#     return " ".join(clean_tokens)

In [None]:
# Normalize everything to lowercase once
abbrev_countries = {c.lower() for c in {
    "USA", "UK", "UAE", "DRC", "ROC", "China", "Russia", 
    "S. Korea", "South Korea", "N. Korea", "North Korea", 
    "Ivory Coast", "Syria", "Palestine", "Holland", "Burma", 
    "Zaire", "Kosovo", "Hong Kong", "Macau", "Scotland", 
    "Wales", "England", "Northern Ireland", "Taiwan"
}}

# Set of all official ISO-recognized country names (normalized to lowercase)
official_countries = {c.name.lower() for c in pycountry.countries}

def get_countries(text):
    """
    Extract countries from a given text using spaCy NER + pycountry.
    
    Args:
        text (str): Input text to analyze.
    
    Returns:
        list: Unique list of country names or abbreviations mentioned in the text.
    
    Process:
    1. Run spaCy's NLP pipeline on the text.
    2. Collect named entities labeled as 'GPE' (Geo-Political Entity).
    3. Keep only those that match official countries or known abbreviations.
    4. Return a unique list of matches (case-insensitive).
    """
    # Process text with spaCy model
    doc = nlp(text)

    # Use a set to avoid duplicates, then convert to list
    return list({
        ent.text.strip()
        for ent in doc.ents
        if ent.label_ == "GPE" and ent.text.lower() in official_countries.union(abbrev_countries) # only consider Geo-Political Entities
    })


In [None]:
# get nationalities

def get_nationalities(text):
    doc = nlp(text) # Process text with spaCy model
    
    return list({
        ent.text.lower().strip()
        for ent in doc.ents
        if ent.label_ == "NORP"
    })


In [None]:
# Initialize geonamescache
gc = geonamescache.GeonamesCache()

# Precompute a set of all city names (lowercased) for fast membership lookup
# Using a set makes "is this a city?" checks O(1) instead of O(n).
all_cities = {city['name'].lower() for city in gc.get_cities().values()}  # precompute once

def get_cities(text):
    """
    Extract cities from a given text using spaCy NER + geonamescache lookup.

    Args:
        text (str): Input text to analyze.

    Returns:
        list: List of detected city names (lowercased) found in the text.

    Process:
    1. Run spaCy's NLP pipeline on the text.
    2. Collect named entities from spaCy.
    3. Check if the entity text matches a known city in geonamescache.
    4. Return a list of all matches (duplicates may occur if cities repeat).
    """
    # run through spacy model 
    doc = nlp(text)
    # Collect entities that match known city names
    return [ent.text.lower() for ent in doc.ents if ent.text.lower() in all_cities]
        

In [21]:
# df_bbc['processed_desc'] = df_bbc['text'].map(preprocessing)

In [None]:
# extract countries 
df_bbc['countries'] = df_bbc['text'].map(get_countries)

In [None]:
# extract nationalities / ideologies 
df_bbc['nationalities'] = df_bbc['text'].map(get_nationalities)

In [None]:
# extract cities 
df_bbc['cities'] = df_bbc['text'].map(get_cities)

In [25]:
# remove empty sets 
df_bbc['countries'] = df_bbc['countries'].apply(lambda x: str(x) if x else '' )
df_bbc['nationalities'] = df_bbc['nationalities'].apply(lambda x: str(x) if x else '' )
df_bbc['cities'] = df_bbc['cities'].apply(lambda x: str(x) if x else '' )

In [26]:
# df_bbc = df_bbc[['text', 'processed_desc', 'countries', 'nationalities', 'cities' ]]
df_bbc = df_bbc[['text', 'countries', 'nationalities', 'cities' ]]

In [27]:
df_bbc.head(2)

Unnamed: 0,text,countries,nationalities,cities
0,Ukraine: Angry Zelensky vows to punish Russian...,['Ukraine'],"['ukrainian', 'russian']",
1,War in Ukraine: Taking cover in a town under a...,['Ukraine'],['russian'],['irpin']


In [36]:
cols_to_split = ['countries', 'nationalities', 'cities']

def safe_split(x):
    if pd.isna(x):               # None/NaN → empty list
        return []
    if isinstance(x, list):      # already list → leave it
        return x
    if isinstance(x, str):       # split string by comma
        return [i.strip() for i in x.split(",") if i.strip() != ""]
    return []

for col in cols_to_split:
    df_bbc[col] = df_bbc[col].apply(safe_split)


In [37]:
# ​​Function to pad lists to the same length
def pad_lists(row):
    max_len = max(len(row['countries']), len(row['nationalities']), len(row['cities']))
    row['countries'] += [None] * (max_len - len(row['countries']))
    row['nationalities'] += [None] * (max_len - len(row['nationalities']))
    row['cities'] += [None] * (max_len - len(row['cities']))
    return row

# Apply the padding function to each row
df_bbc = df_bbc.apply(pad_lists, axis=1)

# Now, safely explode both columns
df_bbc = df_bbc.explode(['countries','nationalities', 'cities'])

In [38]:
df_bbc.head()

Unnamed: 0,text,countries,nationalities,cities
0,Ukraine: Angry Zelensky vows to punish Russian...,['Ukraine'],['ukrainian',
0,Ukraine: Angry Zelensky vows to punish Russian...,,'russian'],
1,War in Ukraine: Taking cover in a town under a...,['Ukraine'],['russian'],['irpin']
2,Ukraine war 'catastrophic for global food' One...,['Ukraine'],,
3,Manchester Arena bombing: Saffie Roussos's par...,,,


In [39]:
# df_bbc.replace("np.nan", "", inplace=True)


In [40]:
df_bbc.countries = df_bbc.countries.str.replace("[", "").str.replace("]", "").str.replace("'", "").str.replace(",", "")
df_bbc.nationalities = df_bbc.nationalities.str.replace("[", "").str.replace("]", "").str.replace("'", "").str.replace(",", "")
df_bbc.cities = df_bbc.cities.str.replace("[", "").str.replace("]", "").str.replace("'", "").str.replace(",", "")


In [41]:
df_bbc.fillna("", inplace=True)

In [42]:
# capitalize nationality values for better exploration
df_bbc.nationalities = df_bbc.nationalities.str.capitalize()
df_bbc.cities = df_bbc.cities.str.capitalize()

In [43]:
df_bbc.countries.value_counts()

countries
                      31172
UK                     2379
England                2036
Ukraine                1800
Russia                  992
                      ...  
Sierra Leone              1
Eritrea                   1
Burundi                   1
Dominican Republic        1
Equatorial Guinea         1
Name: count, Length: 157, dtype: int64

In [44]:
df_bbc.replace('USA', 'United States', inplace=True)

57

In [45]:
df_bbc.to_csv('dataset/new_data.csv', index=False)

In [None]:
# df_bbc["notice"] = df_bbc["text"].apply(
#     lambda x: "Found" if "usa" in x.lower() else "Not Found"
# )
