# Data Exploration and Geographical Representation

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import geonamescache
import geopandas as gpd
# from shapely.geometry import Point

import plotly.express as px
# import country_converter as coco

In [18]:
# Load the extracted-entities dataset and preview its first rows
DATA_PATH = 'dataset/new_data.csv'
df_exp = pd.read_csv(DATA_PATH)
df_exp.head()

Unnamed: 0,text,countries,nationalities,cities
0,Ukraine: Angry Zelensky vows to punish Russian...,Ukraine,Ukrainian,
1,Ukraine: Angry Zelensky vows to punish Russian...,,Russian,
2,War in Ukraine: Taking cover in a town under a...,Ukraine,Russian,Irpin
3,Ukraine war 'catastrophic for global food' One...,Ukraine,,
4,Manchester Arena bombing: Saffie Roussos's par...,,,


In [19]:
# Some values in 'nationalities' differ only by a plural "s"
# (e.g. "Russian" / "Russians", "Ukrainian" / "Ukrainians").
# Remove ONE trailing "s" so the variants collapse into a single label.
#
# str.rstrip('s') is not used because it strips *every* trailing "s" and would
# turn "Swiss" into "Swi"; the negative look-behind leaves a double "s" alone.
# NOTE: names that legitimately end in a single "s" are still shortened.
df_exp['nationalities'] = df_exp['nationalities'].str.replace(r'(?<!s)s$', '', regex=True)

In [20]:
# Dataset dimensions as (rows, columns)
df_exp.shape

(46938, 4)

In [21]:
# Number of missing values in each column
df_exp.isna().sum()

text                 0
countries        31172
nationalities    38873
cities           35527
dtype: int64

In [22]:
# "Un" stands for the United Nations, so relabel it as "UN".
# Series.replace matches whole cell values only; a substring replace via
# .str.replace('Un', 'UN') would also corrupt names such as "Union City".
df_exp['cities'] = df_exp['cities'].replace('Un', 'UN')

In [None]:
# # Mapping of non-standard → official country names
# country_fix = {
#     "UK": "United Kingdom",
#     "USA": "United States",
#     "UAE": "United Arab Emirates",
#     "South Korea": "Korea, Republic of",
#     "North Korea": "Korea, Democratic People's Republic of",
#     }

# # Apply fixes
# df_exp["countries"] = df_exp["countries"].replace(country_fix)

In [24]:
# list(df_exp.cities.unique())

#### Geographical Map of Countries around the Globe

In [25]:
# Count how often each country is mentioned across the articles
dfc = df_exp.countries.value_counts().reset_index()
dfc.columns = ['country', 'count']

# geonamescache supplies the ISO-3 code that the choropleth uses as location key
gc = geonamescache.GeonamesCache()
countries_dict = gc.get_countries()

# Convert dict → DataFrame; .copy() so adding a column below does not write to a slice
gcountries = pd.DataFrame.from_dict(countries_dict, orient="index")
gcountries = gcountries[["name", "iso3"]].copy()

# Merge on lower-cased names so the match is case-insensitive.
# Countries whose spelling differs from geonamescache keep a NaN iso3.
dfc["country_lower"] = dfc["country"].str.lower()
gcountries["name_lower"] = gcountries["name"].str.lower()

merged = dfc.merge(gcountries, left_on="country_lower", right_on="name_lower", how="left")
merged = merged.drop_duplicates(subset=['country']).reset_index(drop=True)

plot = px.choropleth(
    merged,
    locations="iso3",
    color="count",
    color_continuous_scale="ylorrd",
    title="News Coverage Around the Globe",
)

# Fixed figure size and projection; dragging/zooming is disabled
plot.update_layout(
    width=1000,   # set figure width
    height=600,   # set figure height
    geo=dict(
        projection_type="natural earth",
        showframe=False,
        showcoastlines=True,
        showcountries=True,
        projection_scale=1,  # keep globe size constant
        center=dict(lat=0, lon=0),  # keep centered
    ),
    dragmode=False,
)
plot.update_geos(fitbounds="locations", visible=False)

plot.show()


* News coverage was concentrated most heavily on Ukraine and Russia, which is understandable given the ongoing war between the two countries.

* Australia, India and China were also covered extensively by the BBC.

In [26]:
# Distinct country names present in the data.
# Missing values are dropped first so that NaN is never treated as a valid country
# by the membership checks in the plotting functions.
country_list = set(df_exp['countries'].dropna().unique())

#### Map of Cities Mentioned with a Particular Country

In [30]:
# Preview every article tagged with the United Kingdom
df_exp[df_exp['countries'] == 'United Kingdom']

Unnamed: 0,text,countries,nationalities,cities
8,Ukraine war: UK grants 50 Ukrainian refugee vi...,United Kingdom,Ukrainian,
71,Covid-19 in the UK: How many coronavirus cases...,United Kingdom,,
74,What are the UK's 'Living with Covid' plans? A...,United Kingdom,,
76,Covid: Five things we still need to keep an ey...,United Kingdom,,
78,What are the latest rules for face coverings a...,United Kingdom,,
...,...,...,...,...
46902,'Pickling a cucumber changed my life' says Tik...,United Kingdom,,
46910,Why has an additive called Bovaer sparked cont...,United Kingdom,,
46919,"Massive Elon Musk donation news to me, says Fa...",United Kingdom,,
46923,Migrants brought to UK from remote military is...,United Kingdom,,


In [31]:
def select_country_map(country):
    """
    Generate a geo-scatter plot of the cities mentioned with a country that lie
    inside that country.

    Steps:
    1. Check that the country occurs in the dataset (``country_list``).
    2. Resolve the country's ISO code through geonamescache.
    3. Count how many times each city appears in rows tagged with the country.
    4. Match city names (case-insensitively) against the geonamescache cities
       of the selected country to obtain latitude/longitude.
    5. If multiple entries for a city exist, keep the one with the largest population.
    6. Plot the matched cities on a world map with bubble sizes proportional to frequency.

    Parameters
    ----------
    country : str
        Country name as it appears in ``df_exp['countries']``.

    Returns
    -------
    None
        The figure is shown; a message is printed instead when the country is
        unknown or none of its cities can be located.
    """
    if country not in country_list:
        print(f"{country} not in records!")
        return

    gc = geonamescache.GeonamesCache()

    # Resolve the ISO country code (e.g. "UA" for Ukraine) before building the
    # large city table, so an unknown country fails fast.
    country_code = next(
        (
            code
            for code, info in gc.get_countries().items()
            if info["name"].lower() == country.lower()
        ),
        None,
    )
    if not country_code:
        print(f"Country '{country}' not found in geonamecache.")
        return

    # Count how many times each city is mentioned for the given country
    dfc = df_exp.loc[df_exp['countries'] == country, 'cities'].value_counts().reset_index()
    dfc.columns = ['city', 'count']
    dfc["city_cap"] = dfc["city"].str.capitalize()  # case-normalised merge key

    # geonamescache cities restricted to the selected country;
    # .copy() so the merge key below is not written to a slice
    gcities = pd.DataFrame.from_dict(gc.get_cities(), orient="index")
    gcities = gcities.loc[
        gcities["countrycode"] == country_code,
        ["name", "countrycode", "latitude", "longitude", "population"],
    ].copy()
    gcities["name_cap"] = gcities["name"].str.capitalize()

    merged = dfc.merge(gcities, left_on="city_cap", right_on="name_cap", how="left")

    # If a city has multiple entries, keep the one with the largest population
    merged = merged.sort_values(["city", "population"], ascending=[True, False])
    merged = merged.drop_duplicates(subset=["city"])

    # Cities without a geonamescache match have no coordinates and cannot be drawn
    merged = merged.dropna(subset=["latitude", "longitude"]).reset_index(drop=True)

    if merged.empty:
        print(f"{country} correlations could not be sourced!")
        return

    # Create a scatter plot on a world map
    plot = px.scatter_geo(
        merged,
        lat='latitude',
        lon='longitude',
        size="count",           # bubble size ~ frequency
        color="name",           # color-coded by city name
        title=f"Cities mentioned within {country}"
    )

    # Customize the layout for better visuals
    plot.update_layout(
        width=1000,   # figure width
        height=600,   # figure height
        geo=dict(
            projection_type="natural earth",
            showframe=False,
            showcoastlines=True,
            showcountries=True,
            projection_scale=1,   # zoom level
            center=dict(lat=0, lon=0)  # keep centered on the globe
        ),
        dragmode=False  # disable dragging/zooming
    )

    plot.show()


# Example usage
select_country_map('United Kingdom')


In [89]:
def select_global_country_map(country):
    """
    Generate a geo-scatter plot of the cities mentioned with a country,
    located anywhere in the world.

    Steps:
    1. Check that the country occurs in the dataset (``country_list``).
    2. Count how many times each city appears in rows tagged with the country.
    3. Match city names (case-insensitively) against ALL geonamescache cities
       to obtain latitude/longitude.
    4. If multiple entries for a city exist, keep the one with the largest population.
    5. Plot the matched cities on a world map with bubble sizes proportional to frequency.

    The cities are not filtered by country, so no ISO-code lookup is needed and
    countries that geonamescache spells differently can still be plotted.

    Parameters
    ----------
    country : str
        Country name as it appears in ``df_exp['countries']``.

    Returns
    -------
    None
        The figure is shown; a message is printed instead when the country is
        unknown or none of its cities can be located.
    """
    if country not in country_list:
        print(f"{country} not in records")
        return

    # Count how many times each city is mentioned for the given country
    dfc = df_exp.loc[df_exp['countries'] == country, 'cities'].value_counts().reset_index()
    dfc.columns = ['city', 'count']
    dfc["city_cap"] = dfc["city"].str.capitalize()  # case-normalised merge key

    # Worldwide geonamescache city table;
    # .copy() so the merge key below is not written to a slice
    gc = geonamescache.GeonamesCache()
    gcities = pd.DataFrame.from_dict(gc.get_cities(), orient="index")
    gcities = gcities[["name", "countrycode", "latitude", "longitude", "population"]].copy()
    gcities["name_cap"] = gcities["name"].str.capitalize()

    # Match against cities of every country (no country filter here)
    merged = dfc.merge(gcities, left_on="city_cap", right_on="name_cap", how="left")

    # If a city has multiple entries, keep the one with the largest population
    merged = merged.sort_values(["city", "population"], ascending=[True, False])
    merged = merged.drop_duplicates(subset=["city"])

    # Cities without a geonamescache match have no coordinates and cannot be drawn
    merged = merged.dropna(subset=["latitude", "longitude"]).reset_index(drop=True)

    if merged.empty:
        print(f"{country} correlations could not be sourced!")
        return

    # Create a scatter plot on a world map
    plot = px.scatter_geo(
        merged,
        lat='latitude',
        lon='longitude',
        size="count",           # bubble size ~ frequency
        color="name",           # color-coded by city name
        title=f"Cities mentioned with {country} Globally"
    )

    # Customize the layout for better visuals
    plot.update_layout(
        width=1000,   # figure width
        height=600,   # figure height
        geo=dict(
            projection_type="natural earth",
            showframe=False,
            showcoastlines=True,
            showcountries=True,
            projection_scale=1,   # zoom level
            center=dict(lat=0, lon=0)  # keep centered on the globe
        ),
        dragmode=False  # disable dragging/zooming
    )

    plot.show()


# Example usage
select_global_country_map('Ukraine')


In [90]:
def select_country(country):
    """
    Plot the top cities mentioned in relation to a given country.

    Steps:
    1. Filter df_exp for rows belonging to the selected country.
    2. Upper-case the first letter of each city name so case variants merge.
    3. Count how many times each city is mentioned.
    4. Keep only the top 15 most frequent cities.
    5. Plot the results as a bar chart.

    Parameters
    ----------
    country : str
        Country name as it appears in ``df_exp['countries']``.

    Returns
    -------
    None
        The figure is shown; a message is printed instead when the country is
        unknown or has no associated cities.
    """
    if country not in country_list:
        # If the given country is not in country_list
        print(f"{country} does not exist in records! ")
        return

    cities = df_exp.loc[df_exp['countries'] == country, 'cities'].dropna()

    # Only the first character is upper-cased. str.capitalize() would also
    # lower-case the rest and mangle names like "New York" or "UN".
    # Normalising before counting lets "kyiv" and "Kyiv" share one bar.
    cities = cities.str[:1].str.upper() + cities.str[1:]

    # value_counts sorts by frequency, so head(15) is the top 15
    dfc = cities.value_counts().head(15).reset_index()
    dfc.columns = ['city', 'count']

    if dfc.empty:
        # If no cities were found for this country
        print(f"{country} correlations could not be sourced! ")
        return

    # Create a bar chart showing top cities mentioned
    fig = px.bar(dfc, x="city", y="count")

    # Customize layout (title, axis labels, hide y-axis ticks)
    fig.update_layout(
        title_text=f"Top Cities mentioned with {country}",
        xaxis_title="Cities",
        yaxis=dict(title=None, showticklabels=False, ticks="")
    )

    # Customize bar color
    fig.update_traces(marker_color='#873260')

    fig.show()


# Example usage
select_country('Ukraine')


## Nationality / Ideology Exploration

##### Which ideologies or nationalities is each country mentioned with?

In [96]:
def select_ideology(country):
    """
    Plot the top ideologies / beliefs / nationalities mentioned
    in connection with a given country.

    Steps:
    1. Filter the main dataframe for the selected country.
    2. Count how many times each 'nationality' is associated with it.
    3. Keep the top 15 most frequent nationalities.
    4. Plot the results as a bar chart.

    Parameters
    ----------
    country : str
        Country name as it appears in ``df_exp['countries']``.

    Returns
    -------
    None
        The figure is shown; a message is printed instead when the country is
        unknown or has no associated nationalities.
    """
    if country not in country_list:
        # If the country is not in country_list, print a warning
        print(f"{country} does not exist in records!")
        return

    # Count nationalities for the chosen country (value_counts sorts by
    # frequency and skips NaN). rename_axis / reset_index(name=...) set the
    # column names explicitly instead of relying on version-specific defaults.
    dfr = (
        df_exp.loc[df_exp['countries'] == country, 'nationalities']
        .value_counts()
        .head(15)  # top 15 only, for readability in the plot
        .rename_axis('nationalities')
        .reset_index(name='count')
    )

    if dfr.empty:
        print(f"{country} correlations could not be sourced!")
        return

    # --- Plotting the bar chart ---
    fig = px.bar(
        dfr,
        x="nationalities",
        y="count"
    )

    # Update layout for better presentation
    fig.update_layout(
        title_text=f"Top Ideologies / Beliefs / Nationalities related to {country}",
        xaxis_title="Ideologies / Beliefs / Nationalities",
        # Remove y-axis label & tick marks for a cleaner look
        yaxis=dict(title=None, showticklabels=False, ticks="")
    )

    # Customize bar color
    fig.update_traces(marker_color='#873260')

    fig.show()


# Example usage
select_ideology('China')


#### Nationalities and Cities per mentioned Country

In [92]:
def nat_city(country, top_n=None):
    """
    Plot the relationship between nationalities/ideologies and cities mentioned
    for a given country.

    Steps:
    1. Filter the main dataframe for rows belonging to the selected country.
    2. Group by 'nationalities' and count how many times each 'city' appears with it.
    3. Sort the combinations by frequency (optionally keep only the ``top_n`` most frequent).
    4. Plot as a bubble chart (scatter plot with bubble size = count).

    Parameters
    ----------
    country : str
        Country name as it appears in ``df_exp['countries']``.
    top_n : int or None, optional
        Maximum number of nationality/city combinations to plot.
        ``None`` (default) plots every combination.

    Returns
    -------
    None
        The figure is shown; a message is printed instead when the country is
        unknown or has no nationality/city combinations.
    """
    if country not in country_list:
        # If country not in list, log a warning
        print(f"{country} does not exist in records!")
        return

    # Group data by nationality → city, then count occurrences.
    # reset_index(name='count') names the count column explicitly, so the
    # result does not depend on the pandas version's default naming.
    dfgl = (
        df_exp[df_exp['countries'] == country]
        .groupby('nationalities')['cities']
        .value_counts()  # counts how often each city appears per nationality
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )

    if top_n is not None:
        dfgl = dfgl.head(top_n)

    if dfgl.empty:
        print(f"{country} correlations relationship could not be sourced!")
        return

    # --- Plot a scatter plot (bubble chart) ---
    fig = px.scatter(
        dfgl,
        x="cities",             # cities on x-axis
        y="nationalities",      # nationalities on y-axis
        size="count"            # bubble size ~ frequency
    )

    # Update layout for readability
    fig.update_layout(
        title_text=f"Nationality-Ideology vs Cities per Country Mentioned ({country})",
        xaxis_title="Cities where mentioned",
        yaxis_title="Nationality-Ideology"
    )

    fig.show()


# Example usage
nat_city("Ukraine")
