In [2]:
#!pip install pycountry
#!pip install plotly==5.18.0

In [3]:
import json
import pandas as pd
import pycountry as pc
import plotly.express as px

In [4]:
def cleanCountryName(country: str) -> str:
    '''
    Cleans up the scraped name with proper capitalization
    Parameters:
        country: scraped name of the country, with its words separated
                 by underscores and/or whitespace
    Returns:
        the country name properly capitalized and formatted, or an
        empty string when the name contains no words
    '''
    # Connecting words are kept lowercase ("isle_of_man" -> "Isle of Man")
    lowercase_words = ("of", "the", "and")

    words = country.replace("_", " ").split()
    new_name = [
        word if word in lowercase_words else word.capitalize()
        for word in words
    ]

    # Drop an article left dangling at the end of the name
    # ("bahamas_the" -> "Bahamas"). Checking that the list is non-empty
    # keeps blank input from raising an IndexError.
    if new_name and new_name[-1] == "the":
        new_name = new_name[:-1]

    return " ".join(new_name)

def sortByCountry(lang_data: "dict[list[dict]]", countries: "list[str] | str" = "all") -> "list[dict]":
    '''
    Collects the scraped JSON language data for the requested countries,
    in the order they appear in lang_data.
    Leaving countries as "all" returns the data for every country
    Parameters:
        lang_data: the webscraped language data dictionary
        countries: a cleaned country name, a list of cleaned country
                   names, or "all"
    Returns:
        list of {"country": cleaned name, "langs": language data}
        dicts, one per selected country
    '''
    include_all = countries == "all"

    # A single country name is treated as a one-element list
    if isinstance(countries, str) and not include_all:
        countries = [countries]

    cleaned_pairs = (
        (cleanCountryName(raw_name), langs)
        for raw_name, langs in lang_data.items()
    )

    return [
        {"country": cleaned_name, "langs": langs}
        for cleaned_name, langs in cleaned_pairs
        if include_all or cleaned_name in countries
    ]

def sortByLang(lang_data: "dict[list[dict]]", language: str) -> "list[dict]":
    '''
    Finds all of the countries with the given language and
    returns a list of them, with their associated percentages
    Parameters:
        lang_data: the webscraped language data dictionary, mapping each
                   scraped country name to its list of language dicts
                   (each language dict must have a "name" key)
        language:  string of the language to find data for; matched as a
                   case-sensitive substring of each language name
    Returns:
        list of dicts of countries with the given language data; each
        entry is a copy of the matching language dict with an added
        "country" key holding the cleaned country name
    Raises:
        ValueError if there are no countries with the given language
    '''
    countries = []
    for raw_country, langs in lang_data.items():
        for lang in langs:
            if language in lang["name"]:
                # Build a new dict rather than writing the "country" key
                # into the caller's data, so lang_data is left untouched
                # and repeated calls always start from the same input.
                countries.append({**lang, "country": cleanCountryName(raw_country)})

    if not countries:
        raise ValueError("No countries have speakers of that language")

    return countries

In [27]:
def countryToCode(country_data: "list[dict]") -> "list[dict]":
    '''
    Uses the pycountry module to convert the country names
    into their ISO 3166-1 alpha-3 codes for plotting
    Parameters:
        country_data: the list of country dictionaries, each with a
                      "country" key holding the country name
    Returns:
        a new list of copies of the dicts, each with an added "iso"
        key pair; countries pycountry cannot match are left out
    '''
    new_data = []

    for country in country_data:
        name = country["country"]

        try:
            # The first fuzzy-search result is taken as the match
            match = pc.countries.search_fuzzy(name)[0]
        except LookupError:
            # pycountry found no match for this name. Skip it so the
            # remaining countries can still be plotted; any other kind
            # of error is a real problem and is allowed to propagate.
            continue

        # Copy instead of writing "iso" into the caller's dict
        new_data.append({**country, "iso": match.alpha_3})

    return new_data

def langChloropleth(language: str, lang_data: "dict[str, list[dict]]") -> None:
    '''
    Plots the choropleth map of the given language
    Parameters:
        language:  string of the language name (capitalized before it
                   is searched for)
        lang_data: the opened JSON file with the language data, keyed
                   by scraped country name
    Returns:
        None, displays the choropleth map
    Raises:
        ValueError if no country has the language, or if none of the
        matching countries could be converted to an ISO code
    '''
    # Percentages above this cannot be real, so they are shown as 0
    max_valid_percent = 100

    language = language.capitalize()

    country_data = sortByLang(lang_data, language)
    country_data = countryToCode(country_data)
    if not country_data:
        raise ValueError("None of the matching countries could be converted to an ISO code")

    country_frame = pd.DataFrame.from_records(country_data, index="name")

    # Missing, blank, or non-numeric percentages all become 0; numeric
    # strings such as "58.1" are converted so the comparison below works.
    country_frame["percent"] = pd.to_numeric(country_frame["percent"], errors="coerce").fillna(0)
    country_frame = country_frame.rename(columns={"percent": "Speaker %"})
    country_frame.loc[country_frame["Speaker %"] > max_valid_percent, "Speaker %"] = 0

    # Only ask plotly for the note column when the data actually has one
    hover_columns = ["note"] if "note" in country_frame.columns else None

    fig = px.choropleth(country_frame, locations="iso",
                    color="Speaker %",
                    hover_name="country",
                    hover_data=hover_columns,
                    title=f"Countries with {language} speakers",
                    color_continuous_scale=px.colors.sequential.Plasma,
                    width=1000,
                    height=700)

    fig.add_annotation(x=0.55, y=0, text="0 means no percentage is available, but the language is spoken there", showarrow=False, font=dict(color="black", size=10))

    fig.show()

In [28]:
def main() -> None:
    '''
    Loads the scraped language data from "language_data.json" (looked
    up relative to the current working directory) and displays the
    choropleth map of English speakers
    '''
    # JSON text is UTF-8 by convention; naming the encoding keeps the
    # load from depending on the platform's default text encoding.
    with open("language_data.json", "r", encoding="utf-8") as read_file:
        lang_data = json.load(read_file)

    langChloropleth("english", lang_data)

In [29]:
# Run the notebook: load language_data.json and show the map for English
main()

SubdivisionHierarchy(code='US-AS', country_code='US', name='American Samoa', parent_code=None, type='Outlying area')
SubdivisionHierarchy(code='NL-AW', country_code='NL', name='Aruba', parent_code=None, type='Country')
SubdivisionHierarchy(code='BZ-BZ', country_code='BZ', name='Belize', parent_code=None, type='District')
SubdivisionHierarchy(code='NL-CW', country_code='NL', name='Curaçao', parent_code=None, type='Country')
SubdivisionHierarchy(code='US-GU', country_code='US', name='Guam', parent_code=None, type='Outlying area')
SubdivisionHierarchy(code='BE-WLX', country_code='BE', name='Luxembourg', parent='WAL', parent_code='BE-WAL', type='Province')
SubdivisionHierarchy(code='LU-LU', country_code='LU', name='Luxembourg', parent_code=None, type='Canton')
SubdivisionHierarchy(code='US-MP', country_code='US', name='Northern Mariana Islands', parent_code=None, type='Outlying area')
SubdivisionHierarchy(code='PA-8', country_code='PA', name='Panamá', parent_code=None, type='Province')
Sub