In [20]:
#imports
from pathlib import Path
import datetime as dt
import hvplot.pandas
import pandas as pd
from shapely.geometry import Polygon
import geopandas as gpd

# Data Cleaning

### Lat/Lon CSV Cleaning

In [21]:
# name csv path
world_geo_path = Path("world_geo_data.csv")

# create dataframe from csv
world_geo_df =pd.read_csv(world_geo_path)

world_geo_df.head()

Unnamed: 0,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,AD,42.546245,1.601554,Andorra,AK,63.588753,-154.493062,Alaska
1,AE,23.424076,53.847818,United Arab Emirates,AL,32.318231,-86.902298,Alabama
2,AF,33.93911,67.709953,Afghanistan,AR,35.20105,-91.831833,Arkansas
3,AG,17.060816,-61.796428,Antigua and Barbuda,AZ,34.048928,-111.093731,Arizona
4,AI,18.220554,-63.068615,Anguilla,CA,36.778261,-119.417932,California


In [22]:
# rename columns for clarity and consistency
world_geo_df.rename(columns={"country_code": "alpha-2"}, inplace=True)

# drop unnecessary columns
world_geo_df.drop(labels=["usa_state_code",
    "usa_state_latitude",
    "usa_state_longitude",
    "usa_state"
    ], axis=1, inplace=True)

world_geo_df.head()


Unnamed: 0,alpha-2,latitude,longitude,country
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [23]:
# check for nulls
nulls = world_geo_df.isnull().any(axis=1)
null_rows = world_geo_df[nulls]
display(null_rows)

Unnamed: 0,alpha-2,latitude,longitude,country
156,,-22.95764,18.49041,Namibia
226,UM,,,U.S. Minor Outlying Islands


In [24]:
# fix alpha-2 for Namibia
world_geo_df.at[156, "alpha-2"] = "NA"

# check
print(world_geo_df.at[156, "alpha-2"])

# delete row for US Minor Outlying Islands because their geography is not bound by lon/lat
world_geo_df.drop(226, inplace=True)

# check
nulls = world_geo_df.isnull().any(axis=1)
null_rows = world_geo_df[nulls]
display(null_rows)


NA


Unnamed: 0,alpha-2,latitude,longitude,country


In [25]:
# check data types
world_geo_df.dtypes

alpha-2       object
latitude     float64
longitude    float64
country       object
dtype: object

### World Inflation CSV Cleaning

In [26]:
# name csv path
world_inflation_path = Path("world_inflation_data.csv")

# create dataframe from csv, skipping blank rows and naming the header
world_inflation_df =pd.read_csv(world_inflation_path, header=2, skiprows=[0, 1])

world_inflation_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,,,,,,,...,-2.372065,0.421441,0.474764,-0.931196,-1.028282,3.626041,4.257462,,,
1,Africa Eastern and Southern,AFE,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,,,,,,,...,5.750981,5.37029,5.245878,6.571396,6.399343,4.720811,4.653665,7.321106,6.824727,10.773751
2,Afghanistan,AFG,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,,,,,,,...,7.385772,4.673996,-0.661709,4.383892,4.975952,0.626149,2.302373,,,
3,Africa Western and Central,AFW,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,,,,,,,...,2.439201,1.768436,2.130817,1.487416,1.764635,1.78405,1.760112,2.437609,3.653533,7.967574
4,Angola,AGO,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,,,,,,,...,8.777814,7.280387,9.35384,30.698958,29.842578,19.630594,17.079704,22.271564,25.754266,


In [27]:
# drop unnecessary columns
world_inflation_df.drop(columns=["Indicator Name", "Indicator Code"], inplace=True)

# rename country_code column for clarity and consistency
world_inflation_df.rename(columns={"Country Code": "alpha-3", "Country Name":"country"}, inplace=True)

world_inflation_df.head()

Unnamed: 0,country,alpha-3,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,,,,,,,,,...,-2.372065,0.421441,0.474764,-0.931196,-1.028282,3.626041,4.257462,,,
1,Africa Eastern and Southern,AFE,,,,,,,,,...,5.750981,5.37029,5.245878,6.571396,6.399343,4.720811,4.653665,7.321106,6.824727,10.773751
2,Afghanistan,AFG,,,,,,,,,...,7.385772,4.673996,-0.661709,4.383892,4.975952,0.626149,2.302373,,,
3,Africa Western and Central,AFW,,,,,,,,,...,2.439201,1.768436,2.130817,1.487416,1.764635,1.78405,1.760112,2.437609,3.653533,7.967574
4,Angola,AGO,,,,,,,,,...,8.777814,7.280387,9.35384,30.698958,29.842578,19.630594,17.079704,22.271564,25.754266,


In [28]:
# check data types
world_inflation_df.dtypes

country     object
alpha-3     object
1960       float64
1961       float64
1962       float64
            ...   
2018       float64
2019       float64
2020       float64
2021       float64
2022       float64
Length: 65, dtype: object

### ISO2/ISO3 CSV Cleaning

In [29]:
# path
country_info_path = Path("country_info.csv")

# create dataframe from csv
country_info_df = pd.read_csv(country_info_path)

country_info_df.head()


Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [30]:
# keep necessary columns
country_info_df = country_info_df[["name", "alpha-2", "alpha-3"]]

# change column names for consistency and clarity
country_info_df.rename(columns={"name":"country"}, inplace=True)

country_info_df.head()

Unnamed: 0,country,alpha-2,alpha-3
0,Afghanistan,AF,AFG
1,Åland Islands,AX,ALA
2,Albania,AL,ALB
3,Algeria,DZ,DZA
4,American Samoa,AS,ASM


In [31]:
# check for nulls
nulls = country_info_df.isnull().any(axis=1)
null_rows = country_info_df[nulls]
display(null_rows)

Unnamed: 0,country,alpha-2,alpha-3
153,Namibia,,NAM


In [32]:
# fix issue with Namibia alpha-2 (NaN)
country_info_df.at[153, "alpha-2"] = "NA"
# check
print(country_info_df.at[153, "alpha-2"])

NA


In [33]:
# check datatypes
country_info_df.dtypes

country    object
alpha-2    object
alpha-3    object
dtype: object

# Combining Dataframes

In [34]:
# view dataframes together
display(country_info_df.head())
display(world_inflation_df.head())
display(world_geo_df.head())

Unnamed: 0,country,alpha-2,alpha-3
0,Afghanistan,AF,AFG
1,Åland Islands,AX,ALA
2,Albania,AL,ALB
3,Algeria,DZ,DZA
4,American Samoa,AS,ASM


Unnamed: 0,country,alpha-3,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,,,,,,,,,...,-2.372065,0.421441,0.474764,-0.931196,-1.028282,3.626041,4.257462,,,
1,Africa Eastern and Southern,AFE,,,,,,,,,...,5.750981,5.37029,5.245878,6.571396,6.399343,4.720811,4.653665,7.321106,6.824727,10.773751
2,Afghanistan,AFG,,,,,,,,,...,7.385772,4.673996,-0.661709,4.383892,4.975952,0.626149,2.302373,,,
3,Africa Western and Central,AFW,,,,,,,,,...,2.439201,1.768436,2.130817,1.487416,1.764635,1.78405,1.760112,2.437609,3.653533,7.967574
4,Angola,AGO,,,,,,,,,...,8.777814,7.280387,9.35384,30.698958,29.842578,19.630594,17.079704,22.271564,25.754266,


Unnamed: 0,alpha-2,latitude,longitude,country
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [35]:
# merge dataframes (merge instead of join because of dissimilar indexes)
combined_df = country_info_df.merge(world_geo_df, how="inner", on="alpha-2")
combined_df = combined_df.merge(world_inflation_df, how="inner", on="alpha-3")
combined_df

Unnamed: 0,country_x,alpha-2,alpha-3,latitude,longitude,country_y,country,1960,1961,1962,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Afghanistan,AF,AFG,33.939110,67.709953,Afghanistan,Afghanistan,,,,...,7.385772,4.673996,-0.661709,4.383892,4.975952,0.626149,2.302373,,,
1,Albania,AL,ALB,41.153332,20.168331,Albania,Albania,,,,...,1.937621,1.625865,1.896174,1.275432,1.986661,2.028060,1.411091,1.620887,2.041472,6.725203
2,Algeria,DZ,DZA,28.033886,1.659626,Algeria,Algeria,,,,...,3.254239,2.916927,4.784447,6.397695,5.591116,4.269990,1.951768,2.415131,7.226063,9.265516
3,American Samoa,AS,ASM,-14.270972,-170.132217,American Samoa,American Samoa,,,,...,,,,,,,,,,
4,Andorra,AD,AND,42.546245,1.601554,Andorra,Andorra,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,Virgin Islands (British),VG,VGB,18.420695,-64.639968,British Virgin Islands,British Virgin Islands,,,,...,,,,,,,,,,
207,Virgin Islands (U.S.),VI,VIR,18.335765,-64.896335,U.S. Virgin Islands,Virgin Islands (U.S.),,,,...,,,,,,,,,,
208,Yemen,YE,YEM,15.552727,48.516388,Yemen,"Yemen, Rep.",,,,...,10.968442,8.104726,,,,,,,,
209,Zambia,ZM,ZMB,-13.133897,27.849332,Zambia,Zambia,,,,...,6.977676,7.806876,10.110593,17.869730,6.577312,7.494572,9.150316,15.733060,22.020768,10.993204


In [36]:
# drop duplicate columns
combined_df.drop(columns= ["country_y", "country", "alpha-2", "alpha-3"], inplace=True)

# rename country_x
combined_df.rename(columns={"country_x":"country"}, inplace=True)

# assign index
combined_df.set_index("country", inplace=True)

# sort by index
combined_df.sort_index(inplace=True)

# check
combined_df


Unnamed: 0_level_0,latitude,longitude,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,33.939110,67.709953,,,,,,,,,...,7.385772,4.673996,-0.661709,4.383892,4.975952,0.626149,2.302373,,,
Albania,41.153332,20.168331,,,,,,,,,...,1.937621,1.625865,1.896174,1.275432,1.986661,2.028060,1.411091,1.620887,2.041472,6.725203
Algeria,28.033886,1.659626,,,,,,,,,...,3.254239,2.916927,4.784447,6.397695,5.591116,4.269990,1.951768,2.415131,7.226063,9.265516
American Samoa,-14.270972,-170.132217,,,,,,,,,...,,,,,,,,,,
Andorra,42.546245,1.601554,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Virgin Islands (British),18.420695,-64.639968,,,,,,,,,...,,,,,,,,,,
Virgin Islands (U.S.),18.335765,-64.896335,,,,,,,,,...,,,,,,,,,,
Yemen,15.552727,48.516388,,,,,,,,,...,10.968442,8.104726,,,,,,,,
Zambia,-13.133897,27.849332,,,,,,,,,...,6.977676,7.806876,10.110593,17.869730,6.577312,7.494572,9.150316,15.733060,22.020768,10.993204


In [37]:
combined_df.dtypes

latitude     float64
longitude    float64
1960         float64
1961         float64
1962         float64
              ...   
2018         float64
2019         float64
2020         float64
2021         float64
2022         float64
Length: 65, dtype: object

In [38]:
# save whole df as csv
combined_df.to_csv("combined_inflation_cleaned_data.csv")

# save usa inflation data as df then csv for future use
usa_inflation_df = pd.DataFrame(combined_df.loc["United States of America", :])
usa_inflation_df.drop(labels=["latitude", "longitude"], inplace=True)
usa_inflation_df.rename(columns={"United States of America":"inflation rate"}, inplace=True)
usa_inflation_df.to_csv("usa_inflation_cleaned_data.csv")


# Data Visualizations

In [None]:
map = combined_df.hvplot.points(
    "longitude",
    "latitude",
    geo=True,
    tiles="CartoLight",
    size="inflation_rate",
    color="inflation_rate",
    frame_width=700,
    frame_height=500,
    title="World Inflation by Country in 2022",
    hover_cols=["country", "inflation_rate"]
)

map

In [None]:
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world

In [None]:
world.set_index("name", inplace=True)

In [None]:
new_df = world.merge(combined_df, how='left', left_on='name', right_on='country')

In [None]:
new_df

In [None]:
# find nulls
# null mask
# attributed resource: https://saturncloud.io/blog/python-pandas-selecting-rows-whose-column-value-is-null-none-nan/#:~:text=The%20simplest%20way%20to%20select,rows%20that%20have%20null%20values.
nulls = new_df.isnull().any(axis=1)
null_rows = new_df[nulls]
null_rows

In [None]:
new_df.hvplot(
    geo=True,
    c="2022",
)

In [None]:
poly_combined.hvplot(
    
    
)