In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# name csv path
world_geo_path = Path("world_geo_data.csv")

# read csv
world_geo_df =pd.read_csv(world_geo_path)

world_geo_df.head()

Unnamed: 0,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,AD,42.546245,1.601554,Andorra,AK,63.588753,-154.493062,Alaska
1,AE,23.424076,53.847818,United Arab Emirates,AL,32.318231,-86.902298,Alabama
2,AF,33.93911,67.709953,Afghanistan,AR,35.20105,-91.831833,Arkansas
3,AG,17.060816,-61.796428,Antigua and Barbuda,AZ,34.048928,-111.093731,Arizona
4,AI,18.220554,-63.068615,Anguilla,CA,36.778261,-119.417932,California


In [3]:
# set index
world_geo_df.set_index("country", inplace=True)

# sort index
world_geo_df.sort_index(inplace=True)

world_geo_df.head()

Unnamed: 0_level_0,country_code,latitude,longitude,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,AF,33.93911,67.709953,AR,35.20105,-91.831833,Arkansas
Albania,AL,41.153332,20.168331,CO,39.550051,-105.782067,Colorado
Algeria,DZ,28.033886,1.659626,,,,
American Samoa,AS,-14.270972,-170.132217,HI,19.898682,-155.665857,Hawaii
Andorra,AD,42.546245,1.601554,AK,63.588753,-154.493062,Alaska


In [4]:
# drop unnecessary columns
world_geo_df.drop(labels=["usa_state_code",
    "usa_state_latitude",
    "usa_state_longitude",
    "usa_state"
    ], axis=1, inplace=True)

display(world_geo_df.head())
display(world_geo_df.tail())

Unnamed: 0_level_0,country_code,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,AF,33.93911,67.709953
Albania,AL,41.153332,20.168331
Algeria,DZ,28.033886,1.659626
American Samoa,AS,-14.270972,-170.132217
Andorra,AD,42.546245,1.601554


Unnamed: 0_level_0,country_code,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Wallis and Futuna,WF,-13.768752,-177.156097
Western Sahara,EH,24.215527,-12.885834
Yemen,YE,15.552727,48.516388
Zambia,ZM,-13.133897,27.849332
Zimbabwe,ZW,-19.015438,29.154857


In [5]:
# find nulls
# null mask
# attributed resource: https://saturncloud.io/blog/python-pandas-selecting-rows-whose-column-value-is-null-none-nan/#:~:text=The%20simplest%20way%20to%20select,rows%20that%20have%20null%20values.
nulls = world_geo_df.isnull().any(axis=1)
null_rows = world_geo_df[nulls]
null_rows

# Namibia's country code is NA, so it may have been treated as a null previously

Unnamed: 0_level_0,country_code,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Namibia,,-22.95764,18.49041
U.S. Minor Outlying Islands,UM,,


In [6]:
# delete US Minor Outlying Islands because they are not true countries
world_geo_df.drop(labels="U.S. Minor Outlying Islands", axis=0, inplace=True)

# check
nulls = world_geo_df.isnull().any(axis=1)
null_rows = world_geo_df[nulls]
null_rows

Unnamed: 0_level_0,country_code,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Namibia,,-22.95764,18.49041


In [7]:
# cast country code as a strong to eliminate issue with NA Namibia
world_geo_df["country_code"] = world_geo_df["country_code"].astype(str)

# check
nulls = world_geo_df.isnull().any(axis=1)
null_rows = world_geo_df[nulls]
display(null_rows)
display(world_geo_df.loc["Namibia"])

# replace nan with NA
world_geo_df.at["Namibia", "country_code"] = "NA"
display(world_geo_df.loc["Namibia"])

Unnamed: 0_level_0,country_code,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


country_code         nan
latitude       -22.95764
longitude       18.49041
Name: Namibia, dtype: object

country_code          NA
latitude       -22.95764
longitude       18.49041
Name: Namibia, dtype: object

In [8]:
# check data types
world_geo_df.dtypes

country_code     object
latitude        float64
longitude       float64
dtype: object

In [9]:
# export cleaned df to csv
world_geo_df.to_csv("world_geo_cleaned_data.csv")