# Initial clean-up of world data set


Dataset from: https://www.kaggle.com/datasets/nelgiriyewithana/countries-of-the-world-2023, Author: NIDULA ELGIRIYEWITHANA, accessed 11/7/2023

In [1]:
# Dependencies
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Path variable
# The path gets its own name: previously the same name held first the Path and
# then the DataFrame, so re-running the read step handed a DataFrame to
# pd.read_csv and failed.
world_data_2023_path = Path('1.world-data-2023_original_raw.csv')

# Read the CSV in pandas
world_data_2023 = pd.read_csv(world_data_2023_path)

In [4]:
# Reorder columns for ease of analysis, and remove columns that are not relevant:
# Armed Forces, Abbreviation, Calling Code, Currency, Official Language,
# Physicians per thousand, Minimum wage.
# The list order below is the column order of the resulting frame.
kept_columns = [
    "Country",
    "Capital/Major City",
    "Density(P/Km2)",
    "Population",
    "Urban_population",
    "Land Area(Km2)",
    "Agricultural Land( %)",
    "Forested Area (%)",
    "Co2-Emissions",
    "GDP",
    "Life expectancy",
    "Birth Rate",
    "Maternal mortality ratio",
    "Infant mortality",
    "Latitude",
    "Longitude",
]

# Take an independent copy so later column assignments act on this frame only.
organised_world_data = world_data_2023[kept_columns].copy()

In [5]:
# Strip the unit symbols ("%" and "$") from the text columns so they can be
# cast to numeric dtypes in a later cell.
# regex=False matters for "$": as a regular expression it is the end-of-string
# anchor, so pandas >= 2.0 (which no longer treats single-character patterns as
# literals) would remove nothing and the later int64 cast of GDP would fail.
for percent_column in ("Forested Area (%)", "Agricultural Land( %)"):
    organised_world_data[percent_column] = (
        organised_world_data[percent_column].str.replace("%", "", regex=False)
    )
organised_world_data["GDP"] = organised_world_data["GDP"].str.replace("$", "", regex=False)

# Preview the first rows
organised_world_data.head()


Unnamed: 0,Country,Capital/Major City,Density(P/Km2),Population,Urban_population,Land Area(Km2),Agricultural Land( %),Forested Area (%),Co2-Emissions,GDP,Life expectancy,Birth Rate,Maternal mortality ratio,Infant mortality,Latitude,Longitude
0,Afghanistan,Kabul,60,38041754,9797273,652230,58.1,2.1,8672,19101353833,64.5,32.49,638.0,47.9,33.93911,67.709953
1,Albania,Tirana,105,2854191,1747593,28748,43.1,28.1,4536,15278077447,78.5,11.78,15.0,7.8,41.153332,20.168331
2,Algeria,Algiers,18,43053054,31510100,2381741,17.4,0.8,150006,169988236398,76.7,24.28,112.0,20.1,28.033886,1.659626
3,Andorra,Andorra la Vella,164,77142,67873,468,40.0,34.0,469,3154057987,,7.2,,2.7,42.506285,1.521801
4,Angola,Luanda,26,31825295,21061025,1246700,47.5,46.3,34693,94635415870,60.8,40.73,241.0,51.6,-11.202692,17.873887


In [6]:
# Keep only the rows that have a value in every retained column
organised_world_data = organised_world_data.dropna(axis="index", how="any")

# Preview keyed by country. The result is only displayed, not assigned, so
# "Country" stays a regular column of organised_world_data.
organised_world_data.set_index("Country")

Unnamed: 0_level_0,Capital/Major City,Density(P/Km2),Population,Urban_population,Land Area(Km2),Agricultural Land( %),Forested Area (%),Co2-Emissions,GDP,Life expectancy,Birth Rate,Maternal mortality ratio,Infant mortality,Latitude,Longitude
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Afghanistan,Kabul,60,38041754,9797273,652230,58.10,2.10,8672,19101353833,64.5,32.49,638.0,47.9,33.939110,67.709953
Albania,Tirana,105,2854191,1747593,28748,43.10,28.10,4536,15278077447,78.5,11.78,15.0,7.8,41.153332,20.168331
Algeria,Algiers,18,43053054,31510100,2381741,17.40,0.80,150006,169988236398,76.7,24.28,112.0,20.1,28.033886,1.659626
Angola,Luanda,26,31825295,21061025,1246700,47.50,46.30,34693,94635415870,60.8,40.73,241.0,51.6,-11.202692,17.873887
Antigua and Barbuda,"St. John's, Saint John",223,97118,23800,443,20.50,22.30,557,1727759259,76.9,15.33,42.0,5.0,17.060816,-61.796428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela,Caracas,32,28515829,25162368,912050,24.50,52.70,164175,482359318768,72.1,17.88,125.0,21.4,6.423750,-66.589730
Vietnam,Hanoi,314,96462106,35332140,331210,39.30,48.10,192668,261921244843,75.3,16.75,43.0,16.5,14.058324,108.277199
Yemen,Sanaa,56,29161922,10869523,527968,44.60,1.00,10609,26914402224,66.1,30.45,164.0,42.9,15.552727,48.516388
Zambia,Lusaka,25,17861030,7871713,752618,32.10,65.20,5141,23064722446,63.5,36.19,213.0,40.4,-13.133897,27.849332


In [7]:
# Cast the cleaned text columns to numeric dtypes.

# These columns have "," removed before the int64 cast.
# astype(str) comes first so the cell can be re-run: once a column is already
# int64 the .str accessor would otherwise raise AttributeError.
comma_separated_int_columns = [
    "GDP",
    "Population",
    "Urban_population",
    "Land Area(Km2)",
    "Co2-Emissions",
]
for column in comma_separated_int_columns:
    organised_world_data[column] = (
        organised_world_data[column]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype("int64")
    )

# The percentage columns already had "%" stripped and only need the float cast.
for column in ["Agricultural Land( %)", "Forested Area (%)"]:
    organised_world_data[column] = organised_world_data[column].astype("float")

# NOTE(review): "Density(P/Km2)" is not converted here and keeps its object
# dtype — confirm whether that is intended before relying on it numerically.

In [8]:
# Display the dtype of every column to check the result of the casts in the previous cell
organised_world_data.dtypes

Country                      object
Capital/Major City           object
Density(P/Km2)               object
Population                    int64
Urban_population              int64
Land Area(Km2)                int64
Agricultural Land( %)       float64
Forested Area (%)           float64
Co2-Emissions                 int64
GDP                           int64
Life expectancy             float64
Birth Rate                  float64
Maternal mortality ratio    float64
Infant mortality            float64
Latitude                    float64
Longitude                   float64
dtype: object

In [9]:
# Write the cleaned table to disk with a header row; the row index is not written.
organised_world_data.to_csv(
    "organised_world_data.csv",
    index=False,
    header=True,
)