### Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# 1. Dataset

In [2]:
df = pd.read_csv("world-data-2023.csv")
df.head()

Unnamed: 0,Country,Density\n(P/Km2),Abbreviation,Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Capital/Major City,Co2-Emissions,...,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
0,Afghanistan,60,AF,58.10%,652230,323000.0,32.49,93.0,Kabul,8672,...,78.40%,0.28,38041754,48.90%,9.30%,71.40%,11.12%,9797273,33.93911,67.709953
1,Albania,105,AL,43.10%,28748,9000.0,11.78,355.0,Tirana,4536,...,56.90%,1.2,2854191,55.70%,18.60%,36.60%,12.33%,1747593,41.153332,20.168331
2,Algeria,18,DZ,17.40%,2381741,317000.0,24.28,213.0,Algiers,150006,...,28.10%,1.72,43053054,41.20%,37.20%,66.10%,11.70%,31510100,28.033886,1.659626
3,Andorra,164,AD,40.00%,468,,7.2,376.0,Andorra la Vella,469,...,36.40%,3.33,77142,,,,,67873,42.506285,1.521801
4,Angola,26,AO,47.50%,1246700,117000.0,40.73,244.0,Luanda,34693,...,33.40%,0.21,31825295,77.50%,9.20%,49.10%,6.89%,21061025,-11.202692,17.873887


In [3]:
df.shape

(195, 35)

# 2.EDA

In [5]:
#transform column names

col_name = df.columns

def string_operation(string):
    string = string.replace('%','')
    string = string.replace('(','')
    string = string.replace(')','')
    string = string.strip()
    string = string.replace(' ','_')
    return string

col_name = list(map(string_operation,col_name))
df.columns = col_name

In [6]:
#transform column type

object_cols = df.select_dtypes('object').columns.drop(['Country','Capital/Major_City','Largest_city',
                                                      'Abbreviation','Official_language','Currency-Code'])

df[object_cols] = df[object_cols].applymap(lambda x : float(str(x).replace(',','').replace('$','').replace('%','')))

In [7]:
df['Country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'The Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Ivory Coast',
       'Cape Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Republic of the Congo', 'Costa Rica', 'Croatia',
       'Cuba', 'Cyprus', 'Czech Republic',
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'The Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada',
       'Guatemal

In [8]:
df.groupby(df['Country']).get_group('S�����������') #we have gibberish here

Unnamed: 0,Country,Density\nP/Km2,Abbreviation,Agricultural_Land,Land_AreaKm2,Armed_Forces_size,Birth_Rate,Calling_Code,Capital/Major_City,Co2-Emissions,...,Out_of_pocket_health_expenditure,Physicians_per_thousand,Population,Population:_Labor_force_participation,Tax_revenue,Total_tax_rate,Unemployment_rate,Urban_population,Latitude,Longitude
150,S�����������,228.0,ST,50.7,964.0,1000.0,31.54,239.0,S����,121.0,...,11.7,0.05,215056.0,57.8,14.6,37.0,13.37,158277.0,,


In [11]:
df = df[df['Country'] != 'S�����������']

In [12]:
df.isnull().sum()

Country                                   0
Density\nP/Km2                            0
Abbreviation                              7
Agricultural_Land                         7
Land_AreaKm2                              1
Armed_Forces_size                        24
Birth_Rate                                6
Calling_Code                              1
Capital/Major_City                        3
Co2-Emissions                             7
CPI                                      17
CPI_Change                               16
Currency-Code                            15
Fertility_Rate                            7
Forested_Area                             7
Gasoline_Price                           19
GDP                                       2
Gross_primary_education_enrollment        7
Gross_tertiary_education_enrollment      12
Infant_mortality                          6
Largest_city                              6
Life_expectancy                           8
Maternal_mortality_ratio        

In [18]:
#lets check missing Abbreviation for countries

rows = df[df['Abbreviation'].isnull()]
contries = list(rows['Country'])
abr = ['ROC', 'ES', 'VC', 'ROI', 'NA', 'NM', 'PNA']
ab_dict = dict(zip(contries, abr))
ab_dict

{'Republic of the Congo': 'ROC',
 'Eswatini': 'ES',
 'Vatican City': 'VC',
 'Republic of Ireland': 'ROI',
 'Namibia': 'NA',
 'North Macedonia': 'NM',
 'Palestinian National Authority': 'PNA'}

In [23]:
def get_abr(country_name):
    return ab_dict.get(country_name)

mask = df['Abbreviation'].isnull()

df.loc[mask,'Abbreviation'] = df.loc[mask,'Country'].apply(get_abr)

In [25]:
df['Abbreviation'].isnull().sum()

0

In [28]:
#Countries with missing Agricultural Land

df[df['Agricultural_Land'].isnull()].Country

56                           Eswatini
73                       Vatican City
113                            Monaco
120                             Nauru
128                   North Macedonia
133    Palestinian National Authority
163                       South Sudan
Name: Country, dtype: object

In [29]:
#find the missing values on google
agri_land = {'Eswatini':71.0, 'Vatican City':0.0, 'Monaco':0.0, 'Nauru':50.2 , 'North Macedonia':20.0,
             'Palestinian National Authority':0.0, 'South Sudan':44.7 }

mask= df['Agricultural_Land'].isnull()

def get_al(country_name):
    return agri_land.get(country_name)

df.loc[mask, 'Agricultural_Land'] = df.loc[mask, 'Country'].apply(get_al)

df['Agricultural_Land'].isnull().sum()

0

In [31]:
# Missung Land Ares
df[df['Land_AreaKm2'].isnull()].Country

133    Palestinian National Authority
Name: Country, dtype: object

In [32]:
df['Land_AreaKm2'].fillna(6026,inplace = True)
df['Land_AreaKm2'].isnull().sum()

0