### Load and Inspect "Countries-Coordinates" CSV format Data


In [23]:
# Read semi-colon separated file with geolocation data
import pandas as pd

df_geo = pd.read_csv("raw-countries-coordinates.csv", sep=";")

# print the shape of the Geo data
print(df_geo.shape)

# Show first 5 rows
df_geo.head()

(255, 96)


Unnamed: 0,Geo Point,Geo Shape,featurecla,scalerank,LABELRANK,SOVEREIGNT,SOV_A3,ADM0_DIF,LEVEL,TYPE,...,NAME_JA,NAME_KO,NAME_NL,NAME_PL,NAME_PT,NAME_RU,NAME_SV,NAME_TR,NAME_VI,NAME_ZH
0,"53.5318804783, 28.033566395","{""coordinates"": [[[23.60623824100014, 51.51739...",Admin-0 country,0,4,Belarus,BLR,0,2,Sovereign country,...,ベラルーシ,벨라루스,Wit-Rusland,Białoruś,Bielorrússia,Белоруссия,Vitryssland,Beyaz Rusya,Belarus,白罗斯
1,"51.1063634863, 10.3814938434","{""coordinates"": [[[[13.815724731000072, 48.766...",Admin-0 country,0,2,Germany,DEU,0,2,Sovereign country,...,ドイツ,독일,Duitsland,Niemcy,Alemanha,Германия,Tyskland,Almanya,Đức,德国
2,"27.0302779758, 18.009605121","{""coordinates"": [[[11.505111803000144, 33.1812...",Admin-0 country,0,3,Libya,LBY,0,2,Sovereign country,...,リビア,리비아,Libië,Libia,Líbia,Ливия,Libyen,Libya,Libya,利比亚
3,"11.7501558755, 42.5664536523","{""coordinates"": [[[43.240733269000074, 11.4878...",Admin-0 country,0,5,Djibouti,DJI,0,2,Sovereign country,...,ジブチ,지부티,Djibouti,Dżibuti,Djibouti,Джибути,Djibouti,Cibuti,Djibouti,吉布提
4,"24.1223270695, 44.5368636774","{""coordinates"": [[[[50.80787194100008, 24.7466...",Admin-0 country,0,2,Saudi Arabia,SAU,0,2,Sovereign country,...,サウジアラビア,사우디아라비아,Saoedi-Arabië,Arabia Saudyjska,Arábia Saudita,Саудовская Аравия,Saudiarabien,Suudi Arabistan,Ả Rập Saudi,沙特阿拉伯


### Data Cleaning


In [24]:
# Rename columns to match the other dataframes
df_geo.rename(
    columns={
        "SOVEREIGNT": "Country",
        "Geo Point": "coordinates",
        "SOV_A3": "Country-code",
        "CONTINENT": "continent",
    },
    inplace=True,
)

# Show first 5 rows
df_geo.head()

Unnamed: 0,coordinates,Geo Shape,featurecla,scalerank,LABELRANK,Country,Country-code,ADM0_DIF,LEVEL,TYPE,...,NAME_JA,NAME_KO,NAME_NL,NAME_PL,NAME_PT,NAME_RU,NAME_SV,NAME_TR,NAME_VI,NAME_ZH
0,"53.5318804783, 28.033566395","{""coordinates"": [[[23.60623824100014, 51.51739...",Admin-0 country,0,4,Belarus,BLR,0,2,Sovereign country,...,ベラルーシ,벨라루스,Wit-Rusland,Białoruś,Bielorrússia,Белоруссия,Vitryssland,Beyaz Rusya,Belarus,白罗斯
1,"51.1063634863, 10.3814938434","{""coordinates"": [[[[13.815724731000072, 48.766...",Admin-0 country,0,2,Germany,DEU,0,2,Sovereign country,...,ドイツ,독일,Duitsland,Niemcy,Alemanha,Германия,Tyskland,Almanya,Đức,德国
2,"27.0302779758, 18.009605121","{""coordinates"": [[[11.505111803000144, 33.1812...",Admin-0 country,0,3,Libya,LBY,0,2,Sovereign country,...,リビア,리비아,Libië,Libia,Líbia,Ливия,Libyen,Libya,Libya,利比亚
3,"11.7501558755, 42.5664536523","{""coordinates"": [[[43.240733269000074, 11.4878...",Admin-0 country,0,5,Djibouti,DJI,0,2,Sovereign country,...,ジブチ,지부티,Djibouti,Dżibuti,Djibouti,Джибути,Djibouti,Cibuti,Djibouti,吉布提
4,"24.1223270695, 44.5368636774","{""coordinates"": [[[[50.80787194100008, 24.7466...",Admin-0 country,0,2,Saudi Arabia,SAU,0,2,Sovereign country,...,サウジアラビア,사우디아라비아,Saoedi-Arabië,Arabia Saudyjska,Arábia Saudita,Саудовская Аравия,Saudiarabien,Suudi Arabistan,Ả Rập Saudi,沙特阿拉伯


In [25]:
# Drop all columns except Country, coordinates and continent

# Create a list of columns
columns_list = ["Country", "coordinates", "continent"]

# Drop all columns except the ones in the list
df_geo = df_geo[columns_list]

# Show first 5 rows
df_geo.head()

Unnamed: 0,Country,coordinates,continent
0,Belarus,"53.5318804783, 28.033566395",Europe
1,Germany,"51.1063634863, 10.3814938434",Europe
2,Libya,"27.0302779758, 18.009605121",Africa
3,Djibouti,"11.7501558755, 42.5664536523",Africa
4,Saudi Arabia,"24.1223270695, 44.5368636774",Asia


In [26]:
# Filter only European countries
df_geo_eur = df_geo[df_geo["continent"] == "Europe"]

In [27]:
# checking for duplicates in column Country
df_geo_eur[df_geo_eur.duplicated(subset=["Country"])]

Unnamed: 0,Country,coordinates,continent
51,United Kingdom,"49.4843729229, -2.53048002329",Europe
147,Denmark,"55.9653774566, 10.0513897028",Europe
157,Finland,"64.4953589823, 26.2707950346",Europe
164,United Kingdom,"36.1294636704, -5.34670032371",Europe
214,United Kingdom,"54.2291019004, -4.53369508308",Europe
216,United Kingdom,"54.1470481882, -2.88695151937",Europe


In [28]:
# Drop duplicates in column Country
df_geo_eur.drop_duplicates(subset=["Country"], inplace=True)

# checking for duplicates in column Country
df_geo_eur[df_geo_eur.duplicated(subset=["Country"])]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_geo_eur.drop_duplicates(subset=["Country"], inplace=True)


Unnamed: 0,Country,coordinates,continent


In [29]:
# Save dataframe to csv file
df_geo_eur.to_csv("clean-coordinates-data.csv", index=False)

In [30]:
df_geo_eur["Country"].__len__()

45