# Vector Space Model

In [4]:
from pandas_datareader import wb
import numpy as np
import pandas as pd
# Avoid a fixed character width when printing wide frames.
# NOTE(review): 0 appears to make pandas fall back to the detected terminal
# width — confirm against the pandas options documentation.
pd.options.display.width = 0

Download Data from World Bank

In [9]:
# World Bank indicator codes used as the components of each country's vector.
# Every series is reported in current US$.
names = [
    # Exports of goods and services
    "NE.EXP.GNFS.CD",
    # Imports of goods and services
    "NE.IMP.GNFS.CD",
    # Agriculture, forestry, and fishing, value added
    "NV.AGR.TOTL.CD",
    # GDP
    "NY.GDP.MKTP.CD",
    # External balance on goods and services
    "NE.RSB.GNFS.CD",
]

# Fetch a single year (2010) for every available entity, then move the index
# into ordinary columns so that "country" can be filtered on later.
df = wb.download(
    country="all",
    indicator=names,
    start=2010,
    end=2010,
).reset_index()

Remove aggregates and keep only countries with no missing data

In [8]:
# Country metadata; entries whose region is "Aggregates" are groupings
# rather than individual countries.
countries = wb.get_countries()

# Names of the real (non-aggregate) countries.
non_aggregates = countries.loc[countries["region"] != "Aggregates", "name"]

# Keep only real countries, then drop every row that has any missing value.
df_nonagg = df.loc[df["country"].isin(non_aggregates)].dropna()

Extract Vector for each Country

In [10]:
# Map each country name to its indicator values as a float vector whose
# components follow the order of `names`.
#
# The values are pulled out as one float matrix instead of row by row with
# iterrows(): iterrows() returns each row as a Series, and since a row mixes
# text columns with numeric ones the resulting vectors would be object-typed,
# which makes later NumPy arithmetic slow and fragile.
indicator_matrix = df_nonagg[names].to_numpy(dtype=float)

# If a country name were repeated, the last row wins (same as assigning into
# the dict inside a loop).
vectors = dict(zip(df_nonagg["country"], indicator_matrix))

Get the Euclidean and Cosine distances

In [11]:
# Two independent, initially empty mappings from country name to distance:
# one for the Euclidean metric and one for the cosine metric.
euclid, cosine = {}, {}

In [14]:
# Distance from the target country to every country (including itself) under
# two metrics, collected into one DataFrame indexed by country name.
target = "Australia"
if target not in vectors:
    # The target may be absent, e.g. if it was dropped for missing data.
    raise KeyError(f"no vector available for target country {target!r}")

# Start from empty mappings so that re-running this cell (for instance with a
# different target or a changed set of vectors) never keeps stale entries.
euclid = {}
cosine = {}

# The target vector and its norm do not change inside the loop.
target_vec = np.asarray(vectors[target], dtype=float)
target_norm = np.linalg.norm(target_vec)

for country, values in vectors.items():
    other_vec = np.asarray(values, dtype=float)

    # Euclidean distance: length of the difference vector.
    euclid[country] = np.linalg.norm(target_vec - other_vec)

    # Cosine similarity; rounding can push it marginally outside [-1, 1],
    # which would otherwise produce a tiny negative "distance".
    similarity = (target_vec @ other_vec) / (target_norm * np.linalg.norm(other_vec))
    similarity = np.clip(similarity, -1.0, 1.0)

    # Cosine distance = 1 - cosine similarity.
    cosine[country] = 1 - similarity

df_distance = pd.DataFrame({"euclid": euclid, "cos": cosine})

Print Results

In [15]:
# Report the first five rows (the default for head()) of the distance table
# under each ordering, smallest distance first. Each heading and its table are
# emitted on separate lines, with a blank line between the two sections.
print(
    "Closest by Euclidean distance:",
    df_distance.sort_values(by="euclid").head(),
    sep="\n",
)
print()
print(
    "Closest by Cosine distance:",
    df_distance.sort_values(by="cos").head(),
    sep="\n",
)

Closest by Euclidean distance:
                 euclid           cos
Australia  0.000000e+00 -2.220446e-16
Mexico     1.545268e+11  8.030430e-03
Spain      3.416636e+11  3.105920e-03
Turkey     3.812584e+11  3.569451e-03
Indonesia  4.095371e+11  7.389964e-03

Closest by Cosine distance:
                    euclid           cos
Australia     0.000000e+00 -2.220446e-16
Colombia      8.993897e+11  1.700920e-03
South Africa  7.525828e+11  2.284180e-03
Cuba          1.127318e+12  2.395475e-03
Italy         1.089453e+12  2.734843e-03


Detail Metrics

In [16]:
# Show the raw indicator rows for the three countries being compared, so the
# distances can be checked against the underlying figures.
print()
print("Detail metrics:")
print(
    df_nonagg.loc[
        df_nonagg["country"].isin(["Mexico", "Colombia", "Australia"])
    ]
)


Detail metrics:
       country  year  NE.EXP.GNFS.CD  NE.IMP.GNFS.CD  NV.AGR.TOTL.CD  \
59   Australia  2010    2.274165e+11    2.380041e+11    2.531842e+10   
91    Colombia  2010    4.682683e+10    5.136288e+10    1.812470e+10   
176     Mexico  2010    3.141423e+11    3.285812e+11    3.405226e+10   

     NY.GDP.MKTP.CD  NE.RSB.GNFS.CD  
59     1.147589e+12   -1.058751e+10  
91     2.865631e+11   -4.536047e+09  
176    1.057801e+12   -1.443887e+10  
