## Create weight matrix to represent proximity between regions


In [1]:
import requests
import numpy as np
import pandas as pd
import pickle

In [2]:
# input: country name -> list of IATA airport codes
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open('country_iata.pkl','rb') as f:
    iata=pickle.load(f)

# regions left out of the analysis; absent names are ignored
exclude=['Liechtenstein', 'San Marino','Andorra','West Bank and Gaza','Burma']
for name in exclude:
    iata.pop(name,None)

In [4]:
# flight hours: merge the 158 per-country pickles into one nested dict z[a][b]

z={}
for idx in range(158):
    path='flight_hrs_4/country_'+str(idx)+'.pkl'
    with open(path,'rb') as f:
        z.update(pickle.load(f))

# drop the same regions that are excluded from the IATA mapping
exclude=['Liechtenstein', 'San Marino','Andorra','West Bank and Gaza','Burma']
for name in exclude:
    z.pop(name,None)

In [5]:
# Load the airport table. Later cells read its iata_code, type,
# scheduled_service, latitude_deg and longitude_deg columns.
airports=pd.read_csv("airports.csv")
# Disabled one-off cleanup (presumably run once to drop rows without an IATA code):
#airports.dropna(subset=['iata_code']).to_csv("airports.csv",index=False)

## correction

### 1. fill in missing data

In [13]:
# missing data

# count, per country, how many entries carry the missing-entry sentinel 10**6
n={country:sum(1 for dest in z[country] if z[country][dest]==10**6) for country in z}

# report only the countries with more than 10 missing entries
for country,n_missing in n.items():
    if n_missing>10:
        print(country,iata[country],n_missing)

Cuba ['CFG'] 157
Monaco ['MCM'] 157


### Replace missing data with data from the nearest neighbor
- find coordinates for each country, proxied by the location of the airport(s) of its capital
- use a neighbor's flight hours as a proxy
        - find the closest neighbor with data available

In [16]:
airports=pd.read_csv("airports.csv")

# coordinates for countries: mean latitude/longitude over the airports listed in iata
# location[country] -> np.array([latitude, longitude]); read by distance() below
location={}
# column-oriented copy of the same data, turned into a DataFrame in the next cell
df={'country':[],'latitude':[],'longitude':[]}

for i in iata:
    loc=np.zeros(2)
    nport=len(iata[i])
    for j in iata[i]:
        cond=(airports.iata_code==j)
        # several rows can share one code: narrow down step by step
        if cond.sum()>1:
            cond = cond & (airports.type!='closed')
        if cond.sum()>1:
            cond = cond & (airports.scheduled_service=='yes')
        if cond.sum()>1:
            # still ambiguous: show the candidates; the last one is used below
            print(i,j,airports[cond].latitude_deg.to_numpy(),airports[cond].longitude_deg.to_numpy())
            print()
        matches=airports[cond]
        if matches.empty:
            # fail with the offending code instead of an opaque positional IndexError
            raise IndexError('no airport row found for '+str(i)+' '+str(j))
        lat=float(matches.latitude_deg.iloc[-1])
        long=float(matches.longitude_deg.iloc[-1])
        loc[0]+=lat/nport
        loc[1]+=long/nport
    # store the coordinates; without this, distance() has nothing to work with
    location[i]=loc
    df['country'].append(i)
    df['latitude'].append(loc[0])
    df['longitude'].append(loc[1])

In [17]:
# turn the column-oriented dict (country / latitude / longitude) into a DataFrame;
# note that this rebinds the name df from a dict to a DataFrame
df=pd.DataFrame(df)

In [19]:
df 

Unnamed: 0,country,latitude,longitude
0,Afghanistan,34.565899,69.212303
1,Albania,41.414700,19.720600
2,Algeria,36.691002,3.215410
3,Angola,-8.858370,13.231200
4,Antigua and Barbuda,17.136700,-61.792702
...,...,...,...
152,Uzbekistan,41.257900,69.281197
153,Venezuela,10.601194,-66.991222
154,Zambia,-15.330800,28.452600
155,Zimbabwe,-17.931801,31.092800


In [13]:
#df.to_csv("country_coordinates.csv")

In [9]:
# init weight matrix: one empty row (dict) per country
countries=location.keys()
weight={c:{} for c in countries}
    


In [10]:
# rank neighbors by distance
def distance(a,b):
    '''Sum of squared coordinate differences between countries a and b.

    a, b: keys of the global `location` mapping.
    No square root is taken; in this cell the value is only used for ordering.
    NOTE(review): this is computed directly on the stored coordinates, so it is
    not a great-circle distance -- confirm that is acceptable for the ranking.
    '''
    delta=location[a]-location[b]
    return (delta**2).sum()

# neighbors[c] -> names of all other countries, closest first
neighbors={}
for c in countries:
    ranked=[(distance(c,c2),c2) for c2 in countries if c2!=c]
    ranked.sort()
    neighbors[c]=[name for _, name in ranked]

In [11]:

neighbors['Cuba']

['Bahamas',
 'Jamaica',
 'Haiti',
 'Belize',
 'Honduras',
 'Dominican Republic',
 'Nicaragua',
 'El Salvador',
 'Guatemala',
 'Costa Rica',
 'Panama',
 'US',
 'Venezuela',
 'Colombia',
 'Mexico',
 'Antigua and Barbuda',
 'Saint Lucia',
 'Saint Vincent and the Grenadines',
 'Trinidad and Tobago',
 'Ecuador',
 'Barbados',
 'Canada',
 'Guyana',
 'Suriname',
 'Peru',
 'Bolivia',
 'Brazil',
 'Paraguay',
 'Chile',
 'Cabo Verde',
 'Argentina',
 'Uruguay',
 'Senegal',
 'Gambia',
 'Mauritania',
 'Guinea-Bissau',
 'Guinea',
 'Iceland',
 'Mali',
 'Portugal',
 'Morocco',
 "Cote d'Ivoire",
 'Spain',
 'Burkina Faso',
 'Ireland',
 'Ghana',
 'Togo',
 'Algeria',
 'United Kingdom',
 'France',
 'Nigeria',
 'Belgium',
 'Sao Tome and Principe',
 'Netherlands',
 'Monaco',
 'Luxembourg',
 'Equatorial Guinea',
 'Switzerland',
 'Tunisia',
 'Gabon',
 'Cameroon',
 'Libya',
 'Italy',
 'Malta',
 'Slovenia',
 'Germany',
 'Angola',
 'Denmark',
 'Czechia',
 'Norway',
 'Congo (Brazzaville)',
 'Croatia',
 'Congo (Kinsh

In [39]:
def get_weight(c,c2,step_penalty=0.5,missing=10**6,verbose=True):
    '''
    Get the weight between c and c2 from the global flight-hours table `z`.

    - if z[c][c2] holds a value below `missing`, return it unchanged
    - otherwise substitute c and/or c2 by their nearest neighbors (global
      `neighbors`, closest first) until an entry with data is found, and
      return that entry plus a penalty that grows with the neighbor rank

    Parameters
    ----------
    c, c2 : keys of `z` / `neighbors`
    step_penalty : penalty added per substituted endpoint (on top of the rank)
    missing : sentinel marking an unavailable entry; absent keys count as missing
    verbose : print the pair and the chosen step whenever a proxy is used

    Returns
    -------
    0 when c == c2, otherwise a number (direct value, or proxy value + step).

    Raises
    ------
    ValueError if neither a direct value nor any proxy is available.
    '''
    if c==c2:
        return 0

    def hours(a,b):
        # an absent row or column is treated exactly like the missing sentinel
        return z.get(a,{}).get(b,missing)

    direct=hours(c,c2)
    if direct<missing:
        return direct

    if verbose:
        print(c,c2)
    near_c=neighbors[c]
    near_c2=neighbors[c2]
    dist=[]   # candidate proxy values
    step=[]   # matching penalties, same order as dist

    # replace ONE endpoint: stop at the first rank where either side has data
    for i in range(max(len(near_c),len(near_c2))):
        if i<len(near_c):
            c_i=near_c[i]
            # replace c by its neighbor
            if c_i!=c2 and hours(c_i,c2)<missing:
                dist.append(hours(c_i,c2))
                step.append(i+step_penalty)
                break
        if i<len(near_c2):
            c_i=near_c2[i]
            # replace c2 by its neighbor
            if c_i!=c and hours(c,c_i)<missing:
                dist.append(hours(c,c_i))
                step.append(i+step_penalty)
                break

    # replace BOTH endpoints: for each neighbor of c keep the closest usable
    # neighbor of c2 (a later i can still win through a smaller j)
    for i,c_i in enumerate(near_c):
        for j,c_j in enumerate(near_c2):
            if c_i!=c_j and hours(c_i,c_j)<missing:
                dist.append(hours(c_i,c_j))
                step.append(i+j+step_penalty*2)
                break

    if not step:
        raise ValueError('no flight-hour data available to proxy '+str(c)+' - '+str(c2))

    # smallest penalty wins; on ties the earliest candidate is kept
    min_step=min(step)
    min_i=step.index(min_step)
    if verbose:
        print('steps',min_step)
    return dist[min_i]+min_step

In [40]:
get_weight('Eswatini','Australia')

27.58

In [41]:
get_weight('Eswatini','Papua New Guinea')

33.25

In [42]:
get_weight('Australia','Papua New Guinea')

7.08

In [43]:
get_weight('Papua New Guinea','Australia')

7.58

In [44]:
# list the weight from every other country to one reference country
# (an empty name is not a key of `neighbors`; the recorded output below
#  corresponds to Papua New Guinea)
c='Papua New Guinea'
for i in neighbors[c]:
    print(i,get_weight(i,c))
    print()

Australia 7.08

Philippines 5.33

Indonesia 17.17

Singapore 6.5

Japan 24.83

Malaysia 19.67

Korea, South 25.33

Thailand 25.17

China 19.58

Bangladesh 22.0

Sri Lanka 20.58

Mongolia 26.17

Nepal 29.25

Maldives 27.67

India 27.67

Pakistan 36.83

Afghanistan 33.17

Kyrgyzstan 29.17

Seychelles 25.67

Tajikistan 35.25

Uzbekistan 28.83

Oman 29.83

Kazakhstan 30.67

United Arab Emirates 29.0

Madagascar 39.92

Qatar 23.83

Somalia 29.42

Bahrain 32.92

Comoros 36.83

Iran 31.08

Djibouti 26.17

Saudi Arabia 34.67

Kuwait 34.75

Azerbaijan 24.17

Ethiopia 44.67

Kenya 34.25

Iraq 30.0

Malawi 44.17

Armenia 25.58

Georgia 33.42

Uganda 28.58

Mozambique 32.92

Zimbabwe 32.17

South Sudan 30.58

Eswatini 33.25

Rwanda 33.67

Sudan 25.17

Jordan 25.33

Syria 47.33

Zambia 29.25

Israel 24.33

Lebanon 30.75

South Africa 31.5

Lesotho 33.08

Cyprus 30.33

Botswana 32.67

Egypt 47.33

Turkey 33.83

Russia 30.08

Central African Republic 41.5

Namibia 38.17

Moldova 31.67

Ukraine 32.92


In [61]:
# make weight matrix for all regions:
# row c receives one entry per other country, in nearest-first order
for c in iata:
    weight[c].update((other,get_weight(c,other)) for other in neighbors[c])

Afghanistan Monaco
steps 0.5
Afghanistan Cuba
steps 0.5
Albania Monaco
steps 0.5
Albania Cuba
steps 0.5
Algeria Monaco
steps 0.5
Algeria Cuba
steps 0.5
Angola Monaco
steps 0.5
Angola Belarus
steps 0.5
Angola Cuba
steps 0.5
Antigua and Barbuda Cuba
steps 0.5
Antigua and Barbuda Monaco
steps 0.5
Argentina Cuba
steps 0.5
Argentina Monaco
steps 0.5
Armenia Monaco
steps 0.5
Armenia Barbados
steps 0.5
Armenia Cuba
steps 0.5
Australia Syria
steps 0.5
Australia Monaco
steps 0.5
Australia Barbados
steps 0.5
Australia Cuba
steps 0.5
Austria Monaco
steps 0.5
Austria Barbados
steps 0.5
Austria Cuba
steps 0.5
Azerbaijan Belarus
steps 0.5
Azerbaijan Monaco
steps 0.5
Azerbaijan Cuba
steps 0.5
Bahamas Cuba
steps 1.5
Bahamas Monaco
steps 0.5
Bahamas Azerbaijan
steps 0.5
Bahrain Monaco
steps 0.5
Bahrain Cuba
steps 0.5
Bangladesh Monaco
steps 0.5
Bangladesh Barbados
steps 0.5
Bangladesh Cuba
steps 0.5
Barbados Cuba
steps 0.5
Barbados Monaco
steps 0.5
Barbados Bangladesh
steps 0.5
Belarus Monaco
steps 0.5

steps 0.5
Romania Cuba
steps 0.5
Russia Monaco
steps 0.5
Russia Cuba
steps 0.5
Rwanda Bahrain
steps 0.5
Rwanda Monaco
steps 0.5
Rwanda Cuba
steps 0.5
Saint Lucia Cuba
steps 0.5
Saint Lucia Monaco
steps 0.5
Saint Vincent and the Grenadines Cuba
steps 0.5
Saint Vincent and the Grenadines Monaco
steps 0.5
Sao Tome and Principe Monaco
steps 0.5
Sao Tome and Principe Bahrain
steps 0.5
Sao Tome and Principe Cuba
steps 0.5
Saudi Arabia Monaco
steps 0.5
Saudi Arabia Cuba
steps 0.5
Senegal Monaco
steps 0.5
Senegal Cuba
steps 0.5
Serbia Monaco
steps 0.5
Serbia Bahrain
steps 0.5
Serbia Cuba
steps 0.5
Seychelles Monaco
steps 0.5
Seychelles Cuba
steps 0.5
Singapore Monaco
steps 0.5
Singapore Cuba
steps 0.5
Slovakia Monaco
steps 0.5
Slovakia Barbados
steps 0.5
Slovakia Cuba
steps 0.5
Slovenia Monaco
steps 0.5
Slovenia Barbados
steps 0.5
Slovenia Cuba
steps 0.5
Somalia Monaco
steps 0.5
Somalia Barbados
steps 0.5
Somalia Cuba
steps 0.5
South Africa Monaco
steps 0.5
South Africa Cuba
steps 0.5
South Su

In [8]:
import copy

def normalize_weight(weight):
    ''' Invert and row-normalize a nested weight dict.

    Parameters
    ----------
    weight : dict of dicts, weight[i][j] is a number; it is not modified.

    Returns
    -------
    A new dict of dicts where entry [i][j] is (1/weight[i][j]) divided by the
    sum of the inverses in row i, so each non-empty row sums to 1.

    A zero entry weight[i][j] is reported with print(i,j) and replaced by the
    raw reverse entry weight[j][i] before inverting.

    Raises
    ------
    ZeroDivisionError if an entry and its reverse entry are both zero.
    KeyError if a zero entry has no reverse entry.
    '''
    weight_I={}
    for i in weight:
        inverse={}
        for j in weight[i]:
            w=weight[i][j]
            if w==0:
                print(i,j)
                # read the fallback from the untouched input: row j of the
                # result may already be inverted and normalized
                w=weight[j][i]
            inverse[j]=1/w
        norm=sum(inverse.values())
        weight_I[i]={j:inv/norm for j,inv in inverse.items()}
    return weight_I


In [None]:
# invert and row-normalize the country-level weight matrix
weight_I=normalize_weight(weight)

In [86]:
#weight_I['Congo (Brazzaville)']

In [85]:
# persist the normalized country-level weights
with open('weight_I_norm_apr29.pkl','wb') as f:
    pickle.dump(weight_I,f)

## US domestic weights

In [2]:
# Pairwise state distances; later cells read the 'from', 'to' and 'distance' columns.
# NOTE(review): this rebinds `distance`, which is also the name of the helper
# function defined in the neighbor-ranking cell; that function is no longer
# callable once this cell has run.
distance=pd.read_csv("distance_us_states.csv")

In [3]:
# unique values of the 'from' column
# NOTE(review): the following cell reassigns `states` from the 'to' column, so this value is not used
states=list(distance['from'].unique())

In [4]:
# unique values of the 'to' column plus 'Wyoming', each prefixed with 'US.'
states=['US.'+name for name in list(distance['to'].unique())+['Wyoming']]

In [5]:
# symmetric state-to-state distance table, keyed by the 'US.'-prefixed names
weight_D=dict((s,{}) for s in states)

ns=len(states)
for i in range(1,ns):  # the first state has no earlier partner
    o=states[i]
    # o[3:] / d[3:] strip the 'US.' prefix to match the CSV names
    rows_from_o=distance[distance['from']==o[3:]]
    for d in states[:i]:
        pair=rows_from_o[rows_from_o['to']==d[3:]]
        # .item() requires exactly one matching row
        weight_D[d][o]=pair.distance.item()
        weight_D[o][d]=weight_D[d][o]
        
    

In [9]:
# invert and row-normalize the state distances
# NOTE(review): this rebinds weight_D to the normalized result, so re-running
# only this cell would normalize already-normalized values
weight_D=normalize_weight(weight_D)

In [10]:
weight_D['US.Maryland']

{'US.Alabama': 0.009401699618191513,
 'US.Alaska': 0.0023461577603785776,
 'US.Arizona': 0.0033229102627923405,
 'US.Arkansas': 0.007298282755628265,
 'US.California': 0.00278331432327533,
 'US.Colorado': 0.00443768066507311,
 'US.Connecticut': 0.023837781000467618,
 'US.Delaware': 0.11583034729384235,
 'US.District of Columbia': 0.2602699078513038,
 'US.Florida': 0.009093764304338326,
 'US.Georgia': 0.01181963004716242,
 'US.Hawaii': 0.0013784337617210728,
 'US.Idaho': 0.0032358070577400925,
 'US.Illinois': 0.009538103846779293,
 'US.Indiana': 0.01303113621344056,
 'US.Iowa': 0.007316428117815291,
 'US.Kansas': 0.006486647994841765,
 'US.Kentucky': 0.014790317250224297,
 'US.Louisiana': 0.006573524130535371,
 'US.Maine': 0.013162678946440937,
 'US.Massachusetts': 0.017986133838203282,
 'US.Michigan': 0.013549150599220768,
 'US.Minnesota': 0.007099353070162776,
 'US.Mississippi': 0.007487103535400506,
 'US.Missouri': 0.007959782782702195,
 'US.Montana': 0.003602286013706425,
 'US.Nebra

In [11]:
# persist the normalized US state weights
with open('weight_D_norm_apr29.pkl','wb') as f:
    pickle.dump(weight_D,f)