In [1]:
import pandas as pd
import numpy as np
import pycountry
from scipy.optimize import least_squares
import random
import statsmodels
from scipy.optimize import minimize
from scipy.optimize import fsolve
#from pandas.core import datetools
import statsmodels.api as sm
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from collections import defaultdict
from math import pi, e
import model_functions as mf

  from pandas.core import datetools


In [46]:
resident_foreigners_norm = pd.read_table("/home/sara/Documents/Immigration/Shared_models/Data/resident_foreigners_norm.csv", index_col=0)
# Untill 2011 - in 2011 change
years = list(range(2004, 2008))

In [47]:
zones_data = pd.read_table("/home/sara/Documents/Immigration/Shared_statistics/Data_final/territori.csv")
zones_data = zones_data.replace(['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento'], ['Bolzano / Bozen', 'Trento'])

In [48]:
zones = sorted(list(set(zones_data.Zona)))

In [49]:
data = resident_foreigners_norm.copy()
data = data.groupby(["Province", "Country", "Year"], as_index=False).sum()

In [6]:
data.head()

Unnamed: 0,Province,Country,Year,Value
0,Agrigento,AFG,2008,2
1,Agrigento,AFG,2009,12
2,Agrigento,AFG,2010,51
3,Agrigento,AFG,2011,35
4,Agrigento,AFG,2012,21


In [7]:
# Distance matrix related to the interested locations (regions capitals)
temp_W = pd.read_table("/home/sara/Documents/Immigration/Shared_models/Data/Zones_distances_matrix_mean.csv", sep = "\t", index_col=0)

In [8]:
temp_W.head()

Unnamed: 0,Centro,Isole,Nord-est,Nord-ovest,Sud
Centro,0.0,722899.184542,454026.585438,517383.500724,463645.585679
Isole,722899.184542,0.0,785789.965069,956897.591406,729986.205378
Nord-est,454026.585438,785789.965069,0.0,371960.371519,823860.587237
Nord-ovest,517383.500724,956897.591406,371960.371519,0.0,913786.897237
Sud,463645.585679,729986.205378,823860.587237,913786.897237,0.0


In [9]:
temp_W = (1/temp_W)**2
# w_ij = 0 if i=j
temp_W[temp_W == np.inf] = 0
# row standardization: every row sum up to 1
temp_W = temp_W.div(temp_W.sum(axis=1), axis=0)

Given a spatial weights matrix W is a nonnegative matrix with $w_{ij} >= 0$ and $w_{ii} = 0$. W uses to be symmetric.

The row-normalized W is used for ease of interpretation. It is defined as $\sum_{j=1}^n w_{ij} = 1, \forall i = 1, \dots, n$. This ensure that all weights are between 0 and 1.

Each rownormalized weight, $wij$, can be interpreted as the fraction of all spatial influence on unit $i$ attributable to unit $j$.

In [22]:
temp_W.head()

Unnamed: 0,Centro,Isole,Nord-est,Nord-ovest,Sud
Centro,0.0,0.12629,0.320155,0.246546,0.307009
Isole,0.294314,0.0,0.249088,0.167971,0.288627
Nord-est,0.319744,0.106746,0.0,0.476401,0.097109
Nord-ovest,0.281872,0.082404,0.545361,0.0,0.090362
Sud,0.505673,0.203992,0.160153,0.130182,0.0


## Step I

In [11]:
def stepI(param, data_, W, times, ref_I, territories):
    beta = param[0]
    a = param[1:-1]
    ro = param[-1]
    
    T = len(times)
    I = len(territories)
    
    identity_I = np.identity(I)
    identity_I_1 = np.identity(I-1)
    neg1 = np.negative(np.ones((I-1, 1)))
    # Not-squared matrix
    Q = np.append(identity_I_1, neg1, axis=1)
    # All the I-1 locations (all but the reference one)
    terr_not_ref = [i for i in territories if i != ref_I]
    
    # Modify W s.t. the "ref_I" location is the last one (so that Q is well defined)
    W = W.reindex(index = terr_not_ref+[ref_I], columns = terr_not_ref+[ref_I])
     
    # Time-invariant quantity
    L = Q.dot(np.linalg.inv(identity_I-ro*W)).dot(np.linalg.inv(identity_I-ro*W.T)).dot(Q.T)
    
    log_lik = T*np.log(np.linalg.det(L))
    
    for t in times[1:]:
        y = data_.loc[(t, terr_not_ref), "Value"].values/data_.loc[t].loc[ref_I].values
        x = data_.loc[(t-1, terr_not_ref), "Value"].values/data_.loc[t-1].loc[ref_I].values
        #print(y.shape, x.shape, len(a))
        main_term = np.log(y) - beta*np.log(x) - a
        
        log_lik += main_term.T.dot(np.linalg.inv(L)).dot(main_term)
        
    return(log_lik)

## Step II

In [12]:
def stepII(theta, a, x_, ref_I, territories):
    # All the I-1 locations (all but the reference one)
    terr_not_ref = [i for i in territories if i != ref_I]
    
    x_I = x_.loc[ref_I].values
    temp = np.array([(a[terr_not_ref.index(i)] - np.dot(np.subtract(x_.loc[i].values, x_I), theta)) for i in terr_not_ref])
    
    log_lik = temp.T.dot(temp)
    return(log_lik)

In [13]:
'''def stepII(param, a, x_, ref_I, territories):
    theta = param[:-1]
    c = param[-1]
    # All the I-1 locations (all but the reference one)
    terr_not_ref = [i for i in territories if i != ref_I]
    
    x_I = x_.loc[ref_I].values
    temp = np.array([(a[terr_not_ref.index(i)] - np.dot(np.subtract(x_.loc[i].values, x_I), theta) - c) for i in terr_not_ref])
    
    log_lik = temp.T.dot(temp)
    return(log_lik)'''

'def stepII(param, a, x_, ref_I, territories):\n    theta = param[:-1]\n    c = param[-1]\n    # All the I-1 locations (all but the reference one)\n    terr_not_ref = [i for i in territories if i != ref_I]\n    \n    x_I = x_.loc[ref_I].values\n    temp = np.array([(a[terr_not_ref.index(i)] - np.dot(np.subtract(x_.loc[i].values, x_I), theta) - c) for i in terr_not_ref])\n    \n    log_lik = temp.T.dot(temp)\n    return(log_lik)'

# Run the different models for the different origin country

In [58]:
def run_model(data_all, country, times, I, W, x_ = None, territories = None):
    if not territories:
        territories = sorted(list(set(data_all["Province"])))
        
    data_all = data_all[data_all["Year"].isin(times)]
    #missing_territories = mf.not_including(data_all, times, territories)
    #territories = [i for i in territories if i not in missing_territories]
    data_all = data_all[data_all["Province"].isin(territories)]

    # Also the stock in the refered province is needed in the optimization 
    try:
        data_ = data_all[data_all["Country"] == pycountry.countries.get(name=country).alpha_3]
    except KeyError:
        iso3 = input("ISO3 of %s" %country)
        data_ = data_all[data_all["Country"] == iso3]
    del data_["Country"]

    data_ = pd.DataFrame(data_.groupby(["Year", "Province"])["Value"].sum())

    print("---------- Step I ----------")
    initial_time = datetime.datetime.now()
    print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

    # I-1 locations + beta + ro
    random.seed(123)
    param_init = [np.random.random() for i in range(len(territories)+1)]
    #param_init = np.random.rand(len(territories)+1)
    res_stepI =  minimize(stepI, param_init, args = (data_, W, times, I, territories), method='CG')
    print(res_stepI.message)

    final_time = datetime.datetime.now() 
    print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
    print("Computational time: " + str((final_time - initial_time)))
    
    
    # Step I results and validation
    beta_hat = res_stepI.x[0]
    a_hat = res_stepI.x[1:-1]
    ro_hat = res_stepI.x[-1]

    return(beta_hat, ro_hat, a_hat)


In [59]:
res_rou = run_model(data, "Romania", list(range(2004, 2008)), "Centro", temp_W, territories=zones)

---------- Step I ----------
Current time: 15:33:48
Optimization terminated successfully.
Current time: 15:37:16
Computational time: 0:03:27.664493


In [60]:
print("Beta parameter: %f" %res_rou[0])
print("Rho parameter: %f" %res_rou[1])
#print("Theta parameter: %s %s %s" %tuple(res_rou[2][0]))
print(res_rou[2])

Beta parameter: -0.189521
Rho parameter: 5.396606
[-4.10238909 -0.36839393  0.01252352 -2.59944242]


In [63]:
times = list(range(2004, 2008))
ref_I = "Centro"
territories = zones
country = "Romania"
data_all = data
terr_not_ref = [i for i in territories if i != ref_I]
data_all = data_all[data_all["Year"].isin(times)]
#missing_territories = mf.not_including(data_all, times, territories)
#territories = [i for i in territories if i not in missing_territories]
data_all = data_all[data_all["Province"].isin(territories)]

# Also the stock in the refered province is needed in the optimization 
try:
    data_ = data_all[data_all["Country"] == pycountry.countries.get(name=country).alpha_3]
except KeyError:
    iso3 = input("ISO3 of %s" %country)
    data_ = data_all[data_all["Country"] == iso3]
del data_["Country"]

data_ = pd.DataFrame(data_.groupby(["Year", "Province"])["Value"].sum())

y = {}
y_hat = {}
for t in times[1:]:
    y[t] = data_.loc[(t, terr_not_ref), "Value"].values/data_.loc[t].loc[ref_I].values
    y_hat[t] = (data_.loc[(t-1, terr_not_ref), "Value"].values/data_.loc[t-1].loc[ref_I].values)

In [69]:
y

array([0.0350921 , 0.73292401, 1.00487529, 0.11874574])

In [70]:
x

array([0.03165084, 0.73314394, 1.01231982, 0.11158618])

In [65]:
x*res_rou[0]

array([0.03165084, 0.73314394, 1.01231982, 0.11158618])

In [68]:
x*res_rou[0] - x*res_rou[2]

array([ 0.12384554,  0.13113946, -0.20453387,  0.2689139 ])

In [35]:
res_mor = run_model(data, "Morocco", list(range(2004, 2008)), "Roma", x_df, regions_cap)

---------- Step I ----------
Current time: 18:00:17


KeyboardInterrupt: 

In [28]:
print("Beta parameter: %f" %res_mor[0])
print("Rho parameter: %f" %res_mor[1])
print(res_mor[3])

Beta parameter: 0.405562
[-0.57604795 -0.63599664  0.52749393 -0.45020308 -1.03197367 -1.39652979
 -0.38466515 -0.01826092 -0.14793186 -0.58912264  0.807071   -0.56898018
 -0.68313184  0.21787595 -1.35364605  0.88787277 -0.13349202 -2.3222791
 -0.6540766  -0.27099463]
[ 0.39143861  0.24506161 -0.59092007 -0.37752043]


In [None]:
res_fr = run_model(data, "France", list(range(2004, 2008)), "Roma", x_df, regions_cap)
print("Beta parameter: %f" %res_fr[0])
print("Rho parameter: %f" %res_fr[1])
print(res_mor[3])

In [None]:
res_phi = run_model(data, "Philippines", list(range(2004, 2008)), "Roma", x_df, regions_cap)
print("Beta parameter: %f" %res_phi[0])
print("Rho parameter: %f" %res_phi[1])
print(res_phi[3])