In [5]:
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import pandas as pd
import numpy as np
import pycountry
from scipy.optimize import least_squares
import random
import statsmodels
from scipy.optimize import minimize
from scipy.optimize import fsolve
#from pandas.core import datetools
import statsmodels.api as sm
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from collections import defaultdict
from math import pi, e
import model_functions as mf
import build_data_functions as bdf
import plot_model_functions as pmf
import matplotlib.pyplot as plt

In [6]:
years = list(range(2005, 2017))

In [7]:
y = pd.read_table("/home/sara/Documents/Immigration/Shared_models/Data/resident_foreigners_norm.csv", sep = "\t", index_col=0)
y = y.groupby(["Province", "Country", "Year"], as_index=False)["Value"].sum()
y = bdf.pivot(y, "Country", "Value")

## Zone level

In [71]:
xs = pd.read_table("/home/sara/Documents/Immigration/Shared_models/Data/x_zones.csv", sep = "\t", index_col=["Province", "Year"])

zones_data = pd.read_table("/home/sara/Documents/Immigration/Shared_statistics/Data_final/territori.csv")
zones_data = zones_data.replace(['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento'], ['Bolzano / Bozen', 'Trento'])

In [41]:
#terr = list(set(zones_data.Zona))

In [126]:
# Distance matrix related to the interested locations (regions capitals)
temp_W = pd.read_table("/home/sara/Documents/Immigration/Shared_models/Data/Zones_distances_matrix_mean.csv", sep = "\t", index_col=0)

In [127]:
temp_W.head()

Unnamed: 0,Centro,Isole,Nord-est,Nord-ovest,Sud
Centro,0.0,722899.184542,454026.585438,517383.500724,463645.585679
Isole,722899.184542,0.0,785789.965069,956897.591406,729986.205378
Nord-est,454026.585438,785789.965069,0.0,371960.371519,823860.587237
Nord-ovest,517383.500724,956897.591406,371960.371519,0.0,913786.897237
Sud,463645.585679,729986.205378,823860.587237,913786.897237,0.0


In [128]:
temp_W = (1/temp_W)**2
# w_ij = 0 if i=j
temp_W[temp_W == np.inf] = 0
# row standardization: every row sum up to 1
temp_W = temp_W.div(temp_W.sum(axis=1), axis=0)

Given a spatial weights matrix W is a nonnegative matrix with $w_{ij} >= 0$ and $w_{ii} = 0$. W uses to be symmetric.

The row-normalized W is used for ease of interpretation. It is defined as $\sum_{j=1}^n w_{ij} = 1, \forall i = 1, \dots, n$. This ensure that all weights are between 0 and 1.

Each rownormalized weight, $wij$, can be interpreted as the fraction of all spatial influence on unit $i$ attributable to unit $j$.

In [129]:
temp_W.head()

Unnamed: 0,Centro,Isole,Nord-est,Nord-ovest,Sud
Centro,0.0,0.12629,0.320155,0.246546,0.307009
Isole,0.294314,0.0,0.249088,0.167971,0.288627
Nord-est,0.319744,0.106746,0.0,0.476401,0.097109
Nord-ovest,0.281872,0.082404,0.545361,0.0,0.090362
Sud,0.505673,0.203992,0.160153,0.130182,0.0


In [57]:
#xs.index = xs.index.swaplevel(0, 1)
#xs.sort_index(inplace=True)

#xs = xs.loc[2013]
#xs

## Step I

In [105]:
def stepI(param, data_, W, times, ref_I, territories):
    beta = param[0]
    a = param[1:-1]
    ro = param[-1]
    
    T = len(times)
    I = len(territories)
    
    identity_I = np.identity(I)
    identity_I_1 = np.identity(I-1)
    neg1 = np.negative(np.ones((I-1, 1)))
    # Not-squared matrix
    Q = np.append(identity_I_1, neg1, axis=1)
    # All the I-1 locations (all but the reference one)
    terr_not_ref = [i for i in territories if i != ref_I]
    
    # Modify W s.t. the "ref_I" location is the last one (so that Q is well defined)
    W = W.reindex(index = terr_not_ref+[ref_I], columns = terr_not_ref+[ref_I])
     
    # Time-invariant quantity
    L = Q.dot(np.linalg.inv(identity_I-ro*W)).dot(np.linalg.inv(identity_I-ro*W.T)).dot(Q.T)
    
    log_lik = T*np.log(np.linalg.det(L))
    
    for t in times[1:]:
        y = (data_.loc[(t, terr_not_ref), "y"]/data_.loc[(t, ref_I), "y"]).values
        x = (data_.loc[(t, terr_not_ref), "y_prev_1"]/data_.loc[(t, ref_I), "y_prev_1"]).values
        #print(y.shape, x.shape, len(a))
        main_term = np.log(y) - beta*np.log(x) - a
        
        log_lik += main_term.T.dot(np.linalg.inv(L)).dot(main_term)
        
    return(log_lik)

## Step II

In [98]:
def stepII(theta, a, x_, ref_I, territories):
    # All the I-1 locations (all but the reference one)
    terr_not_ref = [i for i in territories if i != ref_I]
    
    x_I = x_.loc[ref_I].values
    temp = np.array([(a[terr_not_ref.index(i)] - np.dot(np.subtract(x_.loc[i].values, x_I), theta)) for i in terr_not_ref])
    
    log_lik = temp.T.dot(temp)
    return(log_lik)

### Computations

At zones level as refered territory Centro is used.

In [159]:
data_all, country, times, I, x_ = y, "Romania", list(range(2005, 2017)), "Centro", xs

terr = sorted(list(set(zones_data.Zona)))
territories = terr

country = pycountry.countries.get(name=country).alpha_3
data_ = bdf.filter_origin_country_dataset(y, country, years, xs.index.levels[0].tolist(), xs, prev = 1)

In the model the features are time invariant. Since the time period is short, it assumed the variables change over it does not influence the model. As refered time, 2013 is choosen.

In [160]:
data_.index = data_.index.swaplevel(0, 1)
data_.sort_index(inplace=True)
data_ = data_
y_ = data_[["y_prev_1", "y"]]
#y_.head()

In [161]:
print("---------- Step I ----------")
initial_time = datetime.datetime.now()
print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

# I-1 locations + beta + ro
random.seed(123)
param_init = [np.random.random() for i in range(len(territories)+1)]
#param_init = np.random.rand(len(territories)+1)
res_stepI =  minimize(stepI, param_init, args = (y_, temp_W, times, I, territories), method='CG')
print(res_stepI.message)

final_time = datetime.datetime.now() 
print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
print("Computational time: " + str((final_time - initial_time)))


# Step I results and validation
beta_hat = res_stepI.x[0]
a_hat = res_stepI.x[1:-1]
rho_hat = res_stepI.x[-1]

---------- Step I ----------
Current time: 20:03:27
Optimization terminated successfully.
Current time: 20:03:56
Computational time: 0:00:29.339229


In [162]:
print(beta_hat, a_hat, rho_hat)

0.7744427764548738 [-0.35201252 -0.08162944 -0.01099394 -0.19029142] 5.332751598248623


In [201]:
var_selection = ['native population - Total', 'unemployment - Total', 'reach_difficulty - Emergency room']
    
#xs_ = data_.loc[2013, [i for i in data_.columns.tolist() if i not in ["y_prev_1", "y"]]]
xs_ = data_.loc[2013, var_selection]

In [202]:
print("---------- Step II ----------")

initial_time = datetime.datetime.now()
print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

random.seed(123)
param_init = [np.random.random() for i in range(len(xs_.columns))]
#param_init = np.random.uniform(0, 1, len(x_df.columns)-1)
res_stepII =  minimize(stepII, param_init, args = (a_hat, xs_, I, territories), method='CG')
#print(model_I([b, a], data_rou), b, a)
#print(res_stepII.x)
print(res_stepII.message)
#print(res_stepII.fun)
final_time = datetime.datetime.now() 
print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
print("Computational time: " + str((final_time - initial_time)))

---------- Step II ----------
Current time: 20:18:34
Desired error not necessarily achieved due to precision loss.
Current time: 20:18:34
Computational time: 0:00:00.345194


In [203]:
theta_hat = res_stepII.x
theta_hat

array([ 1.72980830e-06,  7.27089749e-02, -2.47635171e-02])

In [204]:
xs_

Unnamed: 0_level_0,native population - Total,unemployment - Total,reach_difficulty - Emergency room
Province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Centro,11681498.0,564.4515,2552.0
Isole,6640311.0,470.065,1597.0
Nord-est,12560971.0,409.8595,2206.0
Nord-ovest,15989392.0,647.4835,2994.0
Sud,13980833.0,976.806,3363.0


In [205]:
terr_not_ref = [i for i in territories if i != I]
terr_not_ref

['Isole', 'Nord-est', 'Nord-ovest', 'Sud']

In [206]:
territories

['Centro', 'Isole', 'Nord-est', 'Nord-ovest', 'Sud']

In [207]:
x_I = xs_.loc[I].values

In [210]:
for t in times[1:]:
    y = (data_.loc[(t, terr_not_ref), "y"]/data_.loc[(t, ref_I), "y"]).values
    #x = beta_hat*(data_.loc[(t, terr_not_ref), "y_prev_1"]/data_.loc[(t, ref_I), "y_prev_1"]).values + [np.dot(np.subtract(xs_.loc[i].values, x_I), theta_hat) for i in terr_not_ref]
    x = beta_hat*(data_.loc[(t, terr_not_ref), "y_prev_1"]/data_.loc[(t, ref_I), "y_prev_1"]).values + a_hat

In [211]:
x

array([ 8.20442936, -0.61557019,  3.26298139, 14.18003376])

In [200]:
y

array([0.18397817, 0.69547766, 0.93518215, 0.4024562 ])

In [192]:
theta_hat

array([-1.64625062e-04,  4.71894108e-01,  6.91006571e-01, -4.76202882e-02,
        4.15307468e-01,  9.29512442e-01,  3.91908875e-01])

In [197]:
np.dot((xs_.loc["Isole"] - xs_.loc[I]), theta_hat)

-34.824995591727884

# Run the different models for the different origin country

In [172]:
def run_model(data_all, country, times, I, x_, territories = None):
    if not territories:
        territories = sorted(list(set(data_all["Province"])))
        
    data_all = data_all[data_all["Year"].isin(times)]
    missing_territories = mf.not_including(data_all, times, territories)
    territories = [i for i in territories if i not in missing_territories]
    data_all = data_all[data_all["Province"].isin(territories)]

    # Also the stock in the refered province is needed in the optimization 
    data_ = data_all[data_all["Country"] == pycountry.countries.get(name=country).alpha_3]
    del data_["Country"]

    data_ = pd.DataFrame(data_.groupby(["Year", "Province"])["Value"].sum())

    print("---------- Step I ----------")
    initial_time = datetime.datetime.now()
    print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

    # I-1 locations + beta + ro
    random.seed(123)
    param_init = [np.random.random() for i in range(len(territories))]
    #param_init = np.random.rand(len(territories)+1)
    res_stepI =  minimize(stepI, param_init, args = (data_, temp_W, times, I, territories), method='CG')
    print(res_stepI.message)

    final_time = datetime.datetime.now() 
    print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
    print("Computational time: " + str((final_time - initial_time)))
    
    
    # Step I results and validation
    beta_hat = res_stepI.x[0]
    a_hat = res_stepI.x[1:]
    '''y_hat = []
    y = []
    for i in territories:
        time_invariant = a_hat[territories.index(i)]
        for t in times[1:]:
            y.append(np.log(n_it(data_, i, t)/n_it(data_, I, t)))
            y_hat.append(beta_hat*(np.log(n_it(data_, i, t-1)/n_it(data_, I, t-1))) + time_invariant)
            
    y_mean = np.mean(y)

    R2 = 1 - sum(np.subtract(y, y_hat)**2) / sum((y - y_mean)**2)
    # Equivalently: 1 - (res_stepI.fun / sum((y - y_mean)**2))
    #print(R2)
    print("The R2 score from the step I is: %f" %R2)
    '''
    #print(beta_hat, a_hat)
    print("---------- Step II ----------")

    initial_time = datetime.datetime.now()
    print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

    random.seed(123)
    param_init = [np.random.random() for i in range(len(x_.columns))]
    #param_init = np.random.uniform(0, 1, len(x_df.columns)-1)
    res_stepII =  minimize(stepII, param_init, args = (a_hat, x_, I, territories), method='CG')
    #print(model_I([b, a], data_rou), b, a)
    #print(res_stepII.x)
    print(res_stepII.message)
    #print(res_stepII.fun)
    final_time = datetime.datetime.now() 
    print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
    print("Computational time: " + str((final_time - initial_time)))
    
    # Step II results and validation
    theta_hat = res_stepII.x
    '''x_I = x_[x_["Prov Capitals"] == I][["Area", "Dens", "Mezzogiorno"]].values
    y_hat = []
    #y = []
    for i in territories:
        x_i = x_[x_["Prov Capitals"] == i][["Area", "Dens", "Mezzogiorno"]].values
        time_invariant = np.dot(np.subtract(x_i, x_I), theta_hat)
        for t in times[1:]:
            #y.append(np.log(n_it(data_, i, t)/n_it(data_, I, t)))
            y_hat.append((beta_hat*(np.log(n_it(data_, i, t-1)/n_it(data_, I, t-1))) + time_invariant)[0])
            
    #y_mean = np.mean(y)
    R2 = 1 - sum(np.subtract(y, y_hat)**2) / sum((y - y_mean)**2)
    #print(R2)
    print("The final R2 score is: %f" %R2)
    
    n = len(y)
    k = len(x_.columns)-1
    R2_adj = 1 - (1 - R2)*((n - 1)/(n - k -1))
    print("The final Adjusted R2 score is: %f" %R2_adj)'''
    
    return(beta_hat, a_hat, theta_hat)

In [173]:
res_rou = run_model(data, "Romania", list(range(2004, 2008)), "Roma", x_df)

---------- Step I ----------
Current time: 16:14:03
Desired error not necessarily achieved due to precision loss.
Current time: 16:25:39
Computational time: 0:11:35.605179
---------- Step II ----------
Current time: 16:25:39


ValueError: setting an array element with a sequence.

In [None]:
print("Beta parameter: %f" %res_rou[0])
print(res_rou[1])
#print("Theta parameter: %s %s %s" %tuple(res_rou[2][0]))
print(res_rou[2])

In [None]:
res_mor = run_model(data, "Morocco", list(range(2004, 2008)), "Roma", x_df)

In [None]:
print("Beta parameter: %f" %res_mor[0])
print(res_mor[1])
print("Theta parameter: %s %s %s" %tuple(res_mor[2]))