In [2]:
import pandas as pd
import numpy as np
import pycountry
from scipy.optimize import least_squares
import random
import statsmodels
from scipy.optimize import minimize
from scipy.optimize import fsolve
#from pandas.core import datetools
import statsmodels.api as sm
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from collections import defaultdict
from math import pi, e
import model_functions as mf

  from pandas.core import datetools


In [3]:
# Resident-foreigner counts, read from a tab-delimited file.
resident_foreigners_norm = pd.read_table(
    "/home/sara/Documents/Immigration/Shared_statistics/Data_final/resident_foreigners_norm.csv"
)
# Years used in the analysis: 2004 to 2007 inclusive.
# Original note: "until 2011 - in 2011 change" (what changes in 2011 is not shown here — TODO confirm).
years = [2004, 2005, 2006, 2007]

In [4]:
# Per-capital information table (tab-delimited).
reg_cap_info = pd.read_table(
    "/home/sara/Documents/Immigration/Data_not_git/Prov_info/reg_cap_info.csv"
)
# Alphabetically sorted list of the "Prov Capitals" column.
regions_cap = list(reg_cap_info["Prov Capitals"].values)
regions_cap.sort()

In [5]:
# Lookup tables (tab-delimited): `regions` is later used to map a "Provincia" to its "Regione".
regions = pd.read_table(
    "/home/sara/Documents/Immigration/Shared_statistics/Data_final/regioni.csv"
)
regions_info = pd.read_table(
    "/home/sara/Documents/Immigration/Shared_statistics/Data_final/region_info.csv"
)

# Dummy per region: 1 = flagged as Mezzogiorno, 0 = not flagged.
mezzogiorno = {
    'Abruzzo': 1,
    'Lazio': 0,
    'Umbria': 0,
    'Provincia Autonoma Trento': 0,
    'Friuli-Venezia Giulia': 0,
    'Molise': 1,
    'Calabria': 1,
    "Valle d'Aosta / Vallée d'Aoste": 0,
    'Lombardia': 0,
    'Liguria': 0,
    'Emilia-Romagna': 0,
    'Sicilia': 1,
    'Provincia Autonoma Bolzano / Bozen': 0,
    'Puglia': 1,
    'Campania': 1,
    'Piemonte': 0,
    'Toscana': 0,
    'Sardegna': 1,
    'Marche': 0,
    'Basilicata': 1,
    'Veneto': 0,
}

In [6]:
# Covariates of each capital: log-area, log-density and the Mezzogiorno dummy.
x_df = reg_cap_info[["Prov Capitals", "Area", "Dens"]].copy()
# Logarithmic transformation
x_df["Area"] = np.log(x_df["Area"])
x_df["Dens"] = np.log(x_df["Dens"])

# Province -> region lookup built once instead of filtering `regions` for every row.
# The columns are walked in reverse so that, for a repeated province, the FIRST
# occurrence wins (same choice as the former `.values[0]`).
region_of_province = {
    province: region
    for province, region in zip(regions["Provincia"].values[::-1],
                                regions["Regione"].values[::-1])
}
# A capital missing from `regions` (or a region missing from `mezzogiorno`) raises KeyError.
x_df["Mezzogiorno"] = [
    mezzogiorno[region_of_province[capital]]
    for capital in x_df["Prov Capitals"].values
]

In [7]:
# Preview the first rows of the covariate table built in the previous cell.
x_df.head()

Unnamed: 0,Prov Capitals,Area,Dens,Mezzogiorno
0,Ancona,7.582341,5.488938,0
1,Valle d'Aosta / Vallée d'Aoste,8.089759,3.663562,0
2,L'Aquila,8.526658,4.094345,1
3,Bari,8.259168,5.786897,1
4,Bologna,8.216715,5.609472,0


In [8]:
# Collapse the counts to one row per (Province, Country, Year) by summing;
# groupby returns a new frame, so the source table is left untouched.
data = (
    resident_foreigners_norm
    .copy()
    .groupby(["Province", "Country", "Year"], as_index=False)
    .sum()
)

In [9]:
# Preview the first rows of the aggregated (Province, Country, Year) table.
data.head()

Unnamed: 0,Province,Country,Year,Value
0,Agrigento,AFG,2008,2
1,Agrigento,AFG,2009,12
2,Agrigento,AFG,2010,51
3,Agrigento,AFG,2011,35
4,Agrigento,AFG,2012,21


In [10]:
# Distance matrix related to the interested locations (regions capitals)
# Keep only the rows and columns of the capitals, in `regions_cap` order.
temp_W = (
    pd.read_table(
        "/home/sara/Documents/Immigration/Shared_models/Data/Distances_matrix.csv",
        sep = "\t",
        index_col=0,
    )
    .loc[regions_cap, regions_cap]
)

In [11]:
# Row-normalise: divide every row by its own total so that each row sums to 1.
temp_W = temp_W.div(temp_W.sum(axis="columns"), axis="index")

In [12]:
# Preview the first rows of the row-normalised matrix.
temp_W.head()

Unnamed: 0,Ancona,Bari,Bologna,Bolzano / Bozen,Cagliari,Campobasso,Catanzaro,Firenze,Genova,L'Aquila,...,Napoli,Palermo,Perugia,Potenza,Roma,Torino,Trento,Trieste,Valle d'Aosta / Vallée d'Aoste,Venezia
Ancona,0.0,0.048057,0.021979,0.051705,0.093029,0.033236,0.08506,0.029876,0.053743,0.019814,...,0.044391,0.118678,0.016704,0.049263,0.030888,0.057255,0.045798,0.053478,0.064089,0.037877
Bari,0.034066,0.0,0.049764,0.070652,0.076232,0.016307,0.026295,0.052213,0.069828,0.029489,...,0.019538,0.050485,0.043477,0.009424,0.033082,0.074587,0.066465,0.071909,0.079431,0.06085
Bologna,0.023108,0.07381,0.0,0.031024,0.083814,0.058247,0.106711,0.011464,0.033141,0.043774,...,0.063931,0.142514,0.027566,0.078773,0.042585,0.03686,0.024814,0.033144,0.044045,0.016759
Bolzano / Bozen,0.043964,0.084748,0.02509,0.0,0.047858,0.072097,0.1098,0.03278,0.036397,0.06035,...,0.075257,0.051219,0.046337,0.087595,0.058111,0.036827,0.005248,0.036902,0.040637,0.023858
Cagliari,0.062867,0.072674,0.053871,0.038036,0.0,0.05656,0.060469,0.04278,0.053907,0.04972,...,0.05645,0.032917,0.032917,0.066185,0.041249,0.065502,0.025678,0.075024,0.025678,0.025678


## Step I

In [16]:
def stepI(param, data_, W, ro, times, ref_I, territories):
    """Objective function of step I, meant to be minimised over ``param``.

    For every year ``t`` in ``times[1:]`` the residual

        y_t - beta * x_t - a

    is formed, where ``y_t`` (resp. ``x_t``) holds the "Value" of each
    non-reference territory at ``t`` (resp. ``t - 1``) divided by the value of
    the reference territory in the same year. The returned quantity is
    ``len(times) * det(L)`` plus the sum over years of the quadratic form
    ``residual' L^{-1} residual``, with
    ``L = Q (I - ro W)^{-1} (I - ro W')^{-1} Q'``.

    Parameters
    ----------
    param : sequence of float
        ``param[0]`` is ``beta``; ``param[1:]`` holds one intercept per
        non-reference territory, in the order they appear in ``territories``.
    data_ : pandas.DataFrame
        Frame with a "Value" column, indexed by (year, territory).
    W : pandas.DataFrame
        Square matrix labelled by territory on both axes.
    ro : float
        Coefficient multiplying ``W``.
    times : sequence of int
        Years; the lag of year ``t`` is looked up as ``t - 1``, so the years
        are expected to be consecutive.
    ref_I : hashable
        Label of the reference territory; must be in ``territories``.
    territories : list
        Labels of all territories, reference included.

    Returns
    -------
    float
        Value of the objective.

    Raises
    ------
    ValueError
        If ``ref_I`` is not one of ``territories``.
    """
    if ref_I not in territories:
        # Without this guard the failure surfaces later as an opaque shape mismatch.
        raise ValueError("ref_I must be one of the territories")

    beta = param[0]
    a = param[1:]

    T = len(times)
    I = len(territories)

    # Not-squared (I-1) x I matrix [identity | -1]: contrasts each location with the last one
    Q = np.append(np.identity(I - 1), np.negative(np.ones((I - 1, 1))), axis=1)
    # All the I-1 locations (all but the reference one)
    terr_not_ref = [i for i in territories if i != ref_I]

    # Reorder W s.t. the "ref_I" location is the last one (so that Q is well defined)
    ordered = terr_not_ref + [ref_I]
    W = W.reindex(index=ordered, columns=ordered)

    # Time-invariant quantities, computed once.
    # inv(I - ro*W') is the transpose of inv(I - ro*W), so a single inversion is enough.
    A_inv = np.linalg.inv(np.identity(I) - ro * W.values)
    L = Q.dot(A_inv).dot(A_inv.T).dot(Q.T)
    L_inv = np.linalg.inv(L)

    # NOTE(review): a Gaussian likelihood would normally use log(det(L)) here.
    # L does not depend on `param`, so this term is a constant of the
    # minimisation either way — confirm the intended formula.
    log_lik = T * np.linalg.det(L)

    for t in times[1:]:
        y = data_.loc[(t, terr_not_ref), "Value"].values / data_.loc[t].loc[ref_I].values
        x = data_.loc[(t - 1, terr_not_ref), "Value"].values / data_.loc[t - 1].loc[ref_I].values

        main_term = y - beta * x - a

        log_lik += main_term.dot(L_inv).dot(main_term)

    return log_lik

In [17]:
# Settings of the single-country run
times = years
territories = regions_cap
country = "Romania"
I = "Roma"  # reference province

# Keep the years of interest, then drop the territories reported by
# mf.not_including (presumably those lacking data in some year — verify against the helper).
data_all = resident_foreigners_norm.copy()
data_all = data_all[data_all["Year"].isin(times)]
missing_territories = mf.not_including(data_all, times, territories)
territories = [name for name in territories if name not in missing_territories]
data_all = data_all[data_all["Province"].isin(territories)]

# Rows of the chosen origin country (alpha-3 code), reference province included
# because its stock is needed in the optimization; summed per (Year, Province).
data_ = (
    data_all[data_all["Country"] == pycountry.countries.get(name=country).alpha_3]
    .drop("Country", axis=1)
)
data_ = pd.DataFrame(data_.groupby(["Year", "Province"])["Value"].sum())

In [18]:
print("---------- Step I ----------")
initial_time = datetime.datetime.now()
print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

# One starting value per parameter: beta plus the I-1 intercepts, i.e. len(territories) values.
# random.random has to be CALLED: the bare function object is not a number and
# cannot be used as a starting point by the optimiser.
param_init = [random.random() for i in range(len(territories))]
# fixed ro
rho = .981
res_stepI =  minimize(stepI, param_init, args = (data_, temp_W, rho, times, I, territories), method='CG')
print(res_stepI.message)

final_time = datetime.datetime.now() 
print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
print("Computational time: " + str((final_time - initial_time)))

---------- Step I ----------
Current time: 17:47:28


KeyboardInterrupt: 

In [None]:
# Split the optimiser's solution vector: first entry is beta, the rest are the intercepts.
beta_hat, a_hat = res_stepI.x[0], res_stepI.x[1:]
print(res_stepI.fun, beta_hat, a_hat)

In [None]:
# Repeat step I for three values of rho drawn uniformly from [0, 1).
rho_list = np.random.rand(3)

for rho in rho_list:
    print("---------- Step I ----------")
    initial_time = datetime.datetime.now()
    print("Current time: " + initial_time.strftime('%H:%M:%S'))

    res_stepI = minimize(
        stepI,
        param_init,
        args=(data_, temp_W, rho, times, I, territories),
        method='CG',
    )
    print(res_stepI.message)

    final_time = datetime.datetime.now()
    print("Current time: " + final_time.strftime('%H:%M:%S'))
    print("Computational time: " + str(final_time - initial_time))

    # First entry of the solution is beta, the remaining ones are the intercepts.
    beta_hat, a_hat = res_stepI.x[0], res_stepI.x[1:]
    print(rho, res_stepI.fun, beta_hat)

## Step II

In [13]:
def stepII(theta, a, x_, I, territories):
    """Sum of squared residuals of step II.

    For each territory the residual is ``a[k] - (x_k - x_I) . theta``, where
    ``k`` is the position of the territory in ``territories``, and ``x_k`` /
    ``x_I`` are the "Area", "Dens" and "Mezzogiorno" values of the rows of
    ``x_`` whose "Prov Capitals" equals the territory / the reference ``I``.

    Returns the accumulated squared residuals (the integer 0 when
    ``territories`` is empty).
    """
    covariates = ["Area", "Dens", "Mezzogiorno"]

    def features_of(name):
        # Rows of x_ for the given capital, restricted to the covariate columns.
        return x_[x_["Prov Capitals"] == name][covariates].values

    reference = features_of(I)

    total = 0
    for name in territories:
        gap = np.subtract(features_of(name), reference)
        residual = a[territories.index(name)] - np.dot(gap, theta)
        total += residual**2
    return total

# Run the model for each origin country

In [34]:
def run_model(data_all, country, times, I, x_, territories = None):
    """Run step I and step II for one origin country and print R2 diagnostics.

    Parameters
    ----------
    data_all : DataFrame with "Year", "Province", "Country" columns (all are
        filtered on below).
    country : country name, resolved to its alpha-3 code through pycountry.
    times : years to keep; each year in ``times[1:]`` is paired with ``t - 1``.
    I : reference province; it is removed from ``territories`` before the
        optimization.
    x_ : covariate table with "Prov Capitals", "Area", "Dens", "Mezzogiorno".
    territories : provinces to use; when falsy, every province in ``data_all``.

    Returns
    -------
    tuple
        ``(beta_hat, a_hat, theta_hat)``.

    NOTE(review): `stepI` is called below with ``args=(data_, times, I,
    territories)``, which does not match the `stepI` signature defined earlier
    in this notebook ``(param, data_, W, ro, times, ref_I, territories)`` —
    this function appears to target a different version of `stepI`; confirm
    before re-running.
    NOTE(review): `not_including` and `n_it` are used unqualified here, while
    another cell calls `mf.not_including` — confirm these names are in scope.
    """
    if not territories:
        territories = list(set(data_all["Province"]))
        

    data_all = data_all[data_all["Year"].isin(times)]
    # Drop the territories reported by `not_including` (helper not visible here).
    missing_territories = not_including(data_all, times, territories)
    territories = [i for i in territories if i not in missing_territories]
    data_all = data_all[data_all["Province"].isin(territories)]

    # The stock in the reference province is also needed in the optimization,
    # so the filter on the country is applied before the reference is removed.
    data_ = data_all[data_all["Country"] == pycountry.countries.get(name=country).alpha_3]
    del data_["Country"]

    # Do not include the reference province in the optimization
    territories = [p for p in territories if p != I]
    # Handle missing values
    #data_ = missing_values(data_, times, territories)

    print("---------- Step I ----------")
    initial_time = datetime.datetime.now()
    print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

    # Start from zeros: one value for beta plus one intercept per territory.
    #param_init = np.random.uniform(0, 1, len(territories)+1)
    param_init = [0 for i in range(len(territories)+1)]
    res_stepI =  minimize(stepI, param_init, args = (data_, times, I, territories), method='CG')
    print(res_stepI.message)
    
    final_time = datetime.datetime.now() 
    print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
    print("Computational time: " + str((final_time - initial_time)))
    
    # Step I results and validation
    beta_hat = res_stepI.x[0]
    a_hat = res_stepI.x[1:]
    y_hat = []
    y = []
    for i in territories:
        time_invariant = a_hat[territories.index(i)]
        for t in times[1:]:
            # Observed and fitted log-ratios w.r.t. the reference province
            # (`n_it` is a helper not visible here).
            y.append(np.log(n_it(data_, i, t)/n_it(data_, I, t)))
            y_hat.append(beta_hat*(np.log(n_it(data_, i, t-1)/n_it(data_, I, t-1))) + time_invariant)
            
    y_mean = np.mean(y)

    R2 = 1 - sum(np.subtract(y, y_hat)**2) / sum((y - y_mean)**2)
    # Equivalently: 1 - (res_stepI.fun / sum((y - y_mean)**2))
    print("The R2 score from the step I is: %f" %R2)
    
    print("---------- Step II ----------")
    
    initial_time = datetime.datetime.now()
    print ("Current time: " + str(initial_time.strftime('%H:%M:%S') ))

    # One starting value per column of x_ except one (the label column, presumably).
    param_init = [0 for i in range(len(x_.columns)-1)]
    #param_init = np.random.uniform(0, 1, len(x_df.columns)-1)
    res_stepII =  minimize(stepII, param_init, args = (a_hat, x_, I, territories), method='CG')
    print(res_stepII.message)
    final_time = datetime.datetime.now() 
    print ("Current time: " + str(final_time.strftime('%H:%M:%S')))
    print("Computational time: " + str((final_time - initial_time)))
    
    # Step II results and validation
    theta_hat = res_stepII.x
    x_I = x_[x_["Prov Capitals"] == I][["Area", "Dens", "Mezzogiorno"]].values
    # Fitted values are rebuilt with the intercept replaced by (x_i - x_I) . theta_hat;
    # the observed `y` and `y_mean` from step I are reused.
    y_hat = []
    for i in territories:
        x_i = x_[x_["Prov Capitals"] == i][["Area", "Dens", "Mezzogiorno"]].values
        time_invariant = np.dot(np.subtract(x_i, x_I), theta_hat)
        for t in times[1:]:
            y_hat.append((beta_hat*(np.log(n_it(data_, i, t-1)/n_it(data_, I, t-1))) + time_invariant)[0])
            
    R2 = 1 - sum(np.subtract(y, y_hat)**2) / sum((y - y_mean)**2)
    print("The final R2 score is: %f" %R2)
    
    # Adjusted R2 with n observations and k = number of columns of x_ minus one.
    n = len(y)
    k = len(x_.columns)-1
    R2_adj = 1 - (1 - R2)*((n - 1)/(n - k -1))
    print("The final Adjusted R2 score is: %f" %R2_adj)
    
    return(beta_hat, a_hat, theta_hat)

In [35]:
# Fit the two-step model for Romania over 2004-2007, with "Roma" as the reference province.
# NOTE(review): `province_capitals` is not defined in the cells shown here (the
# capitals list built above is `regions_cap`) — confirm where it comes from.
res_rou = run_model(data, "Romania", list(range(2004, 2008)), "Roma", x_df, province_capitals)

---------- Step I ----------
Current time: 12:23:58
Optimization terminated successfully.
Current time: 12:30:26
Computational time: 0:06:28.211165
The R2 score from the step I is: 0.998170
---------- Step II ----------
Current time: 12:30:27
Optimization terminated successfully.
Current time: 12:30:27
Computational time: 0:00:00.856256
The final R2 score is: 0.939099
The final Adjusted R2 score is: 0.935652


In [39]:
# res_rou is (beta_hat, a_hat, theta_hat): report each component in turn.
print("Beta parameter: %f" % (res_rou[0],))
print(res_rou[1])
print("Theta parameter: %s %s %s" % tuple(res_rou[2]))

Beta parameter: 0.501037
[-1.55331351 -2.29499766 -1.70195229 -2.29362672 -1.23865002 -2.88924935
 -2.48635617 -2.73574985 -1.09472628 -1.8283857  -0.55687868 -2.23760183
 -2.55967187 -1.23072563 -2.49997589 -0.18148499 -1.39697481 -2.45963957
 -1.23271326]
Theta parameter: 0.907874756097 0.544429955325 -1.14652506679
