In [1]:
import sys
import warnings

# Silence every warning for the rest of the session, but only when the
# interpreter was started without explicit -W options (sys.warnoptions is
# empty); a warning configuration given on the command line is left alone.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import numpy as np
from statsmodels.datasets import grunfeld
from linearmodels.panel  import PanelOLS
import pandas as pd
import build_data_functions as bdf
import plot_data_functions as pdf
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import model_functions as mf
import plot_model_functions as pmf
import panelOLS_models 
import spatial_error_model as sem
import statsmodels.api as sm
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import pycountry
import os
from sys import argv
import json
from collections import defaultdict
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import normalize

from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from scipy.stats import normaltest

In [2]:
# Study window: 2012 to 2016 inclusive (range() leaves out its stop value).
# Disabled alternative that would take the window from the command line:
#first, last = argv
#years = list(range(first, last))
years = [*range(2012, 2017)]

In [3]:
# Output folder for the figures, JSON parameter files and CSV tables written by
# the cells below; its name embeds the first and last year of the study window.
directory = "/home/sara/Documents/Immigration/Shared_models/Plot_%d_%d" %(years[0], years[-1])
# exist_ok=True turns the call into a no-op when the folder already exists, so
# no separate existence check is needed and there is no gap between checking
# and creating in which another process could create the folder first.
os.makedirs(directory, exist_ok=True)

In [4]:
# Target variable: resident-foreigner values, summed over rows sharing the same
# (Province, Country, Year), then reshaped by the project helper bdf.pivot.
# NOTE(review): later cells index the result as y.loc[(place, year), country],
# so the pivot presumably yields a (Province, Year) MultiIndex with one column
# per country - confirm in build_data_functions.
y = (
    pd.read_table(
        "/home/sara/Documents/Immigration/Shared_models/Data/resident_foreigners_norm.csv",
        sep = "\t",
        index_col=0,
    )
    .groupby(["Province", "Country", "Year"], as_index=False)["Value"]
    .sum()
)
y = bdf.pivot(y, "Country", "Value")

# Table of the same name from the Shared_statistics tree, kept in the layout it
# is stored in; it is grouped by (Country, Year) for the real-vs-predicted plot.
resident_foreigners_norm = pd.read_table(
    "/home/sara/Documents/Immigration/Shared_statistics/Data_final/resident_foreigners_norm.csv"
)

#unesco_demo = pd.read_table("/home/sara/Documents/Immigration/Shared_models/Data/countries_info.csv", sep = "\t", index_col=[0,1])

## Italian Stock Prediction

In [5]:
# Italy as a whole is obtained by adding up its zones.
# The zone names are the first index level of x_zones.csv.
zones = (
    pd.read_table(
        "/home/sara/Documents/Immigration/Shared_models/Data/x_zones.csv",
        sep = "\t",
        index_col=["Province", "Year"],
    )
    .index.levels[0]
    .tolist()
)

# For every value of the second index level of y, append an ("Italia", value)
# row holding the column-wise sum over the zone rows.
for year in y.index.levels[1]:
    zone_rows = y.loc[(zones, year), :]
    y.loc[("Italia", year), :] = zone_rows.sum()

In [6]:
# Zone-level explanatory variables, indexed by (Province, Year).
temp = pd.read_table(
    "/home/sara/Documents/Immigration/Shared_models/Data/x_zones.csv",
    sep = "\t",
    index_col=["Province", "Year"],
)

# Empty frame with the same columns and the same two index levels; it receives
# one ("Italia", year) row per year.
xs = pd.DataFrame(columns=["Province", "Year", *temp.columns]).set_index(["Province", "Year"])

# Each Italy row is the column-wise sum of the zone rows of that year.
for year in temp.index.levels[1]:
    zone_rows = temp.loc[(zones, year), :]
    xs.loc[("Italia", year), :] = zone_rows.sum()

In [7]:
# Ten named matplotlib colours handed to the plotting helpers.
palette = [
    'blue',
    'darkgreen',
    'yellowgreen',
    'orange',
    'lightcoral',
    'red',
    'paleturquoise',
    'deepskyblue',
    'mediumpurple',
    'fuchsia',
]

In [8]:
# Territory whose totals are modelled in this section.
target = "Italia"
# number of features to select
ks = [3, 5, 7, 10, 15]

# Countries of origin under study, by English name ...
countries_list = [
    "Romania",
    "Morocco",
    "Albania",
    "Tunisia",
    "Egypt",
    "Ecuador",
    "Peru",
    "China",
    "Philippines",
]
# ... and the matching ISO 3166-1 alpha-3 codes, in the same order.
countries_list_iso3 = [
    pycountry.countries.get(name=country_name).alpha_3
    for country_name in countries_list
]

In [9]:
# Country-level regression for `target`, delegated to the project helper.
# NOTE(review): later cells read y_hat["Predicted"] per ISO3 code and
# models[key]["features"] / models[key]["coefficients"]; how `ks` drives the
# feature selection is defined in model_functions - confirm there.
y_hat, models = mf.compute_regression_model(
    y,
    xs,
    years,
    countries_list,
    target,
    ks,
)

ROU
MAR
ALB
TUN
EGY
ECU
PER
CHN
PHL


In [10]:
# Real vs. predicted immigrant stock for the selected countries on figure 1.
# NOTE(review): all arguments are positional; what 331, 45 and the
# None/True/False flags mean is defined by the project plotting helper -
# confirm in plot_data_functions before changing any of them.
pdf.relation_plot_time_variant_intern_function(
    y_hat,
    countries_list_iso3,
    years,
    ["Predicted"],
    resident_foreigners_norm.groupby(["Country", "Year"]),
    plt.figure(1, figsize=(15,14)),
    331,
    45,
    palette,
    None,
    "Immigrant Stock Real VS Predicted",
    True,
    directory+"/regression_model_italy",
    False,
)

In [11]:
# Copy of y in which the Italy rows of the study years carry the predicted
# values (cast to float32) instead of the observed ones.
y_italia_pred = y.copy()
for iso3 in countries_list_iso3:
    predicted = y_hat.loc[(slice(None), iso3), "Predicted"].values
    y_italia_pred.loc[("Italia", years), iso3] = np.array(predicted, dtype=np.float32)

In [12]:
# Replace every model's coefficient container with a plain list, in place.
for fitted in models.values():
    fitted["coefficients"] = list(fitted["coefficients"])

In [13]:
# Nested mapping {model key: {feature name: coefficient}} for the export below.
res_it = defaultdict(dict)

for model_key, fitted in models.items():
    pairs = zip(fitted["features"], fitted["coefficients"])
    for feature, coefficient in pairs:
        res_it[model_key][feature] = coefficient

In [14]:
# Write the feature/coefficient mapping as JSON into the output folder.
with open(directory+"/regression_model_italy.txt", mode='w') as params_file:
    json.dump(res_it, fp=params_file)

## Zones Prediction - H. Jayet et al. paper

In [15]:
# Explanatory variables for the zones, with the Italy rows built earlier
# appended underneath.
xs_zones = pd.concat(
    [
        pd.read_table(
            "/home/sara/Documents/Immigration/Shared_models/Data/x_zones.csv",
            sep = "\t",
            index_col=["Province", "Year"],
        ),
        xs,
    ]
)

# Territory table, with the two autonomous provinces renamed to their short form.
zones_data = pd.read_table(
    "/home/sara/Documents/Immigration/Shared_statistics/Data_final/territori.csv"
).replace(
    ['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento'],
    ['Bolzano / Bozen', 'Trento'],
)

In [16]:
# Distance matrix between the locations of interest (regional capitals),
# with the first file column used as the row index.
temp_W = pd.read_table(
    "/home/sara/Documents/Immigration/Shared_models/Data/Zones_distances_matrix_mean.csv",
    sep = "\t",
    index_col=0,
)

In [17]:
# Add Italy distance info as mean of the five zones
# Step 1 - new "Italia" column: for every existing row label, the mean of that
# row's entries.
temp_W["Italia"] = [temp_W.loc[z].mean() for z in temp_W.index]
# Step 2 - new "Italia" row: the row means are computed again, this time over
# rows that already contain the "Italia" column from step 1. Appending a row's
# own mean to it leaves the mean unchanged (up to rounding), so the values match
# step 1; the trailing 0 fills the Italia/Italia entry.
# The two statements must stay in this order: step 2 supplies one value per
# existing row plus the extra 0, which fits only once the column exists.
# NOTE(review): each mean runs over the whole row, so a zone's distance to
# itself (if the diagonal holds zeros) is part of the average - confirm intended.
temp_W.loc["Italia"] = [temp_W.loc[z].mean() for z in temp_W.index] + [0]

In [18]:
# Turn the distances into spatial weights, w_ij = (1 / d_ij) ** 2.
# NOTE: temp_W is rebuilt from its own previous value, so running this cell a
# second time applies the transformation twice.
temp_W = (
    ((1/temp_W)**2)
    # w_ij = 0 if i=j: infinite weights are reset to zero
    .mask(lambda weights: weights == np.inf, 0)
    # row standardization: every row sums up to 1
    .pipe(lambda weights: weights.div(weights.sum(axis=1), axis=0))
    # Just to make sure columns and rows are both in sorted label order
    .sort_index(axis=1)
    .sort_index(axis=0)
)

In [19]:
# Colours for the zone-level plots (same ten names as in the Italy section).
# Disabled alternative: seaborn's default palette.
#palette = sns.color_palette()
palette = [
    'blue',
    'darkgreen',
    'yellowgreen',
    'orange',
    'lightcoral',
    'red',
    'paleturquoise',
    'deepskyblue',
    'mediumpurple',
    'fuchsia',
]

### Romania

In [20]:
# Regressor sets for Romania: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "Average age of mothers at birth",
    ],
    [
        "y_prev_2",
        "Average age of mothers at birth",
        "Average age of fathers at birth",
        "Free activities in voluntary associations",
    ],
]

In [22]:
# Fit the spatial error model for Romania and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
print("-------------------------------", "Romania", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Romania", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_romania1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_romania1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_romania1_fitted_values.csv", sep = "\t")

------------------------------- Romania -------------------------------
---------- Step I ----------
Current time: 10:14:20
Optimization terminated successfully.
Current time: 10:18:44
Computational time: 0:04:24.077412
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 1.000000.
Adjusted R-squared for Prediction step I: 0.999000.
R-squared for MI 3 selection: 0.999000.
Adjusted R-squared for MI 3 selection: 0.999000.
R-squared for MI 5 selection: 0.999000.
Adjusted R-squared for MI 5 selection: 0.999000.


### Morocco

In [23]:
# Regressor sets for Morocco: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "Free activities in voluntary associations",
    ],
    [
        "y_prev_2",
        "Free activities in voluntary associations",
        "internal_migration - Foreign country",
        "native population - Total",
    ],
]

In [24]:
# Fit the spatial error model for Morocco and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
# All output names use the lowercase stem "morocco1": the cell previously mixed
# "Morocco1" (path, CSV) with "morocco1" (params file), unlike every other
# country cell, which breaks lookups by lowercase name on case-sensitive
# file systems.
print("-------------------------------", "Morocco", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Morocco", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_morocco1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection', 'MI 10 selection', 'MI 15 selection', 'Manual selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_morocco1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_morocco1_fitted_values.csv", sep = "\t")

------------------------------- Morocco -------------------------------
---------- Step I ----------
Current time: 10:19:13
Optimization terminated successfully.
Current time: 10:25:30
Computational time: 0:06:16.314181
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 1.000000.
Adjusted R-squared for Prediction step I: 1.000000.
R-squared for MI 3 selection: 1.000000.
Adjusted R-squared for MI 3 selection: 1.000000.
R-squared for MI 5 selection: 1.000000.
Adjusted R-squared for MI 5 selection: 1.000000.


### Albania

In [25]:
# Regressor sets for Albania: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "native population - Total",
    ],
    [
        "y_prev_2",
        "native population - Total",
        "internal_migration - Foreign country",
        "Free activities in voluntary associations",
    ],
]

In [26]:
# Fit the spatial error model for Albania and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
print("-------------------------------", "Albania", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Albania", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_albania1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection', 'MI 10 selection', 'MI 15 selection', 'Manual selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_albania1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_albania1_fitted_values.csv", sep = "\t")

------------------------------- Albania -------------------------------
---------- Step I ----------
Current time: 10:25:44
Optimization terminated successfully.
Current time: 10:29:20
Computational time: 0:03:36.128000
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 1.000000.
Adjusted R-squared for Prediction step I: 1.000000.
R-squared for MI 3 selection: 1.000000.
Adjusted R-squared for MI 3 selection: 1.000000.
R-squared for MI 5 selection: 1.000000.
Adjusted R-squared for MI 5 selection: 1.000000.


### Tunisia

In [27]:
# Regressor sets for Tunisia: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "Pay money to an association",
    ],
    [
        "y_prev_2",
        "Pay money to an association",
        "internal_migration - Foreign country",
        "native population - Total",
    ],
]

In [28]:
# Fit the spatial error model for Tunisia and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
print("-------------------------------", "Tunisia", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Tunisia", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_tunisia1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection', 'MI 10 selection', 'MI 15 selection', 'Manual selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_tunisia1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_tunisia1_fitted_values.csv", sep = "\t")

------------------------------- Tunisia -------------------------------
---------- Step I ----------
Current time: 10:29:28
Optimization terminated successfully.
Current time: 10:32:13
Computational time: 0:02:45.331969
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 0.999000.
Adjusted R-squared for Prediction step I: 0.999000.
R-squared for MI 3 selection: 0.998000.
Adjusted R-squared for MI 3 selection: 0.998000.
R-squared for MI 5 selection: 0.999000.
Adjusted R-squared for MI 5 selection: 0.998000.


### Egypt

In [29]:
# Regressor sets for Egypt: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "native population - Total",
    ],
    [
        "y_prev_2",
        "native population - Total",
        "Pay money to an association",
        "Non food",
    ],
]

In [30]:
# Fit the spatial error model for Egypt and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
print("-------------------------------", "Egypt", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Egypt", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_egypt1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_egypt1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_egypt1_fitted_values.csv", sep = "\t")

------------------------------- Egypt -------------------------------
---------- Step I ----------
Current time: 10:32:26
Optimization terminated successfully.
Current time: 10:34:26
Computational time: 0:02:00.009126
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 1.000000.
Adjusted R-squared for Prediction step I: 1.000000.
R-squared for MI 3 selection: 0.999000.
Adjusted R-squared for MI 3 selection: 0.999000.
R-squared for MI 5 selection: 1.000000.
Adjusted R-squared for MI 5 selection: 1.000000.


### Ecuador

In [31]:
# Regressor sets for Ecuador: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "native population - Total",
    ],
    [
        "y_prev_2",
        "native population - Total",
        "internal_migration - Foreign country",
        "Pay money to an association",
    ],
]

In [32]:
# Fit the spatial error model for Ecuador and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
print("-------------------------------", "Ecuador", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Ecuador", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_ecuador1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection', 'MI 10 selection', 'MI 15 selection', 'Manual selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_ecuador1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_ecuador1_fitted_values.csv", sep = "\t")

------------------------------- Ecuador -------------------------------
---------- Step I ----------
Current time: 10:34:40
Desired error not necessarily achieved due to precision loss.
Current time: 10:40:42
Computational time: 0:06:02.075289
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 1.000000.
Adjusted R-squared for Prediction step I: 1.000000.
R-squared for MI 3 selection: 1.000000.
Adjusted R-squared for MI 3 selection: 1.000000.
R-squared for MI 5 selection: 1.000000.
Adjusted R-squared for MI 5 selection: 1.000000.


### Peru

In [33]:
# Regressor sets for Peru: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "native population - Total",
    ],
    [
        "y_prev_2",
        "native population - Total",
        "internal_migration - Foreign country",
        "Free activities in voluntary associations",
    ],
]

In [34]:
# Fit the spatial error model for Peru and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
print("-------------------------------", "Peru", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Peru", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_peru1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection', 'MI 10 selection', 'MI 15 selection', 'Manual selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_peru1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_peru1_fitted_values.csv", sep = "\t")

------------------------------- Peru -------------------------------
---------- Step I ----------
Current time: 10:40:50
Optimization terminated successfully.
Current time: 10:47:10
Computational time: 0:06:20.320980
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 1.000000.
Adjusted R-squared for Prediction step I: 1.000000.
R-squared for MI 3 selection: 1.000000.
Adjusted R-squared for MI 3 selection: 1.000000.
R-squared for MI 5 selection: 1.000000.
Adjusted R-squared for MI 5 selection: 1.000000.


### China

In [35]:
# Regressor sets for China: a two-variable set and a four-variable set that
# extends it. The list is passed to sem.run_model as its variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "Born alive",
    ],
    [
        "y_prev_2",
        "Born alive",
        "internal_migration - Foreign country",
        "political_info - Some times in a week",
    ],
]

In [36]:
# Fit the spatial error model for China and export the fitted values and
# parameters. The last cross-section is held out (train_test=True, test_size=1).
print("-------------------------------", "China", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "China", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_china1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection', 'MI 10 selection', 'MI 15 selection', 'Manual selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_china1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_china1_fitted_values.csv", sep = "\t")

------------------------------- China -------------------------------
---------- Step I ----------
Current time: 10:47:33
Optimization terminated successfully.
Current time: 10:49:30
Computational time: 0:01:57.362852
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 0.999000.
Adjusted R-squared for Prediction step I: 0.999000.
R-squared for MI 3 selection: 0.998000.
Adjusted R-squared for MI 3 selection: 0.998000.
R-squared for MI 5 selection: 0.999000.
Adjusted R-squared for MI 5 selection: 0.999000.


### Philippines

In [37]:
# Regressor sets for the Philippines: a two-variable set and a four-variable
# set that extends it. The list is passed to sem.run_model as its
# variable-sets argument.
vars_ = [
    [
        "y_prev_2",
        "native population - Total",
    ],
    [
        "y_prev_2",
        "native population - Total",
        "internal_migration - Foreign country",
        "Other goods and services",
    ],
]

In [38]:
# Fit the spatial error model for the Philippines and export the fitted values
# and parameters. The last cross-section is held out (train_test=True,
# test_size=1).
print("-------------------------------", "Philippines", "-------------------------------")
res_pred, res_params = sem.run_model(
    y, "Philippines", years, "Italia", xs_zones, temp_W, temp_W.columns.tolist(), vars_,
    False, palette, "Spatial Error Model",
    save = True, path = directory+"/spatial_autocorr_model_philippines1",
    data_hat = y_italia_pred, train_test = True, test_size = 1,
)

# The first three entries of res_params are stored under "beta", "a" and "rho".
res_params_dict = defaultdict(dict)
for i, r in zip(["beta", "a", "rho"], res_params[:3]):
    # isinstance (instead of an exact type comparison) also accepts ndarray
    # subclasses, and tolist() converts arrays of any rank as well as NumPy
    # scalars into plain Python lists/numbers before they reach json.dump.
    res_params_dict[i] = r.tolist() if isinstance(r, (np.ndarray, np.generic)) else r

# res_params[3] holds one coefficient sequence per variable set in vars_.
# zip stops at the shortest input, so labels beyond len(vars_) are unused.
name = ['MI 3 selection','MI 5 selection', 'MI 7 selection', 'MI 10 selection', 'MI 15 selection', 'Manual selection']
for i, r, n in zip(vars_, res_params[3], name):
    for j, s in zip(i, r):
        res_params_dict[n][j] = s

with open(directory+"/spatial_autocorr_model_philippines1_fitted_values_params.txt", 'w') as outfile:
    json.dump(res_params_dict, outfile)

res_pred.to_csv(directory+"/spatial_autocorr_model_philippines1_fitted_values.csv", sep = "\t")

------------------------------- Philippines -------------------------------
---------- Step I ----------
Current time: 10:49:42
Optimization terminated successfully.
Current time: 10:54:21
Computational time: 0:04:38.805967
---------- Step II ----------
---------- Validation ----------
R-squared for Prediction step I: 0.999000.
Adjusted R-squared for Prediction step I: 0.999000.
R-squared for MI 3 selection: 0.997000.
Adjusted R-squared for MI 3 selection: 0.997000.
R-squared for MI 5 selection: 0.999000.
Adjusted R-squared for MI 5 selection: 0.998000.
