In [1]:
import pandas as pd
import numpy as np
import time
import random
import json
import operator
import pycountry
import pycountry_convert
from iso3166 import countries

### Merge the information tables with the main (ISTAT) table, after making country names coherent across languages

In [2]:
# Load the main table (tab-separated) and preview its first rows.
resident_foreigners_norm = pd.read_csv("Data_final/resident_foreigners_norm.csv", sep="\t")
resident_foreigners_norm.head()

Unnamed: 0,Province,Country,Year,Gender,Value
0,Biella,Andorra,2005,male,1
1,Biella,Andorra,2005,female,0
2,Novara,Andorra,2005,male,0
3,Novara,Andorra,2005,female,2
4,Alessandria,Andorra,2017,male,0


In [3]:
# Load the country-name coherence table (tab-separated): ISO codes alongside
# the English, Italian and ISTAT-Italian spellings of each country.
country_name = pd.read_csv("Data_final/country_name_coherence.csv", sep="\t")
country_name.head()

Unnamed: 0,iso2,iso3,english name,italian name,italian name istat
0,AD,AND,Andorra,Andorra,Andorra
1,AE,ARE,United Arab Emirates,Emirati Arabi Uniti,Emirati Arabi Uniti
2,AF,AFG,Afghanistan,Afghanistan,Afghanistan
3,AG,ATG,Antigua and Barbuda,Antigua e Barbuda,Antigua e Barbuda
4,AI,AIA,Anguilla,Anguilla,Anguilla


In [4]:
# Disabled scratch code: it is wrapped in a string literal, so nothing below is
# executed and the cell merely echoes the string. If enabled, it would write one
# query stanza per country (positions 240-259 of the "english name" column) to
# "testfile.txt".
# NOTE(review): the stanzas look like Overpass Turbo queries — confirm, or delete
# this dead cell.
'''
file = open("testfile.txt", "w") 
for c in country_name["english name"].values[240:260]:
    file.write("{{geocodeArea: "+c+"\n")
    file.write("}};"+"\n")
    file.write("rel(pivot);"+"\n")
    file.write("out body geom;"+"\n")
file.close()
'''

'\nfile = open("testfile.txt", "w") \nfor c in country_name["english name"].values[240:260]:\n    file.write("{{geocodeArea: "+c+"\n")\n    file.write("}};"+"\n")\n    file.write("rel(pivot);"+"\n")\n    file.write("out body geom;"+"\n")\nfile.close()\n'

In [12]:
# Replace each ISTAT (Italian) country name with its iso3 code.
# A single lookup Series is built once and applied with `map`, instead of running
# one full-column `replace` per distinct country with an array as replacement value.
name_to_iso3 = (
    country_name
    .dropna(subset=["italian name istat", "iso3"])
    # `map` needs a unique index; keep the first code if a name is listed twice.
    .drop_duplicates(subset="italian name istat")
    .set_index("italian name istat")["iso3"]
)
istat_names = resident_foreigners_norm["Country"]
# Values with no entry in the lookup are kept unchanged rather than turned into NaN.
resident_foreigners_norm["Country"] = istat_names.map(name_to_iso3).fillna(istat_names)
resident_foreigners_norm.head()

Unnamed: 0,Province,Country,Year,Gender,Value
0,Biella,AND,2005,male,1
1,Biella,AND,2005,female,0
2,Novara,AND,2005,male,0
3,Novara,AND,2005,female,2
4,Alessandria,AND,2017,male,0


In [14]:
# Load the UNESCO demographic indicators (tab-separated) and preview them.
unesco_demo = pd.read_csv("Data_final/unesco_demo.csv", sep="\t")
unesco_demo.head()

Unnamed: 0,Indicator,iso3,Year,Value,Flag
0,SP_DYN_TFRT_IN,AUS,2003,1.748,
1,SP_DYN_TFRT_IN,AUS,2004,1.768,
2,SP_DYN_TFRT_IN,AUS,2005,1.807,
3,SP_DYN_TFRT_IN,AUS,2006,1.908,
4,SP_DYN_TFRT_IN,AUS,2007,1.959,


In [15]:
# Load the UNESCO education indicators (tab-separated) and preview them.
unesco_education = pd.read_csv("Data_final/unesco_education.csv", sep="\t")
unesco_education.head()

Unnamed: 0,Indicator,iso3,Year,Value,Flag
0,XGOVEXP_IMF,BFA,2005,19.49419,
1,XGOVEXP_IMF,BFA,2006,18.2017,
2,XGOVEXP_IMF,BFA,2007,17.86382,
3,XGOVEXP_IMF,BFA,2010,17.25081,
4,XGOVEXP_IMF,BFA,2011,20.31364,


In [16]:
# Load the CEPII bilateral distance table (tab-separated); rows are keyed by an
# origin code (iso_o) and a destination code (iso_d).
geo_dist = pd.read_csv("Data_final/cepii_geo_dist.csv", sep="\t")
geo_dist.head()

Unnamed: 0,iso_o,iso_d,contig,comlang_off,comlang_ethno,colony,comcol,curcol,col45,smctry,dist,distcap,distw,distwces
0,ABW,ABW,0,0,0,0,0,0,0,0,5225315,5225315,2509354,2304723
1,ABW,AFG,0,0,0,0,0,0,0,0,1325781,1325781,1316822,1316637
2,ABW,AGO,0,0,0,0,0,0,0,0,9516913,9516913,9587316,9584193
3,ABW,AIA,0,0,1,0,0,0,0,0,9832682,9832682,9768974,9768916
4,ABW,ALB,0,0,0,0,0,0,0,0,9091742,9091742,9091576,9091466


In [18]:
# Load the CEPII per-country geographic information table (tab-separated).
geo_info = pd.read_csv("Data_final/cepii_geo_info.csv", sep="\t")
geo_info.head()

Unnamed: 0,iso3,country,area,dis_int,landlocked,continent,maincity,citynum,langoff_1,langoff_2,...,lang9_2,lang9_3,lang9_4,colonizer1,colonizer2,colonizer3,colonizer4,short_colonizer1,short_colonizer2,short_colonizer3
0,ABW,Aruba,193,5.225315,0,America,1,2,Dutch,Spanish,...,.,.,.,NLD,.,.,.,.,.,.
1,AFG,Afghanistan,652225,303.7614,1,Asia,1,25,Persian,.,...,Uzbek,.,.,.,.,.,.,GBR,.,.
2,AGO,Angola,1246700,419.9666,0,Africa,1,25,Portuguese,.,...,.,.,.,PRT,.,.,.,.,.,.
3,AIA,Anguilla,102,3.79869,0,America,1,2,English,.,...,.,.,.,GBR,.,.,.,.,.,.
4,ALB,Albania,28748,63.77311,0,Europe,1,25,Albanian,.,...,.,.,.,TUR,.,.,.,.,.,.


In [19]:
# Keep only the rows about countries that appear in the ISTAT table, and
# renumber each filtered table from 0.
istat_countries = list(set(resident_foreigners_norm["Country"]))

unesco_demo = unesco_demo.loc[unesco_demo["iso3"].isin(istat_countries)].reset_index(drop=True)
unesco_education = unesco_education.loc[unesco_education["iso3"].isin(istat_countries)].reset_index(drop=True)

geo_info = geo_info.loc[geo_info["iso3"].isin(istat_countries)].reset_index(drop=True)
# The distance table is filtered on its destination code (iso_d) only.
geo_dist = geo_dist[geo_dist["iso_d"].isin(istat_countries)].reset_index(drop=True)

Check whether any of the information tables lack data for some of the countries contained in the ISTAT database.

In [20]:
# ISTAT country codes that have no rows in the UNESCO demographic table.
# `unique()` keeps first-appearance order, so the printed list is reproducible
# across runs (unlike iterating over a set).
istat_codes = resident_foreigners_norm["Country"]
unesco_demo_missing = istat_codes[~istat_codes.isin(unesco_demo["iso3"])].unique().tolist()

for c in unesco_demo_missing:
    try:
        print(pycountry.countries.get(alpha_3=c).name)
    except (KeyError, AttributeError):
        # Code unknown to pycountry: depending on the pycountry release the lookup
        # either raises KeyError or yields None. Fall back to the raw code.
        print(c)

Taiwan, Province of China


In [23]:
# ISTAT country codes that have no rows in the UNESCO education table.
# `unique()` keeps first-appearance order, so the printed list is reproducible
# across runs (unlike iterating over a set).
istat_codes = resident_foreigners_norm["Country"]
unesco_education_missing = istat_codes[~istat_codes.isin(unesco_education["iso3"])].unique().tolist()

for c in unesco_education_missing:
    try:
        print(pycountry.countries.get(alpha_3=c).name)
    except (KeyError, AttributeError):
        # Only lookup failures are caught (a bare `except:` would also swallow
        # KeyboardInterrupt and real bugs). Depending on the pycountry release an
        # unknown code either raises KeyError or yields None; print the raw code.
        print(c)

Macedonia, Republic of
Suriname
Bosnia and Herzegovina
Tuvalu
Montenegro
Bahamas
Libya
Iraq
Kiribati
YUG
Uzbekistan
Palestine, State of
Somalia
Korea, Democratic People's Republic of
Nigeria
China
Haiti
Nauru
Cuba
Myanmar
Palau
Taiwan, Province of China
Equatorial Guinea
Papua New Guinea


In [24]:
# ISTAT country codes that have no row in the CEPII geographic information table.
# `unique()` keeps first-appearance order, so the printed list is reproducible
# across runs (unlike iterating over a set).
istat_codes = resident_foreigners_norm["Country"]
geo_info_missing = istat_codes[~istat_codes.isin(geo_info["iso3"])].unique().tolist()

for c in geo_info_missing:
    try:
        # Look the country up once and print both its name and its alpha-2 code.
        country = pycountry.countries.get(alpha_3=c)
        print(country.name, country.alpha_2)
    except (KeyError, AttributeError):
        # Code unknown to pycountry: depending on the pycountry release the lookup
        # either raises KeyError or yields None. Fall back to the raw code.
        print(c)

Liechtenstein LI
South Sudan SS
Monaco MC
Palestine, State of PS


In [30]:
# ISTAT country codes that never appear as a destination (iso_d) in the CEPII
# distance table. `unique()` keeps first-appearance order, so the printed list is
# reproducible across runs (unlike iterating over a set).
istat_codes = resident_foreigners_norm["Country"]
geo_dist_missing = istat_codes[~istat_codes.isin(geo_dist["iso_d"])].unique().tolist()

for c in geo_dist_missing:
    try:
        print(pycountry.countries.get(alpha_3=c).name)
    except (KeyError, AttributeError):
        # Code unknown to pycountry: depending on the pycountry release the lookup
        # either raises KeyError or yields None. Fall back to the raw code.
        print(c)

Liechtenstein
South Sudan
Monaco
Palestine, State of


In [31]:
# Manually add at least the continent for the countries whose geographic
# information is missing; every other column of these rows is left empty (NaN).
manual_continents = {
    "Monaco": "Europe",
    "Liechtenstein": "Europe",
    "South Sudan": "Africa",  # previously entered as "Asia"; South Sudan is in Africa
    "Palestine, State of": "Asia",
}
manual_rows = pd.DataFrame(
    [
        {"iso3": pycountry.countries.get(name=name).alpha_3, "country": name, "continent": continent}
        for name, continent in manual_continents.items()
    ]
)
# Skip codes that are already present so re-running the cell does not duplicate rows.
manual_rows = manual_rows[~manual_rows["iso3"].isin(geo_info["iso3"])]
if not manual_rows.empty:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    geo_info = pd.concat([geo_info, manual_rows], ignore_index=True)

In [32]:
# Write every table back to its tab-separated file, without the index column.
# Note: these are the same paths the tables were loaded from, so the inputs are overwritten.
tables_to_save = {
    "Data_final/resident_foreigners_norm.csv": resident_foreigners_norm,
    "Data_final/cepii_geo_info.csv": geo_info,
    "Data_final/cepii_geo_dist.csv": geo_dist,
    "Data_final/unesco_demo.csv": unesco_demo,
    "Data_final/unesco_education.csv": unesco_education,
}
for path, table in tables_to_save.items():
    table.to_csv(path, sep="\t", index=False)