In [332]:
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [333]:
import pandas as pd
import numpy as np
import re

# url = "https://data.gov.au/data/dataset/2fe5e2a9-8a3d-4dcf-baec-c5147d953150/resource/b59a15df-86ea-4c4c-95be-4dd9fc9f8ac4/download/2019-20-historical-migration-statistics-locked.xlsx"
url = "./datasets/2019-20-historical-migration-statistics-locked.xlsx"

# Table 1.1: Permanent and long-term arrivals, October 1945 to June 1959
df1= pd.read_excel(url, sheet_name="1.1", header=7 )
# Table 1.2: Settler arrivals, January 1959 to June 1975  
df2= pd.read_excel(url, sheet_name="1.2", header=7 )
#Table 1.3: Settler arrivals, 1975–76 to 1994–95  
df3= pd.read_excel(url, sheet_name="1.3", header=7 )
#Table 1.4: Settler arrivals, 1995–96
df4= pd.read_excel(url, sheet_name="1.4", header=7 )
# Table 2.1: Permanent additions, 1996–97 to 2007–08
df5= pd.read_excel(url, sheet_name="2.1", header=7 )
# Table 2.2: Permanent additions, 2008–09 to 2016–17
df6= pd.read_excel(url, sheet_name="2.1", header=7 )

#Table 3.2: The permanent migration program outcome by stream and citizenship, 1996–97 to 2016–17
df7= pd.read_excel(url, sheet_name="3.2", header=7 )




frames = [df1, df2, df3, df4, df5, df6, df7]

stores_df = pd.concat(frames)



In [334]:

# stores_df = pd.read_excel("datasets/2019-20-historical-migration-statistics-locked.xlsx", sheet_name="1.2", header=7 )

stores_df = stores_df.iloc[: , :-1]
stores_df = stores_df.iloc[: , 1:]
stores_df = stores_df.melt(id_vars=["Year"], 
        var_name="country", 
        value_name="migration_no")
stores_df.rename(index=str, columns={"Year": "year"}, inplace=True)

stores_df.head()

Unnamed: 0,year,country,migration_no
0,Oct 1945-Jun 1947,Fiji,282.0
1,1947–48,Fiji,200.0
2,1948–49,Fiji,190.0
3,1949–50,Fiji,180.0
4,1950–51,Fiji,245.0


In [335]:
stores_df.dropna(subset=['migration_no', 'country' ], inplace=True) 

stores_df['year'].replace(to_replace=".*([0-9]{4}).*", value=r"\1", regex=True, inplace=True)

stores_df['migration_no'].replace(to_replace="\D*(\d*)", value=r"\1", regex=True, inplace=True)

stores_df[['year','migration_no']] = stores_df[['year','migration_no']].astype(int)

stores_df = stores_df[~stores_df.country.str.contains('|'.join(['Australia', 'Other', 'total', 'Total']))]

stores_df.dtypes




Unnamed: 0,year,country,migration_no
6764,1947,China,1700
6765,1947,China,827
6766,1948,China,1285
6767,1949,China,1499
6768,1950,China,708
...,...,...,...
120669,2012,Hong Kong (SAR of China).2,5
120670,2013,Hong Kong (SAR of China).2,0
120671,2014,Hong Kong (SAR of China).2,0
120672,2015,Hong Kong (SAR of China).2,5


In [336]:
remove = ['.1', '.2', '.3', '.4', '']
rexpress = '[' + re.escape (''. join (remove)) + ']'
stores_df['country'] = stores_df['country'].str.replace(rexpress, '', regex=True)

remove_words = [', Dem Peoples Rep Of', ', People\'s Republic of', 'and Ireland', '\x28excludes SARs and Taiwan\x29', '\x28SAR of China\x29']
rexpress = r'\b(?:{})\b'.format( '|'.join(remove_words))
stores_df['country'] = stores_df['country'].str.replace(rexpress, '', regex=True)

stores_df['country'].replace(["U.S.S.R."], "Russia",  inplace=True)
# stores_df[stores_df['country'].str.contains("SAR of China")]


Unnamed: 0,year,country,migration_no


In [337]:


import pycountry
list_countries = stores_df['country'].unique().tolist()
d_country_code = {} 
for country in list_countries:
    try:
        country_data = pycountry.countries.search_fuzzy(country)
        country_code = country_data[0].alpha_3
        d_country_code.update({country: country_code})
    except:
        print('\n could not add ISO 3 code for:', country)
        d_country_code.update({country: np.nan})
print(d_country_code)


 could not add ISO 3 code for: Czechoslovakia

 could not add ISO 3 code for: USSR

 could not add ISO 3 code for: Yugoslavia

 could not add ISO 3 code for: Unnamed: 68

 could not add ISO 3 code for: At Sea & Not Stated

 could not add ISO 3 code for: Unnamed: 77

 could not add ISO 3 code for: Melanesia nfd

 could not add ISO 3 code for: Micronesia nfd

 could not add ISO 3 code for: Polynesia excluding Hawaii nfd

 could not add ISO 3 code for: Samoa, American

 could not add ISO 3 code for: Samoa, Western

 could not add ISO 3 code for: Eastern Europe nfd

 could not add ISO 3 code for: Faeroe Islands

 could not add ISO 3 code for: Fmr Czechoslovakia

 could not add ISO 3 code for: Former Yugoslavia

 could not add ISO 3 code for: Germany, Federal Republic of

 could not add ISO 3 code for: Northern Europe nfd

 could not add ISO 3 code for: Southern Europe nfd

 could not add ISO 3 code for: Western Europe nfd

 could not add ISO 3 code for: Fmr USSR & the Baltic States

 coul

In [338]:
for k, v in d_country_code.items():
    stores_df.loc[(stores_df.country == k), 'iso_alpha'] = v

stores_df.dropna(subset=['iso_alpha' ], inplace=True) 

stores_df

Unnamed: 0,year,country,migration_no,iso_alpha
0,1947,Fiji,282,FJI
1,1947,Fiji,200,FJI
2,1948,Fiji,190,FJI
3,1949,Fiji,180,FJI
4,1950,Fiji,245,FJI
...,...,...,...,...
136155,2012,Zimbabwe,8,ZWE
136156,2013,Zimbabwe,5,ZWE
136157,2014,Zimbabwe,5,ZWE
136158,2015,Zimbabwe,5,ZWE


In [344]:
import plotly.express as px
gapminder = px.data.gapminder()
px.choropleth(stores_df,               
                    locations="iso_alpha",               
                    color="migration_no",
                    hover_name="country",  
                    animation_frame="year",    
                    range_color=[20,80],  
                    color_continuous_scale=px.colors.sequential.tempo,  
                    height=600             
)

