# Pre-Processing Phase

## Importing all the important libraries

In [87]:
import pandas as pd
import numpy as np
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.model_selection
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
print("Initial setup is complete")


Initial setup is complete


In [145]:
import warnings
warnings.filterwarnings("ignore")
from pandas import datetime
def parser(x):
    return datetime.strptime(x, '%Y')
migration_filepath = "C:\\Users\\DELL\\Downloads\\migration_population.csv"
mig = pd.read_csv(migration_filepath, parse_dates=[1], date_parser = parser)
mig.head()

Unnamed: 0,country,year,population,pop_density,net_migration,migration_perc,iso3c,iso2c,region,incomeLevel,lendingType,capitalCity,longitude,latitude
0,Afghanistan,2018-01-01,37172386,56.93776,,,AFG,AF,South Asia,Low income,IDA,Kabul,69.1761,34.5228
1,Afghanistan,2017-01-01,36296400,55.595993,-314602.0,-0.008668,AFG,AF,South Asia,Low income,IDA,Kabul,69.1761,34.5228
2,Afghanistan,2016-01-01,35383128,54.197114,,,AFG,AF,South Asia,Low income,IDA,Kabul,69.1761,34.5228
3,Afghanistan,2015-01-01,34413603,52.712071,,,AFG,AF,South Asia,Low income,IDA,Kabul,69.1761,34.5228
4,Afghanistan,2014-01-01,33370794,51.114778,,,AFG,AF,South Asia,Low income,IDA,Kabul,69.1761,34.5228


In [147]:
mig.isnull().sum()

country             0
year                0
population          0
pop_density        19
net_migration     893
migration_perc    893
iso3c               0
iso2c               0
region              0
incomeLevel         0
lendingType         0
capitalCity         0
longitude           0
latitude            0
dtype: int64

In [148]:
mig['net_migration'].mean()

-219526.47368421053

In [149]:
mig['pop_density'].mean()

435.65773018513863

In [154]:
mig['migration_perc'].mean()

0.1688996086039246

In [151]:
mig.fillna(value=mig['migration_perc'].max(), inplace=True)

In [152]:
mig.fillna(value=mig['net_migration'].mean(), inplace=True)

In [153]:
mig.fillna(value=mig['pop_density'].mean(), inplace=True)

## Encoding categorical variables into numeric

In [96]:
mig["region"].value_counts()

South-East Asia    649
South Asia         472
Name: region, dtype: int64

In [97]:
mig["lendingType"].value_counts()

IDA               472
IBRD              413
Blend             118
Not classified    118
Name: lendingType, dtype: int64

In [98]:
mig["incomeLevel"].value_counts()

Lower middle income    649
Upper middle income    236
High income            118
Low income             118
Name: incomeLevel, dtype: int64

In [155]:
cleanup_nums = {"incomeLevel":     {"Low income": 1, "Lower middle income": 2, "Upper middle income": 3, "High income": 4}}
mig = mig.replace(cleanup_nums)


In [156]:
# One hot encoding of data
pd.get_dummies(mig, columns=["region","lendingType"]).head()

Unnamed: 0,country,year,population,pop_density,net_migration,migration_perc,iso3c,iso2c,incomeLevel,capitalCity,longitude,latitude,region_South Asia,region_South-East Asia,lendingType_Blend,lendingType_IBRD,lendingType_IDA,lendingType_Not classified
0,Afghanistan,2018-01-01,37172386,56.93776,0.212376,0.212376,AFG,AF,1,Kabul,69.1761,34.5228,1,0,0,0,1,0
1,Afghanistan,2017-01-01,36296400,55.595993,-314602.0,-0.008668,AFG,AF,1,Kabul,69.1761,34.5228,1,0,0,0,1,0
2,Afghanistan,2016-01-01,35383128,54.197114,0.212376,0.212376,AFG,AF,1,Kabul,69.1761,34.5228,1,0,0,0,1,0
3,Afghanistan,2015-01-01,34413603,52.712071,0.212376,0.212376,AFG,AF,1,Kabul,69.1761,34.5228,1,0,0,0,1,0
4,Afghanistan,2014-01-01,33370794,51.114778,0.212376,0.212376,AFG,AF,1,Kabul,69.1761,34.5228,1,0,0,0,1,0


## EDA Analysis

In [101]:
# Check for duplicates
idsTotal = mig.shape[0]
idsDupli = mig[mig['net_migration'].duplicated()]
print(f'There are {len(idsDupli)} duplicate IDs for {idsTotal} total entries')

There are 915 duplicate IDs for 1121 total entries


This shows that our Net Migration has some duplicate values. since a country is going to have a duplicated value for a period of time for migration

In [158]:
print ("Skew is:", mig.migration_perc.skew())
print("Kurtosis: %f" % mig.migration_perc.kurt())

Skew is: -1.6161667132537993
Kurtosis: 0.950273


We can see that our target variable is skewed towards the left. Therefore, we have to normalize it.

In [159]:
num_features = mig.select_dtypes(include=[np.number])
num_features.dtypes

population          int64
pop_density       float64
net_migration     float64
migration_perc    float64
incomeLevel         int64
longitude         float64
latitude          float64
dtype: object

In [160]:
corr=num_features.corr()
corr

Unnamed: 0,population,pop_density,net_migration,migration_perc,incomeLevel,longitude,latitude
population,1.0,-0.045008,-0.13825,-0.003132,-0.142487,-0.253184,0.249363
pop_density,-0.045008,1.0,0.025669,0.027144,0.473282,0.040553,-0.230298
net_migration,-0.13825,0.025669,1.0,0.355866,0.099991,0.042476,-0.104965
migration_perc,-0.003132,0.027144,0.355866,1.0,0.033807,0.003152,-0.015455
incomeLevel,-0.142487,0.473282,0.099991,0.033807,1.0,0.26185,-0.584813
longitude,-0.253184,0.040553,0.042476,0.003152,0.26185,1.0,-0.606429
latitude,0.249363,-0.230298,-0.104965,-0.015455,-0.584813,-0.606429,1.0


In [161]:
corr['net_migration'].sort_values(ascending=False)

net_migration     1.000000
migration_perc    0.355866
incomeLevel       0.099991
longitude         0.042476
pop_density       0.025669
latitude         -0.104965
population       -0.138250
Name: net_migration, dtype: float64

From the above result, we can see that migration_perc have the most positive correlation and the population has the lowest correlation with our target variable. It is totally different from our initial hypotheses, this variables was expected to have high impact in the increase of migration in these particular regions.

## Creating various visualizations

In [208]:
mig.loc[mig.country.isin(['Afghanistan'])][['net_migration']].iplot(
    kind='hist',
    histnorm='percent',
    barmode='overlay',
    xTitle='Classification of country',
    yTitle='(%) Countries invested',
    title='income levels and lending types of all countries')

In [163]:
mig.loc[mig.country.isin(['Bhutan', 'Maldives', 'Singapore'])]['pop_density'].iplot(kind='hist', xTitle='Population Distribution',
                  yTitle='count', title='Population density distribution for the two countries in comparison')

In [169]:
mig2 = mig.loc[mig.country.isin(['India'])][['year','pop_density','net_migration']].\
         set_index('year').\
         resample('M').mean()
mig2.iplot(kind='bar', xTitle='Date', yTitle='Average',
    title='Changes over the years, comparison of population density and net migration')

In [137]:
mig.pivot(columns='country', values='migration_perc').iplot(
        kind='box',
        yTitle='Migrants over the years[1960-2015]',
        title='Distribution of migrants by country')

In [170]:
tds = mig[mig['country'] == 'India'].\
         set_index('year')
# Plot read time as a time series
tds[['net_migration', 'incomeLevel', 'migration_perc']].iplot(
    y='net_migration', mode='lines+markers', secondary_y = 'migration_perc',
    secondary_y_title='Migration (%ge)', xTitle='Years', yTitle='pop_density',
    text='incomeLevel', title='Change in migration patterns with population density')

- Getting a nicely formatted time-series x-axis automatically
- Adding a secondary y-axis because our variables have different ranges
- Adding in the title of the articles as hover information

In [181]:
mig.iplot(
    x='incomeLevel',
    y='net_migration',
    # Specify the category
    categories='lendingType',
    xTitle='Income Level',
    yTitle='Net Migration',
    title='Income Level vs Net Migration by Lending Type of all countries')

In [191]:
colorscales = [
    'Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu', 'Reds', 'Blues',
    'Picnic', 'Rainbow', 'Portland', 'Jet', 'Hot', 'Blackbody', 'Earth',
    'Electric', 'Viridis', 'Cividis'
]

In [193]:
import plotly.figure_factory as ff

figure = ff.create_scatterplotmatrix(
    mig[['net_migration', 'pop_density', 'migration_perc',      
        'incomeLevel','year']], height = 1500, width = 1500,
    diag='histogram',
    index='year')
iplot(figure)

In [194]:
corrs = mig.corr()

figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    colorscale='Earth',
    annotation_text=corrs.round(2).values,
    showscale=True, reversescale=True)

figure.layout.margin = dict(l=200, t=200)
figure.layout.height = 800
figure.layout.width = 1000

iplot(figure)


In [197]:
mig.loc[mig.country.isin(['India'])].set_index('year')[['net_migration', 'migration_perc']].iplot(
    kind='spread', mode='markers', yTitle='Number', title='Spread between net migration and migration percentage for each country ')