In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Show every column when a DataFrame is displayed (optional - only for this example).
pd.options.display.max_columns = None

Load the data from the Excel file

The data lives on a sheet other than the default one.
The file also contains some rows that are not useful; these are skipped.

In [None]:
# Reader settings: pick the sheet at position 1 and skip the
# 20 leading and 2 trailing rows of the sheet.
excel_options = {
    'sheet_name': 1,
    'skiprows': 20,
    'skipfooter': 2,
}
df = pd.read_excel('Canada.xlsx', **excel_options)
df.head()

summarize data

In [None]:
# Column names, non-null counts and dtypes; buf=None is the default output target.
df.info(buf=None)

check missing entry

In [None]:
# Number of missing entries per column (isna is the same check as isnull).
df.isna().sum()

describe numerical data

In [None]:
# With no include/exclude arguments only the numeric columns are considered.
numeric_summary = df.describe()
numeric_summary

check the names of numerical and categorical columns

In [None]:
# Split the column labels into numeric and non-numeric groups and list each group.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
non_numeric_cols = df.select_dtypes(exclude='number').columns.tolist()

print('columns with numeric data')
print(numeric_cols)
print('columns with non numeric data')  # label typo ("numerica") corrected
print(non_numeric_cols)

describe non-numerical data

In [None]:
# Summary of every column that is not numeric.
non_numeric_summary = df.describe(exclude='number')
non_numeric_summary

# data wrangling

In [None]:
# Preview the first ten rows before wrangling.
df.iloc[:10]

Drop columns that are not required

In [None]:
# Columns that are not needed for the analysis.
cols_to_drop = ['Type', 'Coverage', 'AREA', 'DEV', 'REG']

# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=cols_to_drop, errors='ignore')
df.head()

Rename the columns to something meaningful

In [None]:
# Map the original column labels to readable names.
rename_dict = dict(
    OdName='Country',
    AreaName='Continent',
    RegName='Region',
    DevName='Status',
)
df.rename(columns=rename_dict, inplace=True)
df.head()


add country wise total immigration

In [None]:
# Column labels of the yearly counts: 1980 through 2013 inclusive.
years = [*range(1980, 2014)]

# Row-wise sum across the year columns gives each row's total.
df['Total'] = df.loc[:, years].sum(axis='columns')
df.head()

Since each country is unique, I will use Country as the index

In [None]:
# Use Country as the row index. The guard makes the cell safe to re-run:
# once Country is the index it is no longer a column, and a second
# set_index('Country') would raise KeyError.
if 'Country' in df.columns:
    # verify_integrity=True raises if a country name appears more than once,
    # which enforces the uniqueness the analysis relies on.
    df = df.set_index('Country', verify_integrity=True)
df.head()

# Analysis
- Analyse a single country's immigration trend
- Compare the trends of multiple countries
- Show the trend of total immigration per year
- Compare the immigration trends of the top five countries

In [None]:
# Print all index labels so valid lookup keys are visible.
print(list(df.index))

In [None]:
# First five rows of the current frame.
df.head(n=5)

In [None]:
# One variable drives both the row lookup and the title so they cannot disagree
# (the original plotted 'Algeria' under the title "India's immigration trend").
country = 'India'
df.loc[country, years].plot(
    kind='area',
    figsize=(15, 5),
    title=f"{country}'s immigration trend",
    alpha=0.5,
)

In [None]:
# Countries to compare; transposing puts years on the rows so each country is one series.
countries = ['India', 'Pakistan']
trend_by_year = df.loc[countries, years].transpose()
trend_by_year.plot(kind='area', stacked=False, alpha=0.2, figsize=(15, 5))

In [None]:
# Order rows from largest to smallest Total so head() yields the top rows.
df.sort_values(by=['Total'], ascending=[False], inplace=True)

In [None]:
# Yearly figures of the first five rows, with years as rows.
df.iloc[:5][years].transpose()

In [None]:
# Index labels of the first five rows, then their yearly figures.
countries = list(df.index[:5])
df.loc[countries, years]

In [None]:
# Unstacked, translucent area chart of the first five rows' yearly figures.
top_rows_by_year = df.iloc[:5][years].transpose()
top_rows_by_year.plot.area(stacked=False, alpha=0.2, figsize=(15, 5))

In [None]:
# Line chart (the default plot kind) of the Philippines row across the year columns.
df.loc['Philippines', years].plot(kind='line', figsize=(15, 5))

In [None]:
# Mean of the Philippines row over the year columns.
philippines_by_year = df.loc['Philippines', years]
philippines_by_year.mean()

In [None]:
# Annotated trend of the Philippines row. The series and its mean are computed once
# and reused, and the average label is formatted from the computed mean rather than
# a hard-coded figure, so the text always agrees with the dotted line.
philippines = df.loc['Philippines', years]
avg_immigration = philippines.mean()

philippines.plot(figsize=(15, 5))

# Put one tick per year at positions 0..len(years)-1.
plt.xticks(ticks=range(0, len(years)), labels=years, rotation=90)
plt.text(0, 30000, "Philippines migration trend",
         fontdict={'color': 'red', 'fontsize': '14'})

# The x coordinates below are positions into `years` (13 -> 1993, 30 -> 2010).
# NOTE(review): assumes the line is drawn at positional x values 0..33 - confirm.
plt.annotate("1993\n immigration \nspikes", (13, 20000), (8, 25000),
             arrowprops={'arrowstyle': '->'})
plt.annotate("2010 \nimmigration \nspikes", (30, 39000), (25, 30000),
             arrowprops={'arrowstyle': '->'})
plt.vlines(30, 0, 50000, linestyle=':')
plt.vlines(13, 0, 50000, linestyle=':')

# Dotted horizontal line at the mean, labelled just above it.
plt.hlines(avg_immigration, 0, 34, linestyles=":", colors='red')
plt.text(0, avg_immigration + 600, f"Avg Immigration {avg_immigration:,.0f}/yr")
plt.show()

In [None]:
# Independent copy of the first five rows' yearly figures, with years as rows.
top5df = df.iloc[:5][years].transpose().copy()
top5df.head(n=5)

In [None]:
# Interactive area chart of two columns of top5df against its index.
# The listed names must be columns of top5df.
plotted_countries = ['India', 'China']
px.area(data_frame=top5df, x=top5df.index, y=plotted_countries)

In [None]:
# Sum down the rows: one total per year column.
df[years].sum(axis='index')

In [None]:
# Total per year column, drawn as a funnel chart with years on x and totals on y.
year_wise_trend = df[years].sum()
fig = px.funnel(
    data_frame=year_wise_trend,
    x=year_wise_trend.index,
    y=year_wise_trend.values,
    title="Year wise immigration Trend",
    height=600,
)
fig.show()

In [None]:
# First five rows of the current frame.
df.head(n=5)


In [46]:
# World map coloured by the Total column; index labels are matched as country names.
choropleth_options = dict(
    locations=df.index,
    locationmode='country names',
    color='Total',
    projection='orthographic',
    color_continuous_scale='Rainbow',
    height=600,
    width=600,
)
px.choropleth(data_frame=df, **choropleth_options)

In [47]:
# Sunburst of Total over the hierarchy Continent -> Status -> Region.
# NOTE(review): `names` is passed together with `path`; the sector labels appear to be
# derived from `path`, so `names` may have no effect - confirm against the plotly docs.
hierarchy = ['Continent', 'Status', 'Region']
px.sunburst(data_frame=df, path=hierarchy, values='Total', names=df.index)

In [48]:
# Treemap of Total over the hierarchy Continent -> Status -> Region.
# NOTE(review): `names` is passed together with `path`; the tile labels appear to be
# derived from `path`, so `names` may have no effect - confirm against the plotly docs.
hierarchy = ['Continent', 'Status', 'Region']
px.treemap(data_frame=df, path=hierarchy, values='Total', names=df.index)

In [50]:
# 3-D scatter with the 1980, 1981 and 1982 columns on the x, y and z axes.
axis_columns = {'x': 1980, 'y': 1981, 'z': 1982}
px.scatter_3d(data_frame=df, **axis_columns)