In [None]:
# Standard imports
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Load the 'Canada by Citizenship' sheet from the remote workbook, ignoring
# the first 20 rows of the sheet and its last 2 rows.
df = pd.read_excel(
    io='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/Canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)

In [None]:
# Preview the first rows of the freshly loaded sheet.
df.head()

In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# List the column labels as loaded from the sheet.
df.columns

Getting list of indices

In [None]:
# Show the row index of the frame.
df.index

In [None]:
# (rows, columns) before any columns are dropped.
df.shape

In [None]:
# Drop the AREA / REG / DEV / Type / Coverage columns.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it once the columns are already gone is a
# no-op instead of raising KeyError.
df = df.drop(columns=['AREA', 'REG', 'DEV', 'Type', 'Coverage'], errors='ignore')

In [None]:
# (rows, columns) again, to confirm the column count shrank after the drop.
df.shape

In [None]:
# Give the descriptive columns friendlier names, then show the resulting labels.
column_renames = {'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'}
df.rename(columns=column_renames, inplace=True)
df.columns

Let us add a total column

In [None]:
# Row-wise sum over the integer year columns 1980..2013 (the range end is exclusive).
year_columns = list(range(1980, 2014))
df['total'] = df[year_columns].sum(axis=1)

In [None]:
# Confirm the new 'total' column appears in the frame.
df.head()

Let's also check the null values in the columns

In [None]:
# Count missing values per column.
df.isna().sum()

Now let's get a quick summary

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Now let us set the 'Country' column as the index, so rows can be looked up by country name

In [None]:
# Move 'Country' out of the columns and use it as the row index, enabling
# label-based lookups such as df.loc['Haiti', ...].
# NOTE: in-place and not re-runnable; a second run raises KeyError because
# the 'Country' column no longer exists after the first.
df.set_index('Country', inplace=True)

In [None]:
# Check that 'Country' is now the row index.
df.head()

In [None]:
# Clear the index name so no 'Country' header is displayed above the row labels.
df.index.name = None

In [None]:
# The index header should no longer be displayed.
df.head()

In [None]:
# Japan's figures for 1980-1985; the year columns are still integer labels here.
is_japan = df.index == 'Japan'
early_years = list(range(1980, 1986))
df.loc[is_japan, early_years]

Let's convert all column labels to strings (specifically the year columns, which are integers so far), so that every label has the same type

In [None]:
# Cast every column label to str so the year columns (ints until now) and the
# text columns share one label type.
df.columns = [str(label) for label in df.columns]

# Verify the conversion with a single boolean rather than printing one line
# per column from a throw-away list comprehension (which also leaves a noisy
# [None, None, ...] as the cell output).
all(isinstance(label, str) for label in df.columns)

In [None]:
# Year labels '1980' ... '2013' as strings, used to select the year columns.
years = list(map(str, range(1980, 2014)))
years

In [None]:
# Rows whose continent is Africa and whose region is Southern Africa.
in_africa = df['Continent'] == 'Africa'
in_southern_africa = df['Region'] == 'Southern Africa'
df.loc[in_africa & in_southern_africa]

#### Visualizing Data using Matplotlib

In [None]:
# List the style sheets matplotlib knows about, then switch to 'ggplot'.
print(plt.style.available)
mpl.style.use('ggplot')

In [None]:
# Haiti's yearly values as a Series indexed by the string year labels in `years`.
haiti = df.loc['Haiti', years]

#### Line Plots

When you call pandas' `plot()` on a Series, pandas automatically uses the index for the x-axis\
and the values for the y-axis. For a DataFrame, each column is plotted against the index unless you specify `x` and `y` explicitly.

In [None]:
# Quick default plot; the index (year labels, still strings at this point) goes on the x-axis.
haiti.plot()

In [None]:
# Use integer years on the x-axis so the annotation below can be positioned
# numerically in data coordinates.
haiti.index = haiti.index.map(int)

ax = haiti.plot(kind='line')
ax.set_title('Immigration from Haiti')
ax.set_ylabel('Number of immigrants')
ax.set_xlabel('Years')

# Text anchored at x=2000, y=6000 (data coordinates).
ax.text(2000, 6000, '2010 Earthquake')
plt.show()

In [None]:
# Select both countries, then transpose so that rows are years and each
# country becomes its own column (one line per country when plotted).
hai_china = df.loc[['Haiti', 'China'], years].T
# Integer years give a numeric x-axis, which the text annotation relies on.
hai_china.index = hai_china.index.map(int)
hai_china.plot(kind='line')

# The title names both plotted countries (it previously mentioned only Haiti).
plt.title('Immigration from Haiti and China')
plt.ylabel('Number of immigrants')
plt.xlabel('Years')

plt.text(2000, 6000, '2010 Earthquake')
# Save before show(): the figure must be written while it is still current.
plt.savefig('immigration.png')
plt.show()

In [None]:
# Inspect the transposed frame: one row per year, one column per country.
hai_china

Showing a saved figure

In [None]:
# Uncomment to display the figure written by plt.savefig('immigration.png').
# (The module is spelled `IPython`, and `Image` is constructed, not called via `.display`.)
# from IPython.display import Image
#
# Image(filename='immigration.png')

#### Area Plot

In [None]:
# Order the countries by total in descending order (in place), then keep
# the five at the top.
df.sort_values(by='total', ascending=False, inplace=True)

df_top5 = df.head(5)
df_top5

In [None]:
# Keep only the year columns and transpose: rows become years, columns become
# the top-5 countries, which is the layout the plotting calls expect.
df_top5_2 = df_top5.loc[:, years].transpose()
df_top5_2.head()

Area plots are stacked by default. And to produce a stacked area plot, each\
column must be either all positive or all negative values (any NaN, i.e. not\
a number, values will default to 0). To produce an unstacked plot, set parameter stacked to value False.

In [None]:
# Check the index type before converting the year labels to int.
type(df_top5_2.index)

Changing dtype to int

In [None]:
# Convert the string year labels ('1980', ...) to ints so the plots get a numeric year axis.
df_top5_2.index = df_top5_2.index.map(int)

In [None]:
# Unstacked area plot: the country areas overlap instead of being summed.
ax = df_top5_2.plot.area(stacked=False, figsize=(20, 10))
ax.set_title('Immigration Trend of Top 5 Countries')
ax.set_ylabel('Number of Immigrants')
ax.set_xlabel('Years')

plt.show()

The unstacked plot has a default transparency (alpha value) at 0.5. We can modify this value by passing in the alpha parameter.

In [None]:
# Same unstacked area plot with a lighter fill, so overlapping regions stay readable.
ax = df_top5_2.plot.area(
    stacked=False,
    alpha=0.25,  # opacity in the range 0 - 1; unstacked plots default to 0.5
    figsize=(20, 10),
)
ax.set(
    title='Immigration Trend of Top 5 Countries',
    ylabel='Number of Immigrants',
    xlabel='Years',
)

plt.show()