In [21]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
# Use the "notebook" renderer for every Plotly figure shown below.
pio.renderers.default = "notebook"


# Load the CSV export; the first 8 lines are meta-information and are skipped
# so that the next line is used as the header row.
raw_csv_path = './1710002201-eng.csv'
data = pd.read_csv(raw_csv_path, skiprows=8)

# Preview the first rows of the raw table.
data.head()


Unnamed: 0,"Geography, province of origin","Geography, province of destination",Vector,Coordinate,2013 / 2014,2014 / 2015,2015 / 2016,2016 / 2017,2017 / 2018,2018 / 2019,2019 / 2020,2020 / 2021,2021 / 2022,2022 / 2023
0,"Newfoundland and Labrador, province of origin 5",,,,Persons,,,,,,,,,
1,,"Newfoundland and Labrador, province of destina...",..,1.1,..,..,..,..,..,..,..,..,..,..
2,,"Prince Edward Island, province of destination",v466446,1.2,84,72,108,130,146,150,100,82,147,187
3,,"Nova Scotia, province of destination",v466451,1.3,994,982,987,1100,1165,1131,1192,818,999,720
4,,"New Brunswick, province of destination",v466452,1.4,334,383,373,393,483,457,408,304,369,641


In [2]:
# Forward fill the "Geography, province of origin" column: only the first row
# of each origin group carries the province name, the rows below it are blank.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
data['Geography, province of origin'] = data['Geography, province of origin'].ffill()

# Filter out rows with '..' or 'Persons' in the "2013 / 2014" column
# (the unit-header row and the "not available" rows seen in the preview above).
data = data[(data['2013 / 2014'] != '..') & (data['2013 / 2014'] != 'Persons')]

# Melt the dataframe to transform it into a long format: one row per
# (origin, destination, year). Columns from position 4 onwards are the
# year columns; the first four are the two geography columns, Vector and
# Coordinate.
data_long = pd.melt(data,
                    id_vars=['Geography, province of origin', 'Geography, province of destination'],
                    value_vars=data.columns[4:],
                    var_name='Year',
                    value_name='Migrants')

# Clean the province names: keep only the text before the first comma,
# dropping the ", province of origin" / ", province of destination" suffix.
data_long['Geography, province of origin'] = data_long['Geography, province of origin'].str.split(',').str[0]
data_long['Geography, province of destination'] = data_long['Geography, province of destination'].str.split(',').str[0]

# Replace NaN values with the string '0' in the 'Migrants' column so the
# column is all-string for the `.str` accessor used below.
data_long['Migrants'] = data_long['Migrants'].fillna('0')

# Convert the 'Migrants' column to integer, stripping thousands separators.
# regex=False makes the literal-comma replacement explicit instead of relying
# on the pandas-version-dependent default.
# NOTE(review): a '..' value in any year other than "2013 / 2014" would make
# astype(int) raise; only the first year column is screened above.
data_long['Migrants'] = data_long['Migrants'].str.replace(',', '', regex=False).astype(int)

data_long.head()



Unnamed: 0,"Geography, province of origin","Geography, province of destination",Year,Migrants
0,Newfoundland and Labrador,Prince Edward Island,2013 / 2014,84
1,Newfoundland and Labrador,Nova Scotia,2013 / 2014,994
2,Newfoundland and Labrador,New Brunswick,2013 / 2014,334
3,Newfoundland and Labrador,Quebec,2013 / 2014,173
4,Newfoundland and Labrador,Ontario,2013 / 2014,1683


In [3]:
# Print the column dtypes and non-null counts of the long-format table
# (used here to spot columns that still contain missing values).
data_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670 entries, 0 to 1669
Data columns (total 4 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Geography, province of origin       1670 non-null   object
 1   Geography, province of destination  1630 non-null   object
 2   Year                                1670 non-null   object
 3   Migrants                            1670 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 52.3+ KB


In [11]:
# Boolean mask marking the rows whose destination province is missing.
mask = data_long['Geography, province of destination'].isna()

# Pull those rows out so they can be inspected before being dropped.
null_rows = data_long.loc[mask]

print(null_rows)

                          Geography, province of origin  \
156                                      Symbol legend:   
158                                          Footnotes:   
165   How to cite: Statistics Canada. Table 17-10-00...   
166   https://www150.statcan.gc.ca/t1/tbl1/en/tv.act...   
323                                      Symbol legend:   
325                                          Footnotes:   
332   How to cite: Statistics Canada. Table 17-10-00...   
333   https://www150.statcan.gc.ca/t1/tbl1/en/tv.act...   
490                                      Symbol legend:   
492                                          Footnotes:   
499   How to cite: Statistics Canada. Table 17-10-00...   
500   https://www150.statcan.gc.ca/t1/tbl1/en/tv.act...   
657                                      Symbol legend:   
659                                          Footnotes:   
666   How to cite: Statistics Canada. Table 17-10-00...   
667   https://www150.statcan.gc.ca/t1/tbl1/en/tv.act... 

In [32]:
# Filter out rows with missing values in the "Geography, province of destination" column
data_long_filtered = data_long.dropna(subset=['Geography, province of destination'])

# Check the info of the filtered dataframe to ensure no missing values remain.
# `info()` writes its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_long_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1630 entries, 0 to 1667
Data columns (total 4 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Geography, province of origin       1630 non-null   object
 1   Geography, province of destination  1630 non-null   object
 2   Year                                1630 non-null   object
 3   Migrants                            1630 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 63.7+ KB
None


In [52]:
# Origin values that are not province names and must be discarded.
# NOTE(review): '1'-'6' look like footnote numbers and '..' like the
# "not available" symbol from the table footer - confirm against the CSV.
# (A previous `data_long_filtered.iloc[:-7]` assignment was removed: its result
# was overwritten by the filter below without ever being used.)
values_to_filter_out = ['..', '1', '2', '3', '4', '5', '6']
data_long_clean = data_long_filtered[~data_long_filtered['Geography, province of origin'].isin(values_to_filter_out)]

# Get unique provinces from the "Geography, province of origin" column
valid_provinces = data_long_clean['Geography, province of origin'].unique()

# Filter the dataframe to retain only rows where "Geography, province of destination" has valid provinces
data_long_clean = data_long_clean[data_long_clean['Geography, province of destination'].isin(valid_provinces)]


data_long_clean.head(15)

Unnamed: 0,"Geography, province of origin","Geography, province of destination",Year,Migrants
0,Newfoundland and Labrador,Prince Edward Island,2013 / 2014,84
1,Newfoundland and Labrador,Nova Scotia,2013 / 2014,994
2,Newfoundland and Labrador,New Brunswick,2013 / 2014,334
3,Newfoundland and Labrador,Quebec,2013 / 2014,173
4,Newfoundland and Labrador,Ontario,2013 / 2014,1683
5,Newfoundland and Labrador,Manitoba,2013 / 2014,119
6,Newfoundland and Labrador,Saskatchewan,2013 / 2014,122
7,Newfoundland and Labrador,Alberta,2013 / 2014,2648
8,Newfoundland and Labrador,British Columbia,2013 / 2014,439
9,Newfoundland and Labrador,Yukon,2013 / 2014,16


In [51]:
import plotly.graph_objects as go

# Keep only the "2013 / 2014" rows of the cleaned long table
year_data_2013_2014 = data_long_clean.loc[data_long_clean['Year'] == '2013 / 2014']

# Total number of migrants per province of origin, as a two-column frame
grouped_data = (
    year_data_2013_2014
    .groupby('Geography, province of origin')['Migrants']
    .sum()
    .reset_index()
)

# One blue bar per province of origin
bar_chart = go.Figure(
    go.Bar(
        x=grouped_data['Geography, province of origin'],
        y=grouped_data['Migrants'],
        marker_color='blue',
    )
)

# Title and axis labels
bar_chart.update_layout(
    title_text="Migrants by Province of Origin (2013/2014)",
    xaxis_title="Province of Origin",
    yaxis_title="Number of Migrants",
)

# Render the chart
bar_chart.show()



In [53]:
# Total number of migrants per province of destination for 2013 / 2014
grouped_destination = (
    year_data_2013_2014
    .groupby('Geography, province of destination')['Migrants']
    .sum()
    .reset_index()
)

# One red bar per province of destination
bar_chart_2 = go.Figure(
    go.Bar(
        x=grouped_destination['Geography, province of destination'],
        y=grouped_destination['Migrants'],
        marker_color='red',
    )
)

# Title and axis labels
bar_chart_2.update_layout(
    title_text="Migrants by Province of Destination (2013/2014)",
    xaxis_title="Province of Destination",
    yaxis_title="Number of Migrants",
)

bar_chart_2.show()


### TODO: make the cell below run only once. If it has to be rerun, first check the DataFrame it uses to make sure it is the right one.

In [58]:
# Write the cleaned long-format table to 'data_long_clean.csv' in the working
# directory, without the index column. Each run writes the file again, so make
# sure `data_long_clean` is the intended frame before executing this cell.
data_long_clean.to_csv('data_long_clean.csv', index=False)

In [77]:

def create_sankey(year, max_links=100):
    """Build a Sankey figure of interprovincial migration for one year.

    Reads the notebook-level `data_long_clean` frame. Every province is a
    single node that can act as both source and target of links.

    Parameters
    ----------
    year : str
        Value of the 'Year' column to plot, e.g. '2013 / 2014'.
    max_links : int or None, default 100
        Maximum number of links drawn, taken from the start of the year's rows
        in frame order. The default keeps the cap of 100 that used to be
        hard-coded; pass None to draw every link.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure holding one Sankey trace, titled with the year.
    """
    origin_col = 'Geography, province of origin'
    destination_col = 'Geography, province of destination'

    # Filter data for the selected year
    year_data = data_long_clean[data_long_clean['Year'] == year]

    # Node labels: origins in order of first appearance, followed by any
    # province that only occurs as a destination, so that every target can be
    # mapped to a node index instead of becoming NaN.
    labels = list(
        pd.concat([year_data[origin_col], year_data[destination_col]])
        .dropna()
        .unique()
    )

    # Map each province to its node index
    label_map = {province: i for i, province in enumerate(labels)}

    # Define source, target, and value lists for Sankey diagram
    source = year_data[origin_col].map(label_map).tolist()
    target = year_data[destination_col].map(label_map).tolist()
    value = year_data['Migrants'].tolist()

    # Slicing with None keeps everything, so one expression covers both the
    # capped and the uncapped case.
    shown = slice(0, max_links)

    # Create Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels
        ),
        link=dict(
            source=source[shown],
            target=target[shown],
            value=value[shown]
        )
    )])

    fig.update_layout(title_text=f"Interprovincial Migration in Canada ({year})", font_size=10)
    return fig

# Display Sankey diagram for the year '2013 / 2014'
create_sankey('2013 / 2014').show()



In [83]:
def create_sankey(year):
    """Build a two-sided Sankey figure of interprovincial migration for one year.

    Reads the notebook-level `data_long_clean` frame. Provinces of origin form
    one set of nodes and provinces of destination a second set, each ordered by
    its total number of migrants (largest first). This definition replaces the
    `create_sankey` defined in an earlier cell of the notebook.

    Parameters
    ----------
    year : str
        Value of the 'Year' column to plot, e.g. '2013 / 2014'.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure holding one Sankey trace, titled with the year.
    """
    origin_col = 'Geography, province of origin'
    destination_col = 'Geography, province of destination'

    # Rows belonging to the requested year
    flows = data_long_clean[data_long_clean['Year'] == year]

    def ranked_provinces(column):
        # Provinces found in `column`, ordered by summed migrants, descending.
        totals = flows.groupby(column)['Migrants'].sum().reset_index()
        return totals.sort_values('Migrants', ascending=False)[column].tolist()

    origin_nodes = ranked_provinces(origin_col)
    destination_nodes = ranked_provinces(destination_col)

    # Origin nodes take indices 0..n-1; destination nodes follow directly after,
    # matching their positions in the concatenated label list.
    offset = len(origin_nodes)
    origin_index = dict(zip(origin_nodes, range(offset)))
    destination_index = dict(
        zip(destination_nodes, range(offset, offset + len(destination_nodes)))
    )

    node_spec = dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=origin_nodes + destination_nodes,
    )
    link_spec = dict(
        source=flows[origin_col].map(origin_index).tolist(),
        target=flows[destination_col].map(destination_index).tolist(),
        value=flows['Migrants'].tolist(),
    )

    fig = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
    fig.update_layout(title_text=f"Interprovincial Migration in Canada ({year})", font_size=10)
    return fig


In [85]:
# Years to animate, in order of first appearance in the cleaned data. The same
# list drives both the frames and the slider steps, so every step refers to a
# frame that actually exists (previously the steps were taken from `data_long`
# while the frames were taken from `data_long_clean`).
years = list(data_long_clean['Year'].unique())

# Create an empty figure
fig = go.Figure()

# Generate Sankey data for each year and add as frames, named after the year
frames = [go.Frame(data=create_sankey(year).data, name=year) for year in years]
fig.frames = frames

# Define steps for the slider: each step animates to the frame of the same name
steps = [
    {
        'args': [[year], {'frame': {'duration': 800, 'redraw': True},
                         'mode': 'immediate',
                         'transition': {'duration': 800}}],
        'label': year,
        'method': 'animate',
        'value': year
    }
    for year in years
]

# Add slider and play/pause buttons to the figure
sliders = [{
    'active': 0,
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Year:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 900, 'easing': 'linear'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': steps
}]

updatemenus = [{
    'buttons': [{
        # Play: animate through the frames starting from the current one
        'args': [None, {'frame': {'duration': 500, 'redraw': True},
                        'fromcurrent': True, 'transition': {'duration': 300}}],
        'label': 'Play',
        'method': 'animate'
    }, {
        # Pause: an immediate, zero-duration animate call targeting [None]
        'args': [[None], {'frame': {'duration': 0, 'redraw': True}, 'mode': 'immediate',
                          'transition': {'duration': 0}}],
        'label': 'Pause',
        'method': 'animate'
    }],
    'direction': 'left',
    'pad': {'r': 10, 't': 87},
    'showactive': False,
    'type': 'buttons',
    'x': 0.1,
    'xanchor': 'right',
    'y': 0,
    'yanchor': 'top'
}]

# Update the layout to include the slider and updatemenu
fig.update_layout(sliders=sliders, updatemenus=updatemenus)

# Seed the figure with the traces of the first frame so something is drawn
# before the animation starts; skipped when there are no years to show.
if frames:
    fig.add_traces(frames[0].data)

# Display the figure
fig.show()

