<a href="https://colab.research.google.com/github/TahminaAnondi/migration_ML/blob/main/NZ_Migration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Import the libraries.**

In [187]:
import pandas as pd
from sklearn.model_selection import train_test_split
#from sklearn import svm # it is a supervised machine learning algorithm used for classification and regression tasks.
import seaborn as sns
import matplotlib.pyplot as plt
#from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor #It's based on the idea of constructing multiple decision trees during training and combining their predictions to improve accuracy and reduce overfitting.
import plotly.express as px

#Load the Data

In [188]:
migration = pd.read_csv('migration_nz.csv')
migration.head()

Unnamed: 0,Measure,Country,Citizenship,Year,Value
0,Arrivals,Oceania,New Zealand Citizen,1979,11817.0
1,Arrivals,Oceania,Australian Citizen,1979,4436.0
2,Arrivals,Oceania,Total All Citizenships,1979,19965.0
3,Arrivals,Antarctica,New Zealand Citizen,1979,10.0
4,Arrivals,Antarctica,Australian Citizen,1979,0.0


## **Data** preprocessing

In [189]:
#continents_to_exclude = ['Antarctica', 'Asia', 'Europe', 'North America', 'Antarctica','South America','Oceania','All countries','Not stated']
#migration_filtered = migration[~migration['Country'].isin(continents_to_exclude)]
# List of countries to keep
countries_to_keep = ['USA', 'India','Brazil','Bangladesh','Nigeria']

# Filter out rows where the 'Country' column is not in the list of countries to keep
migration_filtered_for_UIBBN = migration[migration['Country'].isin(countries_to_keep)]

#List of measure to keep
measure_to_keep = ['Net']

# Filter out rows where the 'Country' column is not in the list of countries to keep
migration_filtered_for_UIBBN = migration_filtered_for_UIBBN[migration_filtered_for_UIBBN['Measure'].isin(measure_to_keep)]

print(migration_filtered_for_UIBBN)

      Measure     Country             Citizenship  Year   Value
1620      Net  Bangladesh     New Zealand Citizen  1979    -2.0
1621      Net  Bangladesh      Australian Citizen  1979     0.0
1622      Net  Bangladesh  Total All Citizenships  1979    -6.0
1641      Net       India     New Zealand Citizen  1979   -65.0
1642      Net       India      Australian Citizen  1979    -4.0
...       ...         ...                     ...   ...     ...
86275     Net         USA      Australian Citizen  2016     0.0
86276     Net         USA  Total All Citizenships  2016  1286.0
86433     Net     Nigeria     New Zealand Citizen  2016     5.0
86434     Net     Nigeria      Australian Citizen  2016     0.0
86435     Net     Nigeria  Total All Citizenships  2016    64.0

[570 rows x 5 columns]


# Replacing 'Total All Citizenships' to 'other citizenship'

In [190]:

migration_filtered_for_UIBBN.loc[migration['Citizenship'] == 'Total All Citizenships', 'Citizenship'] = 'Other citizenship'


In [191]:
migration_filtered_for_UIBBN.to_csv('filtered_migration_data.csv', index=False)

In [192]:
print(migration_filtered_for_UIBBN)

      Measure     Country          Citizenship  Year   Value
1620      Net  Bangladesh  New Zealand Citizen  1979    -2.0
1621      Net  Bangladesh   Australian Citizen  1979     0.0
1622      Net  Bangladesh    Other citizenship  1979    -6.0
1641      Net       India  New Zealand Citizen  1979   -65.0
1642      Net       India   Australian Citizen  1979    -4.0
...       ...         ...                  ...   ...     ...
86275     Net         USA   Australian Citizen  2016     0.0
86276     Net         USA    Other citizenship  2016  1286.0
86433     Net     Nigeria  New Zealand Citizen  2016     5.0
86434     Net     Nigeria   Australian Citizen  2016     0.0
86435     Net     Nigeria    Other citizenship  2016    64.0

[570 rows x 5 columns]


filtered_migration.info()

In [193]:
migration_filtered_for_UIBBN.isna().sum()

Measure        0
Country        0
Citizenship    0
Year           0
Value          0
dtype: int64

In [194]:
migration_filtered_for_UIBBN["Value"].fillna(migration_filtered_for_UIBBN["Value"].median(),inplace=True)
migration_filtered_for_UIBBN.isna().sum()

Measure        0
Country        0
Citizenship    0
Year           0
Value          0
dtype: int64

In [195]:
migration_filtered_for_UIBBN['Measure'].unique()

array(['Net'], dtype=object)

In [196]:
!pip install plotly



In [197]:
migration_filtered_for_UIBBN.isna().sum()

Measure        0
Country        0
Citizenship    0
Year           0
Value          0
dtype: int64

In [198]:
migration_filtered_for_UIBBN["Value"].fillna(migration_filtered_for_UIBBN["Value"].median(),inplace=True)
migration_filtered_for_UIBBN.isna().sum()

Measure        0
Country        0
Citizenship    0
Year           0
Value          0
dtype: int64

In [199]:
migration_filtered_for_UIBBN['Citizenship'].unique()

array(['New Zealand Citizen', 'Australian Citizen', 'Other citizenship'],
      dtype=object)

In [200]:
# Factorize the 'Country' and 'Citizenship' columns for numerical representation
migration_filtered_for_UIBBN['Country'] = pd.factorize(migration_filtered_for_UIBBN.Country)[0]
migration_filtered_for_UIBBN['Measure'] = pd.factorize(migration_filtered_for_UIBBN.Measure)[0]
migration_filtered_for_UIBBN['Citizenship'] = pd.factorize(migration_filtered_for_UIBBN.Citizenship)[0]



In [201]:
x_data = migration_filtered_for_UIBBN[['Country','Citizenship','Measure','Year']]
y_data = migration_filtered_for_UIBBN['Value']

In [202]:
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x_data, y_data, test_size = 0.30, random_state=9)


#**Random Forest Regression**

In [203]:
model_rf = RandomForestRegressor(n_estimators=70,max_features = 3,max_depth=5,n_jobs=-1)
# Train the model on the training data.
print(y_training_data)
model_rf.fit(x_training_data, y_training_data)
# Make predictions on the test data.
predictions_rf = model_rf.predict(x_test_data)

43171      0.0
49722     -8.0
35796    -14.0
6175       0.0
79604     28.0
         ...  
19859    139.0
8858     350.0
77046    -14.0
54012      7.0
58831     -1.0
Name: Value, Length: 399, dtype: float64


In [204]:
model_rf.score(x_test_data, y_test_data)

0.5698927950658593

In [205]:
from sklearn.metrics import mean_squared_error

# Calculate Mean Squared Error
mse_rf = mean_squared_error(y_test_data, predictions_rf)

print(f"Mean Squared Error of Random Forest Regression: {mse_rf}")


Mean Squared Error of Random Forest Regression: 191781.1950219158


#**Logistic Regression**


In [206]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Initialize and train the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(x_training_data, y_training_data)

# Make predictions on the test set
y_pred = logreg_model.predict(x_test_data)

# Evaluate the model
accuracy = accuracy_score(y_test_data, y_pred)
conf_matrix = confusion_matrix(y_test_data, y_pred)
classification_rep = classification_report(y_test_data, y_pred)

print(f"Accuracy: {accuracy:.2f}")
#print("Confusion Matrix:")
#print(conf_matrix)
#print("Classification Report:")
#print(classification_rep)

Accuracy: 0.15



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [207]:
logreg_model.score(x_test_data, y_test_data)

0.14619883040935672

In [208]:
# Calculate Mean Squared Error
mse_lr = mean_squared_error(y_test_data, y_pred)

print(f"Mean Squared Error: {mse_lr}")

Mean Squared Error: 462554.29824561405


#**Multiple Linear Regression**

In [209]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Create a linear regression model
model_mlr = LinearRegression()

# Train the model on the training data
model_mlr.fit(x_training_data, y_training_data)

# Make predictions on the test data
predictions_mlr = model_mlr.predict(x_test_data)

# Evaluate the model
score_mlr = r2_score(y_test_data, predictions_mlr)

print(f"Multiple Linear Regression Score: {score_mlr}")

Multiple Linear Regression Score: 0.013821407797085516


In [210]:
# Calculate Mean Squared Error
mse_mlr = mean_squared_error(y_test_data, predictions_mlr)

print(f"Mean Squared Error: {mse_mlr}")

Mean Squared Error: 439728.7623830104


**Grid Search**

In [211]:
from sklearn.model_selection import GridSearchCV



# Create a linear regression model
model_lr = LinearRegression()

# Define the parameter grid to search
param_grid = {
    'fit_intercept': [True, False]

}

# Create the GridSearchCV object
grid_search = GridSearchCV(model_lr, param_grid, scoring='neg_mean_squared_error', cv=5)

# Fit the model to the data
grid_search.fit(x_training_data, y_training_data)

# Get the best parameters
best_params = grid_search.best_params_

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
predictions_gridSearchCV = best_model.predict(x_test_data)

# Evaluate the model
mse = mean_squared_error(y_test_data, predictions_gridSearchCV)
print(f"Mean Squared Error on Test Set: {mse}")
print("Best Hyperparameters:", best_params)


Mean Squared Error on Test Set: 439728.7623830104
Best Hyperparameters: {'fit_intercept': True}


In [212]:
r2 = r2_score(y_test_data, predictions_gridSearchCV)
print(f"Grid Search Score: {r2}")

Grid Search Score: 0.013821407797085516


#**SVM Regression Model**

In [213]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Create an SVM regression model
model_svm = SVR(kernel='linear')  # You can choose different kernel functions

# Train the model on the training data
model_svm.fit(x_training_data, y_training_data)

# Make predictions on the test data
predictions_svm = model_svm.predict(x_test_data)
# Calculate R² score
r2_svm_score = r2_score(y_test_data, predictions_svm)

# Evaluate the model
mse_svm = mean_squared_error(y_test_data, predictions_svm)
print(f"Mean Squared Error on Test Set (SVM): {mse_svm}")
print(f"Score of (SVM): {r2_svm_score}")


Mean Squared Error on Test Set (SVM): 457159.9820940783
Score of (SVM): -0.025271544917405464


# **Gradient Boosting Regression**

In [214]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(x_training_data, y_training_data)

# Make predictions on the test set
predictions_gb = gb_model.predict(x_test_data)

# Evaluate the model
mse_gb = mean_squared_error(y_test_data, predictions_gb)
r2_gb = r2_score(y_test_data, predictions_gb)

print(f"Mean Squared Error on Test Set (Gradient Boosting): {mse_gb}")
print(f"R² Score on Test Set (Gradient Boosting): {r2_gb}")


Mean Squared Error on Test Set (Gradient Boosting): 165352.88937454225
R² Score on Test Set (Gradient Boosting): 0.6291634898377739


#KNN

In [215]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Create a KNN regression model
model_knn = KNeighborsRegressor(n_neighbors=5)  # You can choose the number of neighbors

# Train the model on the training data
model_knn.fit(x_training_data, y_training_data)

# Make predictions on the test data
predictions_knn = model_knn.predict(x_test_data)

# Evaluate the model
mse_knn = mean_squared_error(y_test_data, predictions_knn)
r2_knn = r2_score(y_test_data, predictions_knn)

print(f"Mean Squared Error on Test Set (KNN): {mse_knn}")
print(f"R² Score on Test Set (KNN): {r2_knn}")


Mean Squared Error on Test Set (KNN): 565889.6376608186
R² Score on Test Set (KNN): -0.26911927067549346


#Feature for next 10 years

In [216]:
# Assuming migration_filtered_for_UIBBN is your original DataFrame

# Extract the unique values for 'Country', 'Citizenship', 'Measure'
unique_countries = migration_filtered_for_UIBBN['Country'].unique()
unique_citizenships = migration_filtered_for_UIBBN['Citizenship'].unique()
unique_measures = migration_filtered_for_UIBBN['Measure'].unique()

# Generate future years (adjust the range as needed)
future_years = range(2017, 2027)

# Create a list to store future data
future_data = []

# Loop through combinations of future years, countries, citizenships, and measures
for year in future_years:
    for country in unique_countries:
        for citizenship in unique_citizenships:
            for measure in unique_measures:
                future_data.append({
                    'Year': year,
                    'Country': country,
                    'Citizenship': citizenship,
                    'Measure': measure,
                })

# Create a DataFrame from the future data
future_df = pd.DataFrame(future_data)

# Concatenate the original and future DataFrames
extended_df = pd.concat([migration_filtered_for_UIBBN, future_df], ignore_index=True)
print(extended_df)

# Now, 'extended_df' contains the original data and rows for future years
# You can use this extended DataFrame for training and predicting migration for the next 10 years

# Separate the features (X) and target variable (y)
X_data = extended_df[['Country', 'Citizenship', 'Measure', 'Year']]
y_data = extended_df['Value']
# Simulate historical data
historical_data = migration_filtered_for_UIBBN.copy()

# Simulate 'Value' for historical years
historical_data['Value'] = np.random.randint(1000, 5000, size=len(historical_data))

# Split historical data into training and testing sets
train_historical, test_historical = train_test_split(historical_data, test_size=0.2, random_state=42)

# Train a Random Forest model on historical data
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(train_historical[['Country', 'Citizenship', 'Measure', 'Year']], train_historical['Value'])

# Predict 'Value' for historical test set
predictions_historical_rf = rf_model.predict(test_historical[['Country', 'Citizenship', 'Measure', 'Year']])
predictions_historical_rf = np.round(predictions_historical_rf).astype(int)  # Round to the nearest integer

# Evaluate the model on historical data
mse_historical_rf = mean_squared_error(test_historical['Value'], predictions_historical_rf)
print(f"Mean Squared Error on Historical Data (Random Forest): {mse_historical_rf}")

# Simulate 'Value' for future years
# Assuming X_future contains features for future years
X_future = extended_df.loc[extended_df['Year'] >= 2017, ['Country', 'Citizenship', 'Measure', 'Year']]
future_predictions_rf = rf_model.predict(X_future)
future_predictions_rf = np.round(future_predictions_rf).astype(int)  # Round to the nearest integer

# Update 'Value' in extended_df for future years
extended_df.loc[extended_df['Year'] >= 2017, 'Value'] = future_predictions_rf

# Now 'extended_df' contains simulated or predicted 'Value' as integers for both historical and future years using Random Forest
print(extended_df)
# Define mappings for 'Measure', 'Country', and 'Citizenship'
measure_mapping = {0: 'Net'}
country_mapping = {0: 'USA', 1: 'India', 2: 'Brazil', 3: 'Bangladesh', 4: 'Nigeria'}
citizenship_mapping = {0: 'New Zealand Citizen', 1: 'Australian Citizen', 2: 'Other Citizenship'}

# Map numerical values to categorical values in the DataFrame
extended_df['Measure'] = extended_df['Measure'].map(measure_mapping)
extended_df['Country'] = extended_df['Country'].map(country_mapping)
extended_df['Citizenship'] = extended_df['Citizenship'].map(citizenship_mapping)

# Export the DataFrame to a new Excel file
output_excel_path = 'predicted_migration_results.xlsx'
extended_df.to_excel(output_excel_path, index=False)

print(f"Results exported to: {output_excel_path}")


     Measure  Country  Citizenship  Year  Value
0          0        0            0  1979   -2.0
1          0        0            1  1979    0.0
2          0        0            2  1979   -6.0
3          0        1            0  1979  -65.0
4          0        1            1  1979   -4.0
..       ...      ...          ...   ...    ...
715        0        3            1  2026    NaN
716        0        3            2  2026    NaN
717        0        4            0  2026    NaN
718        0        4            1  2026    NaN
719        0        4            2  2026    NaN

[720 rows x 5 columns]
Mean Squared Error on Historical Data (Random Forest): 1436450.3771929825
     Measure  Country  Citizenship  Year   Value
0          0        0            0  1979    -2.0
1          0        0            1  1979     0.0
2          0        0            2  1979    -6.0
3          0        1            0  1979   -65.0
4          0        1            1  1979    -4.0
..       ...      ...          .

#**Net Migration Over Historical and Future Years**

In [217]:
import plotly.express as px

# Assuming 'extended_df' is your DataFrame with historical and future predictions
# Group data by country and sum the values for each year
grouped_data = extended_df.groupby(['Country', 'Measure', 'Year'])['Value'].sum().reset_index()

# Calculate net migration by subtracting departures from arrivals
grouped_data['Net Migration'] = grouped_data.groupby(['Country'])['Value'].transform('diff').fillna(grouped_data['Value'])

# Create an interactive bar chart using Plotly for net migration
fig = px.bar(grouped_data, x='Year', y='Net Migration', color='Country', facet_col='Measure',
             labels={'Net Migration': 'Net Migration (Arrivals - Departures)', 'Year': 'Year'},
             title='Net Migration Over Historical and Future Years')

# Show the figure
fig.show()


#**SNS Heatmap**

In [218]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Assuming 'heatmap_data' is your DataFrame with historical and predicted migration patterns
heatmap_data = extended_df.groupby(['Country', 'Year', 'Measure', 'Citizenship'])['Value'].sum().reset_index()

# Filter historical and predicted data
historical_data = heatmap_data[heatmap_data['Year'] < 2017]
predicted_data = heatmap_data[heatmap_data['Year'] >= 2017]

# Separate data for arrivals and departures
historical_Net = historical_data[historical_data['Measure'] == 'Net']

predicted_Net = predicted_data[predicted_data['Measure'] == 'Net']


# Calculate net migration for historical and predicted data
historical_arrivals_departures_diff = historical_Net.groupby(['Country', 'Citizenship'])['Value'].sum().reset_index()

predicted_arrivals_departures_diff = predicted_Net.groupby(['Country', 'Citizenship'])['Value'].sum().reset_index()

# Function to create heatmap trace
def create_heatmap_trace(data, name):
    heatmap_trace = go.Heatmap(
        x=data['Citizenship'],
        y=data['Country'],
        z=data['Value'],
        colorscale='YlGnBu',
        colorbar=dict(title='Net Migration (Arrivals - Departures)'),
        hovertemplate='Country: %{y}<br>Citizenship: %{x}<br>Net Migration: %{z}',
        name=name,
    )
    return heatmap_trace

# Create a subplot with two heatmaps for historical data
fig = make_subplots(rows=1, cols=2, subplot_titles=['Historical Net Migration'])

fig.add_trace(create_heatmap_trace(historical_arrivals_departures_diff, 'Historical Net Migration'), row=1, col=1)

fig.update_layout(
    title_text='Historical Net Migration Heatmap',
    height=600,
    width=1200,
)

# Show the figure
fig.show()

# Repeat the process for predicted data
fig = make_subplots(rows=1, cols=2, subplot_titles=['Predicted Net Migration'])

fig.add_trace(create_heatmap_trace(predicted_arrivals_departures_diff, 'Predicted Net Migration'), row=1, col=1)

fig.update_layout(
    title_text='Predicted Net Migration Heatmap',
    height=600,
    width=1200,
)

# Show the figure
fig.show()


#**KMeans Clustering**

In [219]:
import plotly.express as px
from sklearn.cluster import KMeans

# Assuming 'migration_data' is your DataFrame with relevant features
features = ['Citizenship', 'Measure', 'Year', 'Value']
X = extended_df[features]

# Convert categorical variables to numerical representations
X = pd.get_dummies(X)

# Apply K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
extended_df['Cluster'] = kmeans.fit_predict(X)

# Create a 3D scatter plot with Plotly Express
fig = px.scatter_3d(
    extended_df,
    x='Year',
    y='Value',
    z='Citizenship',  # You can choose another feature for the Z-axis
    color='Cluster',
    hover_data=['Country'],
    title='3D Scatter Plot of K-Means Clustering',
    labels={'Value': 'Migration Value'},
)

# Show the figure
fig.show()




