<a href="https://colab.research.google.com/github/TahminaAnondi/migration_ML/blob/main/NZ_Migration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import the libraries**

In [95]:
import pandas as pd
from sklearn.model_selection import train_test_split
#from sklearn import svm # it is a supervised machine learning algorithm used for classification and regression tasks.
import seaborn as sns
import matplotlib.pyplot as plt
#from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor #It's based on the idea of constructing multiple decision trees during training and combining their predictions to improve accuracy and reduce overfitting.
import plotly.express as px

# Load the data

In [96]:
# Read the net-migration workbook into a DataFrame and preview its first rows.
migration = pd.read_excel(io='Net_Migration.xlsx')
migration.head()

Unnamed: 0,Country,Year,Value
0,India,1979,-65
1,India,1979,-4
2,India,1979,-39
3,India,1980,10
4,India,1980,-3


## **Data** preprocessing

In [97]:
# Restrict the analysis to the countries of interest.
countries_to_keep = ['USA', 'India']

# Boolean mask: True for rows whose 'Country' is one of the kept countries.
keep_mask = migration['Country'].isin(countries_to_keep)
migration_filtered_for_UIBBN = migration.loc[keep_mask]

# Categorical columns that get expanded into one indicator column per value.
columns_to_one_hot_encode = ['Country']

# One-hot encode: 'Country' is replaced by Country_<name> indicator columns.
migration_encoded = pd.get_dummies(
    data=migration_filtered_for_UIBBN,
    columns=columns_to_one_hot_encode,
)

print(migration_encoded)


     Year  Value  Country_India  Country_USA
0    1979    -65              1            0
1    1979     -4              1            0
2    1979    -39              1            0
3    1980     10              1            0
4    1980     -3              1            0
..    ...    ...            ...          ...
223  2015     12              0            1
224  2015   1142              0            1
225  2016   -214              0            1
226  2016      0              0            1
227  2016   1286              0            1

[228 rows x 4 columns]


# Save the filtered, encoded data to CSV

In [98]:
# Persist the encoded frame to disk; the row index is not written.
migration_encoded.to_csv(path_or_buf='filtered_migration_data.csv', index=False)

In [99]:
# Echo the encoded frame again (same frame that the preprocessing cell printed).
print(migration_encoded)

     Year  Value  Country_India  Country_USA
0    1979    -65              1            0
1    1979     -4              1            0
2    1979    -39              1            0
3    1980     10              1            0
4    1980     -3              1            0
..    ...    ...            ...          ...
223  2015     12              0            1
224  2015   1142              0            1
225  2016   -214              0            1
226  2016      0              0            1
227  2016   1286              0            1

[228 rows x 4 columns]


filtered_migration.info()

In [100]:
# Count missing entries per column before any imputation.
migration_encoded.isnull().sum()

Year             0
Value            0
Country_India    0
Country_USA      0
dtype: int64

In [101]:
# Impute missing 'Value' entries with the column median.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form is
# chained assignment, which recent pandas versions warn about and which does
# not update the parent frame once Copy-on-Write is enabled.
value_median = migration_encoded["Value"].median()
migration_encoded["Value"] = migration_encoded["Value"].fillna(value_median)

# Confirm that no missing values remain.
migration_encoded.isna().sum()

Year             0
Value            0
Country_India    0
Country_USA      0
dtype: int64

In [102]:
!pip install plotly



In [103]:
# Per-column count of missing values (summed down the rows).
migration_encoded.isna().sum(axis=0)

Year             0
Value            0
Country_India    0
Country_USA      0
dtype: int64

In [104]:
# Median-impute 'Value' by assigning the filled Series back to the column.
# Calling fillna(..., inplace=True) on migration_encoded["Value"] is chained
# in-place assignment: pandas warns about it, and with Copy-on-Write enabled
# it leaves migration_encoded unchanged.
value_median = migration_encoded["Value"].median()
migration_encoded["Value"] = migration_encoded["Value"].fillna(value_median)

# Per-column count of remaining missing values.
migration_encoded.isna().sum()

Year             0
Value            0
Country_India    0
Country_USA      0
dtype: int64

In [105]:
# Model inputs: every column except 'Value' is a feature; 'Value' is the target.
x_data = migration_encoded.drop(columns='Value')
y_data = migration_encoded.loc[:, 'Value']

# Preview the first rows of the features, then of the target.
print(x_data.head())
print(y_data.head())

   Year  Country_India  Country_USA
0  1979              1            0
1  1979              1            0
2  1979              1            0
3  1980              1            0
4  1980              1            0
0   -65
1    -4
2   -39
3    10
4    -3
Name: Value, dtype: int64


In [106]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
(
    x_training_data,
    x_test_data,
    y_training_data,
    y_test_data,
) = train_test_split(
    x_data,
    y_data,
    random_state=9,
    test_size=0.30,
)


# **Random Forest Regression**

In [107]:
# Random forest regressor predicting 'Value' from the feature columns.
# random_state pins the bootstrap sampling of the trees so that the score and
# MSE reported below are reproducible across runs (the forest was unseeded
# before, so every run produced different metrics).
# max_features=3 equals the number of feature columns in x_data, i.e. every
# split may consider all features.
model_rf = RandomForestRegressor(
    n_estimators=70,
    max_features=3,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)

# Show the training target as a quick sanity check of the split.
print(y_training_data)

# Train the model on the training data.
model_rf.fit(x_training_data, y_training_data)

# Make predictions on the test data.
predictions_rf = model_rf.predict(x_test_data)

77     2449
51       20
206    1248
11       25
149     210
       ... 
56     1443
182    -369
227    1286
92     5951
126      76
Name: Value, Length: 159, dtype: int64


In [108]:
# Score the fitted forest on the held-out split
# (for scikit-learn regressors, score() reports the R² coefficient).
model_rf.score(X=x_test_data, y=y_test_data)

0.06511931133033388

In [109]:
from sklearn.metrics import mean_squared_error

# Test-set mean squared error of the random forest predictions.
mse_rf = mean_squared_error(y_true=y_test_data, y_pred=predictions_rf)

print(f"Mean Squared Error of Random Forest Regression: {mse_rf}")


Mean Squared Error of Random Forest Regression: 2566674.1180263623


# **Gradient Boosting Regression**

In [110]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Build the gradient boosting regressor (seeded) and fit it on the training split.
gb_model = GradientBoostingRegressor(random_state=42, n_estimators=100)
gb_model.fit(x_training_data, y_training_data)

# Predict the held-out rows.
predictions_gb = gb_model.predict(x_test_data)

# Test-set error and R² for the boosted model.
mse_gb = mean_squared_error(y_true=y_test_data, y_pred=predictions_gb)
r2_gb = r2_score(y_true=y_test_data, y_pred=predictions_gb)

print(f"Mean Squared Error on Test Set (Gradient Boosting): {mse_gb}")
print(f"R² Score on Test Set (Gradient Boosting): {r2_gb}")


Mean Squared Error on Test Set (Gradient Boosting): 3028954.7758833836
R² Score on Test Set (Gradient Boosting): -0.10326095040244887


# Forecast for the next 10 years

In [113]:
# Forecast 'Value' for 2017-2026 with a random forest trained on the
# historical (pre-2017) rows, then export history + forecast to Excel.
#
# Every future row is filled with the fitted model's prediction. Filling the
# rows with draws from N(mean, std) of the historical target would ignore the
# model entirely and give a different "forecast" on every run.
#
# pd, np and RandomForestRegressor come from the notebook's import cell;
# migration_filtered_for_UIBBN and migration_encoded come from preprocessing.

# Future years to forecast.
future_years = range(2017, 2027)  # Adjust the range as needed
first_future_year = future_years.start
last_future_year = future_years.stop - 1

# One row per (future year, country) combination.
future_df = pd.DataFrame(
    [
        {'Year': year, 'Country': country}
        for year in future_years
        for country in migration_filtered_for_UIBBN['Country'].unique()
    ]
)

# Encode the future rows like the historical data and align them to the
# historical feature columns (same names, same order) so the model can
# consume them directly.
feature_columns = [column for column in migration_encoded.columns if column != 'Value']
future_df_encoded = (
    pd.get_dummies(future_df, columns=['Country'])
    .reindex(columns=feature_columns, fill_value=0)
)

# Stack history and the still-unlabelled future rows ('Value' is NaN for them).
extended_df = pd.concat([migration_encoded, future_df_encoded], ignore_index=True)

# Split into historical and future parts. .copy() gives independent frames so
# writing predictions below does not assign into a slice of extended_df.
historical_data = extended_df[extended_df['Year'] < first_future_year].copy()
future_data = extended_df[
    extended_df['Year'].between(first_future_year, last_future_year)
].copy()

# Features and target variable for historical data
historical_features = historical_data.drop(columns='Value')
historical_target = historical_data['Value']

# Features for future data
future_data_features = future_data.drop(columns='Value')

# Seeded so that re-running the cell reproduces the same forecast.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(historical_features, historical_target)

# Predict all future rows in one vectorised call and round to integers, to
# match the integer-valued historical 'Value' column.
# NOTE(review): tree ensembles do not extrapolate beyond the training range of
# 'Year', so expect the same per-country prediction for every future year;
# a trend-aware model would be needed for a genuine forecast.
future_data['Value'] = np.rint(rf_model.predict(future_data_features)).astype(int)

# Concatenate the historical and updated future data
extended_df = pd.concat([historical_data, future_data], ignore_index=True)

# Export the extended DataFrame with predictions to Excel
extended_excel_path = 'extended_migration_data.xlsx'
extended_df.to_excel(extended_excel_path, index=False)

print(f"Extended data (with predictions) exported to: {extended_excel_path}")


Extended data (with predictions) exported to: extended_migration_data.xlsx


# **Net Migration Over Historical and Future Years**

In [116]:
# Bar chart of yearly net migration per country, covering the historical rows
# and the appended future rows of extended_df.

# Indicator columns that identify each row's country.
selected_countries = ['Country_India', 'Country_USA']

# Rebuild a readable 'Country' label: take the indicator column holding the
# row maximum and strip its 'Country_' prefix.
extended_df['Country'] = (
    extended_df[selected_countries]
    .idxmax(axis=1)
    .str.replace('Country_', '')
)

# Total 'Value' for each (Year, Country) pair, as a flat frame.
grouped_data = extended_df.groupby(['Year', 'Country'], as_index=False)['Value'].sum()

# Interactive Plotly bar chart, one colour per country.
fig = px.bar(
    grouped_data,
    x='Year',
    y='Value',
    color='Country',
    title='Net Migration for Selected Countries Over Historical and Future Years',
    labels={'Year': 'Year', 'Value': 'Net Migration'},
)

fig.show()


# **Plotly Heatmap**

In [117]:
import plotly.graph_objects as go

# Heatmap of summed 'Value' with one row per country and one column per year.
# Relies on the 'Country' column that the bar-chart cell adds to extended_df.

# Total 'Value' for each (Year, Country) pair.
grouped_data = extended_df.groupby(['Year', 'Country'], as_index=False)['Value'].sum()

# Reshape to a Country x Year grid for the heatmap.
heatmap_data = grouped_data.pivot_table(
    values='Value',
    index='Country',
    columns='Year',
    aggfunc='sum',
)

# Build the heatmap trace first, then wrap it in a figure.
heatmap_trace = go.Heatmap(
    x=heatmap_data.columns,
    y=heatmap_data.index,
    z=heatmap_data.values,
    colorbar={'title': 'Net Migration'},
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap_trace)

fig.update_layout(
    title='Net Migration Heatmap Over Years',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Country'},
)

fig.show()


# **KMeans Clustering**

In [118]:
import plotly.express as px
from sklearn.cluster import KMeans

# Cluster the rows of extended_df on two numeric columns.
# Both are already numeric, so no dummy encoding is needed before clustering.
features = ['Year', 'Value']
X = extended_df[features]

# n_init is set explicitly: its default changed between scikit-learn releases
# (and older releases emit a FutureWarning when it is omitted), so pinning it
# keeps the cluster assignment stable across versions. random_state fixes the
# centroid initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
extended_df['Cluster'] = kmeans.fit_predict(X)

# 3D scatter of the clustered rows. NOTE(review): y and z both plot 'Value',
# so the points lie on a plane; a third feature would make the z axis
# informative.
fig = px.scatter_3d(
    extended_df,
    x='Year',
    y='Value',
    z='Value',  # Use 'Value' as the third dimension
    color='Cluster',
    hover_data=['Country'],
    title='3D Scatter Plot of K-Means Clustering',
    labels={'Value': 'Migration Value'},
)

# Show the figure
fig.show()




