# Import Library

In [1]:
import csv, os

# ==============================================================================
# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd

# ==============================================================================
# Plots
# ==============================================================================
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# ==============================================================================
# Modeling and Forecasting
# ==============================================================================
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# ==============================================================================
# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

# Import Data

In [2]:
df = pd.read_csv('UN_data_country_specific_till_20250.csv')
df.head()

Unnamed: 0,LocTypeID,LocTypeName,Location,Time,TPopulation1July,TPopulationMale1July,TPopulationFemale1July,PopDensity,PopSexRatio,MedianAgePop,...,LE65Male,LE65Female,LE80,LE80Male,LE80Female,InfantDeaths,IMR,LBsurvivingAge1,Under5Deaths,NetMigrations
0,4,Country/Area,Afghanistan,1960,8622.466,4476.521,4145.945,13.2797,107.9735,17.9998,...,8.8874,9.5773,4.0539,3.9313,4.1994,103.79,240.5329,364.106,151.545,2.606
1,4,Country/Area,Afghanistan,1961,8790.14,4556.369,4233.771,13.538,107.6196,17.9244,...,8.9651,9.6573,4.09,3.9663,4.235,104.08,236.1826,373.167,152.016,6.109
2,4,Country/Area,Afghanistan,1962,8969.046,4642.166,4326.881,13.8135,107.2866,17.8525,...,9.0281,9.7308,4.1211,3.9945,4.2681,104.617,232.1626,382.932,152.887,7.016
3,4,Country/Area,Afghanistan,1963,9157.464,4732.954,4424.51,14.1037,106.9712,17.7876,...,9.0874,9.8036,4.1511,4.0207,4.301,105.263,228.239,393.236,153.919,6.681
4,4,Country/Area,Afghanistan,1964,9355.514,4828.822,4526.692,14.4087,106.6744,17.7305,...,9.1502,9.8749,4.1812,4.0481,4.3326,105.942,224.3317,403.99,154.971,7.079


# Data Preprocessing
- Drop useless columns like LocTypeID, LocTypeName, and Location
- Fill missing data with the median

In [38]:
df = pd.read_csv('UN_data_country_specific_till_20250.csv')
df = df.drop(['LocTypeID','LocTypeName', 'Location'], axis = 1)

to_save_df = df.copy()

# Fill mean of NA with group Location(Country)
# df = df.groupby('Location').transform(lambda x: x.fillna(x.mean()))
# df = df.drop(['Location' ], axis = 1)

# Fill NA with global median
for col in df.columns:
  df[col] = df[col].fillna(df[col].median())

df.head()

Unnamed: 0,Time,TPopulation1July,TPopulationMale1July,TPopulationFemale1July,PopDensity,PopSexRatio,MedianAgePop,NatChange,NatChangeRT,PopChange,...,LE65Male,LE65Female,LE80,LE80Male,LE80Female,InfantDeaths,IMR,LBsurvivingAge1,Under5Deaths,NetMigrations
0,1960,8622.466,4476.521,4145.945,13.2797,107.9735,17.9998,158.818,18.419,161.436,...,8.8874,9.5773,4.0539,3.9313,4.1994,103.79,240.5329,364.106,151.545,2.606
1,1961,8790.14,4556.369,4233.771,13.538,107.6196,17.9244,167.811,19.094,173.912,...,8.9651,9.6573,4.09,3.9663,4.235,104.08,236.1826,373.167,152.016,6.109
2,1962,8969.046,4642.166,4326.881,13.8135,107.2866,17.8525,176.875,19.725,183.901,...,9.0281,9.7308,4.1211,3.9945,4.2681,104.617,232.1626,382.932,152.887,7.016
3,1963,9157.464,4732.954,4424.51,14.1037,106.9712,17.7876,186.264,20.344,192.935,...,9.0874,9.8036,4.1511,4.0207,4.301,105.263,228.239,393.236,153.919,6.681
4,1964,9355.514,4828.822,4526.692,14.4087,106.6744,17.7305,196.084,20.964,203.164,...,9.1502,9.8749,4.1812,4.0481,4.3326,105.942,224.3317,403.99,154.971,7.079


Split data for training, validaion, and test

In [39]:
train_year, val_year = 2010, 2020
df_train, df_val, df_test = df[df.Time <=train_year], df[(df.Time>train_year) & (df.Time<=val_year)], df[df.Time>val_year]
df_train_test = df[df.Time<=val_year]
df_train_test

Unnamed: 0,Time,TPopulation1July,TPopulationMale1July,TPopulationFemale1July,PopDensity,PopSexRatio,MedianAgePop,NatChange,NatChangeRT,PopChange,...,LE65Male,LE65Female,LE80,LE80Male,LE80Female,InfantDeaths,IMR,LBsurvivingAge1,Under5Deaths,NetMigrations
0,1960,8622.466,4476.521,4145.945,13.2797,107.9735,17.9998,158.818,18.419,161.436,...,8.8874,9.5773,4.0539,3.9313,4.1994,103.790,240.5329,364.106,151.545,2.606
1,1961,8790.140,4556.369,4233.771,13.5380,107.6196,17.9244,167.811,19.094,173.912,...,8.9651,9.6573,4.0900,3.9663,4.2350,104.080,236.1826,373.167,152.016,6.109
2,1962,8969.046,4642.166,4326.881,13.8135,107.2866,17.8525,176.875,19.725,183.901,...,9.0281,9.7308,4.1211,3.9945,4.2681,104.617,232.1626,382.932,152.887,7.016
3,1963,9157.464,4732.954,4424.510,14.1037,106.9712,17.7876,186.264,20.344,192.935,...,9.0874,9.8036,4.1511,4.0207,4.3010,105.263,228.2390,393.236,153.919,6.681
4,1964,9355.514,4828.822,4526.692,14.4087,106.6744,17.7305,196.084,20.964,203.164,...,9.1502,9.8749,4.1812,4.0481,4.3326,105.942,224.3317,403.990,154.971,7.079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19530,2016,14452.704,6796.658,7656.047,37.3600,88.7750,17.1543,358.160,24.730,298.237,...,12.4248,14.4471,6.7904,6.0570,7.3040,19.043,39.6049,464.459,27.570,-59.918
19531,2017,14751.101,6940.630,7810.471,38.1313,88.8632,17.3043,358.458,24.250,298.556,...,12.4335,14.5308,6.8135,6.0671,7.3342,18.619,38.7236,464.979,26.781,-59.918
19532,2018,15052.184,7086.002,7966.182,38.9096,88.9511,17.4780,363.531,24.102,303.610,...,12.4796,14.6291,6.8399,6.0785,7.3687,18.057,37.3543,468.496,25.638,-59.918
19533,2019,15354.608,7231.990,8122.618,39.6914,89.0352,17.6663,361.162,23.475,301.237,...,12.4448,14.6607,6.8614,6.0872,7.3948,18.038,37.1948,469.639,25.641,-59.918


In [40]:
y_train_test = df_train_test[['NetMigrations']].values.astype(float)
X_train_test = df_train_test.drop(['NetMigrations'], axis = 1).values.astype(float)

y_train = df_train[['NetMigrations']].values.astype(float)
X_train = df_train.drop(['NetMigrations'], axis = 1).values.astype(float)

y_val = df_val[['NetMigrations']].values.astype(float)
X_val = df_val.drop(['NetMigrations'], axis = 1).values.astype(float)

y_test = df_test[['NetMigrations']].values.astype(float)
X_test = df_test.drop(['NetMigrations'], axis = 1).values.astype(float)

Scale data

In [43]:
from sklearn.preprocessing import StandardScaler
def myScale(X, y):
  sc_X = StandardScaler()
  sc_y = StandardScaler()
  X = sc_X.fit_transform(X)
  y = sc_y.fit_transform(y)
  
  return X, y


X_train, y_train = myScale(X_train, y_train)
X_test, y_test = myScale(X_test, y_test)
X_val, y_val = myScale(X_val, y_val)

# Model Training & Validation

In [44]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):    # Regression metrics
    metrics_dict = {}

    try:
      explained_variance=metrics.explained_variance_score(y_true, y_pred)
      # print('explained_variance: ', round(explained_variance,4))       
      metrics_dict['explained_variance'] = round(explained_variance,4)
    except: pass
    
    try:
      mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
      mse=metrics.mean_squared_error(y_true, y_pred) 
      # print('MSE: ', round(mse,4))
      # print('RMSE: ', round(np.sqrt(mse),4))  
      metrics_dict['MSE'] = round(mse,4)
      metrics_dict['RMSE'] = round(np.sqrt(mse),4)
    except: pass
    
    try:
      mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
      # print('mean_squared_log_error: ', round(mean_squared_log_error,4))  
      metrics_dict['mean_squared_log_error'] = mean_squared_log_error
    except: pass
    
    try:
      median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
      # print('MAE: ', round(mean_absolute_error,4))  
      metrics_dict['MAE'] = round(mean_absolute_error,4)
    except: pass
    
    try:
      r2=metrics.r2_score(y_true, y_pred)
      # print('r2: ', round(r2,4))  
      metrics_dict['r2'] = round(r2,4)
    except: pass
    
    return metrics_dict

In [45]:
def save_eval(way, model, X_val, y_val, X_test, y_test):
  score_val = model.score(X_val, y_val)
  y_pred = model.predict(X_val)
  metric_results = regression_results(y_val, y_pred)
  mse_val = metric_results['MSE']
  rmse_val = metric_results['RMSE']

  # Evaluation for test data
  y_pred_rdf = model.predict(X_test)
  score_test = model.score(X_test, y_test)

  metric_results = regression_results(y_test, y_pred_rdf)

  metrics_model = {'model': [],
                  'accuracy_val':[], 'mse_val': [], 'rmse_val': [] ,
                  'accuracy_test':[], 'explained_variance_test': [], 'MSE_test': [], 'RMSE_test': [], 'MAE_test': [], 'r2_test': []}
  metrics_model['model'] += [way]
  metrics_model['mse_val'] += [mse_val]
  metrics_model['rmse_val'] += [rmse_val]
  metrics_model['accuracy_val'] += [score_val]
  metrics_model['accuracy_test'] += [score_test]
  for metric in metric_results:
    metrics_model[f'{metric}_test'] += [metric_results[metric]]
  
  return pd.DataFrame(metrics_model)



In [46]:
def save_pred(fileName, y_pred):
  df = pd.read_csv('UN_data_country_specific_till_20250.csv')
  df_test_toSave = df[df.Time<=val_year][['Location', 'Time', 'NetMigrations']]
  df_test_toSave = df_test_toSave.rename({'Location': 'Country', 'NetMigrations': 'Actual_Net_Migration'}, axis=1)
  df_test_toSave.reset_index()
  df_test_toSave['Predicted_Net_Migration'] = y_pred
  df_test_toSave.to_csv(fileName)

In [22]:
# df = pd.read_csv('UN_data_country_specific_till_20250.csv')
# df_test_toSave = df[df.Time<=val_year][['Location', 'Time', 'NetMigrations']]
# df_test_toSave = df_test_toSave.rename({'Location': 'Country', 'NetMigrations': 'Actual_Net_Migration'}, axis=1)
# df_test_toSave.reset_index()
X_train_test
# y_pred_rdf_X_train_test = model.predict(X_train_test)
# save_pred('SVR_RDF_Predicted_2010_2020.csv', y_pred_rdf_X_train_test)

array([[1.960000e+03, 8.622466e+03, 4.476521e+03, ..., 2.405329e+02,
        3.641060e+02, 1.515450e+02],
       [1.961000e+03, 8.790140e+03, 4.556369e+03, ..., 2.361826e+02,
        3.731670e+02, 1.520160e+02],
       [1.962000e+03, 8.969046e+03, 4.642166e+03, ..., 2.321626e+02,
        3.829320e+02, 1.528870e+02],
       ...,
       [2.013000e+03, 1.074500e+02, 5.505300e+01, ..., 1.637540e+01,
        1.739000e+00, 3.600000e-02],
       [2.014000e+03, 1.069120e+02, 5.477300e+01, ..., 1.476980e+01,
        1.718000e+00, 3.000000e-02],
       [2.015000e+03, 1.064820e+02, 5.453800e+01, ..., 2.059435e+01,
        7.911150e+01, 1.999000e+00]])

## SVR model with kernal rbf

In [47]:
model = SVR(kernel='rbf')

# Model Training
model.fit(X_train, y_train)
score_train = model.score(X_train, y_train)

# Predict and save for tain+val
y_pred_rdf_X_train_test = model.predict(X_train_test)
save_pred('SVR_RDF_Predicted_2010_2020.csv', y_pred_rdf_X_train_test)

# Evaluation 
metrics_svr_rbf = save_eval('SVR_RBF', model, X_val, y_val, X_test, y_test)
metrics_svr_rbf

  y = column_or_1d(y, warn=True)


Unnamed: 0,model,accuracy_val,mse_val,rmse_val,accuracy_test,explained_variance_test,MSE_test,RMSE_test,MAE_test,r2_test
0,SVR_RBF,0.568964,0.431,0.6565,0.285258,0.2858,0.7147,0.8454,0.2886,0.2853


## SVR model with kernal poly

In [48]:
model = SVR(kernel='poly')

# Model Training
model.fit(X_train, y_train)
score_train = model.score(X_train, y_train)

# Predict and save for tain+val
y_pred_rdf_X_train_test = model.predict(X_train_test)
save_pred('SVR_POLY_Predicted_2010_2020.csv', y_pred_rdf_X_train_test)

# Evaluation 
metrics_svr_poly = save_eval('SVR_POLY', model, X_val, y_val, X_test, y_test)
metrics_svr_poly

  y = column_or_1d(y, warn=True)


Unnamed: 0,model,accuracy_val,mse_val,rmse_val,accuracy_test,explained_variance_test,MSE_test,RMSE_test,MAE_test,r2_test
0,SVR_POLY,-1.324506,2.3245,1.5246,-6.415375,-6.4036,7.4154,2.7231,0.5875,-6.4154


## SVR model with kernal linear

### Use Cross-Validation to find out the best C for linear kernal
The C with **0.01** has the lowest MSE, which is the best C in this case

In [None]:
C_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

validation_dict = {'C':[],'accuracy_val':[],'MSE':[],'RMSE':[],'r2':[]}
for C_val in C_range:
  model = SVR(kernel='linear', C=C_val)

  # Model Training
  model.fit(X_train, y_train)
  score_train = model.score(X_train, y_train)

  # Model Validation
  score_val = model.score(X_val, y_val)
  y_pred = model.predict(X_val)
  metric_results = regression_results(y_val, y_pred)
  print(C_val, metric_results)

  validation_dict['C'] += [C_val]
  validation_dict['accuracy_val'] += [score_val]
  validation_dict['MSE'] += [metric_results['MSE']]
  validation_dict['RMSE'] += [metric_results['RMSE']]
  validation_dict['r2'] += [metric_results['r2']]
  

pd.DataFrame(validation_dict)

  y = column_or_1d(y, warn=True)


0.0001 {'explained_variance': 0.0788, 'MSE': 0.923, 'RMSE': 0.9607, 'MAE': 0.2755, 'r2': 0.077}


  y = column_or_1d(y, warn=True)


0.001 {'explained_variance': 0.1708, 'MSE': 0.8306, 'RMSE': 0.9114, 'MAE': 0.2696, 'r2': 0.1694}


  y = column_or_1d(y, warn=True)


0.01 {'explained_variance': 0.5936, 'MSE': 0.4073, 'RMSE': 0.6382, 'MAE': 0.2167, 'r2': 0.5927}


  y = column_or_1d(y, warn=True)


0.1 {'explained_variance': 0.2486, 'MSE': 0.7516, 'RMSE': 0.867, 'MAE': 0.2671, 'r2': 0.2484}


  y = column_or_1d(y, warn=True)


1 {'explained_variance': -0.2145, 'MSE': 1.2147, 'RMSE': 1.1021, 'MAE': 0.3239, 'r2': -0.2147}


  y = column_or_1d(y, warn=True)


10 {'explained_variance': -0.2093, 'MSE': 1.2093, 'RMSE': 1.0997, 'MAE': 0.3151, 'r2': -0.2093}


  y = column_or_1d(y, warn=True)


100 {'explained_variance': -0.2004, 'MSE': 1.2005, 'RMSE': 1.0957, 'MAE': 0.3138, 'r2': -0.2005}


Unnamed: 0,C,accuracy_val,MSE,RMSE,r2
0,0.0001,0.076986,0.923,0.9607,0.077
1,0.001,0.169387,0.8306,0.9114,0.1694
2,0.01,0.592734,0.4073,0.6382,0.5927
3,0.1,0.248362,0.7516,0.867,0.2484
4,1.0,-0.214709,1.2147,1.1021,-0.2147
5,10.0,-0.209309,1.2093,1.0997,-0.2093
6,100.0,-0.200452,1.2005,1.0957,-0.2005


In [49]:
BEST_C = 0.01

model = SVR(kernel='linear', C=BEST_C)

# Model Training
model.fit(X_train, y_train)
score_train = model.score(X_train, y_train)

# Predict and save for tain+val
y_pred_rdf_X_train_test = model.predict(X_train_test)
save_pred('SVR_Linear_Predicted_2010_2020.csv', y_pred_rdf_X_train_test)

# Evaluation 
metrics_svr_linear = save_eval('SVR_Linear', model, X_val, y_val, X_test, y_test)
metrics_svr_linear

  y = column_or_1d(y, warn=True)


Unnamed: 0,model,accuracy_val,mse_val,rmse_val,accuracy_test,explained_variance_test,MSE_test,RMSE_test,MAE_test,r2_test
0,SVR_Linear,0.571242,0.4288,0.6548,0.444287,0.4451,0.5557,0.7455,0.2004,0.4443


## Performance of all models
The SVR with rbg kernal performance the best with the lowest MSE and the hightest R2 for test data.

In [50]:
pd.concat([metrics_svr_rbf, metrics_svr_poly, metrics_svr_linear], join="inner")

Unnamed: 0,model,accuracy_val,mse_val,rmse_val,accuracy_test,explained_variance_test,MSE_test,RMSE_test,MAE_test,r2_test
0,SVR_RBF,0.568964,0.431,0.6565,0.285258,0.2858,0.7147,0.8454,0.2886,0.2853
0,SVR_POLY,-1.324506,2.3245,1.5246,-6.415375,-6.4036,7.4154,2.7231,0.5875,-6.4154
0,SVR_Linear,0.571242,0.4288,0.6548,0.444287,0.4451,0.5557,0.7455,0.2004,0.4443
