In [76]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [77]:
df = pd.read_csv('./datasets/cleaned-migration.csv')
df = df[['year',  'migration_no', 'iso_alpha']]
df = df.sort_values(by=['migration_no'], ascending = False)

df

Unnamed: 0,year,migration_no,iso_alpha
1033,1968,80205,GBR
1082,1969,76336,GBR
935,1966,75514,GBR
837,1964,74754,GBR
886,1965,74749,GBR
...,...,...,...
5199,2000,5,UKR
5200,2000,5,VNM
2431,1983,5,NRU
5290,2001,5,GUM


In [78]:
encode = preprocessing.LabelEncoder()
encode.fit(df.iso_alpha)
df.iso_alpha=encode.transform(df.iso_alpha)
df.iso_alpha


1033     69
1082     69
935      69
837      69
886      69
       ... 
5199    199
5200    206
2431    146
5290     82
4052    138
Name: iso_alpha, Length: 8105, dtype: int64

In [79]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['migration_no'], axis = 1) 
y = df.migration_no.values  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score

In [81]:
lm = LinearRegression()
lm.fit(X_train, y_train)
np.mean(cross_val_score(lm,X_train,y_train, scoring = 'neg_mean_absolute_error', cv = 3))  

-1029.388511963393

In [82]:
lm_l = Lasso(alpha=0.7)
lm_l.fit(X_train,y_train)
np.mean(cross_val_score(lm_l,X_train,y_train, scoring = 'neg_mean_absolute_error', cv = 3))

-1029.3747310049582

In [83]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
np.mean(cross_val_score(rf,X_train,y_train,scoring = 'neg_mean_absolute_error', cv= 3))  

-253.6870016222773

In [84]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(min_samples_leaf=.01)
dtr.fit(X_train,y_train)
np.mean(cross_val_score(dtr,X_train,y_train, scoring = 'neg_mean_absolute_error', cv = 3))  

-820.163422992653

In [85]:
# Predict
tpred_lm = lm.predict(X_test)
tpred_lml = lm_l.predict(X_test)
tpred_rf = rf.predict(X_test)
tpred_dtr = dtr.predict(X_test)

In [86]:
# Performance
from sklearn.metrics import mean_absolute_error
from sklearn import metrics

mean_absolute_error(y_test,tpred_lm) # Linear Regression
print('MAE: %d' % (y_test != tpred_lm).sum())
print('R2 Score: %.2f' % metrics.r2_score(y_test, tpred_lm))

MAE: 1621
R2 Score: 0.04


In [87]:
mean_absolute_error(y_test,tpred_lml) # Lasso Regression
print('MAE: %d' % (y_test != tpred_lml).sum())
print('R2 Score: %.2f' % metrics.r2_score(y_test, tpred_lml))

MAE: 1621
R2 Score: 0.04


In [88]:
mean_absolute_error(y_test,tpred_dtr) # Decision Tree Regressor
print('MAE: %d' % (y_test != tpred_dtr).sum())
print('R2 Score: %.2f' % metrics.r2_score(y_test, tpred_dtr))

MAE: 1620
R2 Score: 0.21


In [91]:
import joblib
joblib.dump(dtr,'trained_model')

model = joblib.load('trained_model')

In [95]:
predicts = pd.DataFrame(data = [[1980,69]], columns=['year', 'iso_alpha'])
pred = model.predict(predicts)
print("Predict {}".format((pred[0])))

Predict 20685.028169014084
