In [4]:
import pandas as pd
import numpy as np
import bamboolib
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.dummy import DummyRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

### Import Data

In [6]:
data_2006_x = pd.read_csv('../Model_training/x_df_2006_data.csv')
data_2006_y = pd.read_csv('../Model_training/y_for_2006_data2011.csv')
data_2011_x = pd.read_csv('../Model_training/x_df_2011_data.csv')
data_2011_y = pd.read_csv('../Model_training/y_for_2011_data2016.csv')
data_2016_x = pd.read_csv('../Model_training/x_df_2016_data.csv')

In [7]:
data_2006_x = data_2006_x.dropna(how='any', axis=1)
data_2011_x = data_2011_x.dropna(how='any', axis=1)
data_2016_x = data_2016_x.dropna(how='any', axis=1)

### set up training data

In [8]:
#join 2006 and 2011 data
data_x = [data_2006_x, data_2011_x]
x = pd.concat(data_x)
x = x.dropna(how={'any'},thresh = 1000, axis=1)

In [9]:
#shuffle data from 2006 and 2011 in order to split training and testing data
data_y = [data_2006_y, data_2011_y]
y_a = pd.concat(data_y)
#shuffle
X, y = shuffle(x, y_a, random_state=13)

#separate the LGA names since it is not needed for running model
#note that the LGA codes have been shuffled with the data so they will still follow the order of the model data
X_a = X.iloc[:,2:]
lgas = X.iloc[:,:2]
y_a =  y.iloc[:,1:]

#drop values that are not found in both census 2006 and 2011
X = X_a.dropna(how='any', axis=1)

#ensure data type is float and check that there are no nan values
X = X_a.astype(np.float32)
y_a = y_a.astype(np.float32)
np.any(np.isnan(X))

False

### Split data in traning and test sets

In [10]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y_a, test_size=0.2)


### Baseline 1 - Mean Prediction

In [18]:
dummy_baseline_model = MultiOutputRegressor(DummyRegressor(strategy="mean")).fit(X_train,y_train)
#predict training data
dummy_predictions_train = dummy_baseline_model.predict(X_train)
#predict test data
dummy_predictions = dummy_baseline_model.predict(X_test)

#scoring metrics training data 
dummy_scorer2t = r2_score(y_train,dummy_predictions_train,multioutput='uniform_average')
dummy_mset = mean_squared_error(y_train,dummy_predictions_train,multioutput='uniform_average')
dummy_maet = mean_absolute_error(y_train,dummy_predictions_train,multioutput='uniform_average')

#scoring metrics test data
dummy_scorer2 = r2_score(y_test,dummy_predictions,multioutput='uniform_average')
dummy_mse = mean_squared_error(y_test,dummy_predictions,multioutput='uniform_average')
dummy_mae = mean_absolute_error(y_test,dummy_predictions,multioutput='uniform_average')
print("TRAIN ERROR")
print("dummy_r2 ",dummy_scorer2t,"dummy_mse ",dummy_mset,"dummy_mae ",dummy_maet)
print("TEST ERROR")
print("dummy_r2 ",dummy_scorer2,"dummy_mse ",dummy_mse,"dummy_mae ",dummy_mae)

TRAIN ERROR
dummy_r2  0.10566037735849057 dummy_mse  10145817.0 dummy_mae  163.21056
TEST ERROR
dummy_r2  0.07878015374236474 dummy_mse  7142029.0 dummy_mae  160.33452


### Baseline 2 - Sequence Projection

For baseline 2 please refer to ```Baseline_2.ipynb``` file. Baseline 2 requires the input data to be in a different shape to the training data in this notebook. This is due to data from baseline 2 uses all data from 2006 and 2011 to measure the percentual difference between the census. And then uses this data to predict 2016, which is then compared to ground truth labels to calculate the metrics and scoring.
The data in this notebook has 2006 and 2011 shuffled in order to train the model with the labels respecting to the next time window.

### Random Forest Regressor

In [43]:
random_forest_model = MultiOutputRegressor(RandomForestRegressor(n_estimators = 150,warm_start=True,n_jobs= -1)).fit(X_train,y_train)

In [44]:
#predict training data
random_forest_train_predictions = random_forest_model.predict(X_train)
#predict test data
random_forest_predictions = random_forest_model.predict(X_test)

#scoring metrics training data 
random_forest_train_score = r2_score(y_train,random_forest_train_predictions,multioutput='uniform_average')
random_forest_train_mse= mean_squared_error(y_train,random_forest_train_predictions,multioutput='uniform_average')
random_forest_train_mae = mean_absolute_error(y_train,random_forest_train_predictions,multioutput='uniform_average')

#scoring metrics test data
random_forest_score = r2_score(y_test,random_forest_predictions,multioutput='uniform_average')
random_forest_mse= mean_squared_error(y_test,random_forest_predictions,multioutput='uniform_average')
random_forest_mae = mean_absolute_error(y_test,random_forest_predictions,multioutput='uniform_average')

print("TRAIN ERROR")
print("rf_r2 ",random_forest_train_score,"rf_mse ",random_forest_train_mse,"rf_mae ",random_forest_train_mae)
print("TEST ERROR")
print("rf_r2 ",random_forest_score,"rf_mse ",random_forest_mse,"rf_mae ",random_forest_mae)

TRAIN ERROR
rf_r2  0.9490598127127312 rf_mse  83531.47303960622 rf_mae  6.740944197804907
TEST ERROR
rf_r2  -0.3288632810566601 rf_mse  173567.83459016567 rf_mae  14.428142311012456


### KNN Regressor

In [22]:
#K Neighbours regressor
KNN_reg_model = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=10,n_jobs=-1)).fit(X_train,y_train)

In [23]:
#predict train data
KNN_predictions_train = KNN_reg_model.predict(X_train)
#predict test data
KNN_predictions = KNN_reg_model.predict(X_test)

#scoring metrics training data 
KNN_train_score = r2_score(y_train,KNN_predictions_train,multioutput='uniform_average')
KNN_train_mse= mean_squared_error(y_train,KNN_predictions_train,multioutput='uniform_average')
KNN_train_mae = mean_absolute_error(y_train,KNN_predictions_train,multioutput='uniform_average')

#scoring metrics test data
KNN_score = r2_score(y_test,KNN_predictions,multioutput='uniform_average')
KNN_mse= mean_squared_error(y_test,KNN_predictions,multioutput='uniform_average')
KNN_mae = mean_absolute_error(y_test,KNN_predictions,multioutput='uniform_average')

print("TRAIN ERROR")
print("rf_r2 ",KNN_train_score,"rf_mse ",KNN_train_mse,"rf_mae ",KNN_train_mae)
print("TEST ERROR")
print("rf_r2 ",KNN_score,"rf_mse ",KNN_mse,"rf_mae ",KNN_mae)


TRAIN ERROR
rf_r2  0.4923267158776316 rf_mse  1686027.6 rf_mae  32.166813
TEST ERROR
rf_r2  0.19721880862922025 rf_mse  362041.25 rf_mae  29.946348


### Gaussian Process Regressor

In [24]:
GP_reg_model = MultiOutputRegressor(GaussianProcessRegressor()).fit(X_train,y_train)

In [25]:
#predict train data
GP_predictions_train = GP_reg_model.predict(X_train)
#predict test data
GP_predictions = GP_reg_model.predict(X_test)

#scoring metrics training data 
GP_train_score = r2_score(y_train,GP_predictions_train,multioutput='uniform_average')
GP_train_mse= mean_squared_error(y_train,GP_predictions_train,multioutput='uniform_average')
GP_train_mae = mean_absolute_error(y_train,GP_predictions_train,multioutput='uniform_average')

#scoring metrics test data
GP_score = r2_score(y_test,GP_predictions,multioutput='uniform_average')
GP_mse= mean_squared_error(y_test,GP_predictions,multioutput='uniform_average')
GP_mae = mean_absolute_error(y_test,GP_predictions,multioutput='uniform_average')

print("TRAIN ERROR")
print("rf_r2 ",GP_train_score,"rf_mse ",GP_train_mse,"rf_mae ",GP_train_mae)
print("TEST ERROR")
print("rf_r2 ",GP_score,"rf_mse ",GP_mse,"rf_mae ",GP_mae)


TRAIN ERROR
rf_r2  1.0 rf_mse  1.281484008082509e-13 rf_mae  1.4270309614922139e-08
TEST ERROR
rf_r2  0.09181876285915229 rf_mse  9870936.497410282 rf_mae  140.2006289308176


### XGboost

In [26]:
xgb_model = MultiOutputRegressor(XGBRegressor()).fit(X_train,y_train)

In [27]:
#predict train data
xgb_predictions_train = xgb_model.predict(X_train)
#predict test data
xgb_predictions = xgb_model.predict(X_test)

#scoring metrics training data 
xgb_train_score = r2_score(y_train,xgb_predictions_train,multioutput='uniform_average')
xgb_train_mse= mean_squared_error(y_train,xgb_predictions_train,multioutput='uniform_average')
xgb_train_mae = mean_absolute_error(y_train,xgb_predictions_train,multioutput='uniform_average')

#scoring metrics test data
xgb_score = r2_score(y_test,xgb_predictions,multioutput='uniform_average')
xgb_mse= mean_squared_error(y_test,xgb_predictions,multioutput='uniform_average')
xgb_mae = mean_absolute_error(y_test,xgb_predictions,multioutput='uniform_average')

print("TRAIN ERROR")
print("rf_r2 ",xgb_train_score,"rf_mse ",xgb_train_mse,"rf_mae ",xgb_train_mae)
print("TEST ERROR")
print("rf_r2 ",xgb_score,"rf_mse ",xgb_mse,"rf_mae ",xgb_mae)


TRAIN ERROR
rf_r2  0.8943395306305532 rf_mse  0.3948036 rf_mae  0.04674477
TEST ERROR
rf_r2  -4.290975482687937 rf_mse  117015.69 rf_mae  14.689305


### Support Vector Machine Regressor

In [30]:
svr_model = MultiOutputRegressor(SVR()).fit(X_train,y_train)

In [31]:
#predict train data
svr_predictions_train = svr_model.predict(X_train)
#predict test data
svr_predictions = svr_model.predict(X_test)

#scoring metrics training data 
svr_train_score = r2_score(y_train,svr_predictions_train,multioutput='uniform_average')
svr_train_mse= mean_squared_error(y_train,svr_predictions_train,multioutput='uniform_average')
svr_train_mae = mean_absolute_error(y_train,svr_predictions_train,multioutput='uniform_average')

#scoring metrics test data
svr_score = r2_score(y_test,svr_predictions,multioutput='uniform_average')
svr_mse= mean_squared_error(y_test,svr_predictions,multioutput='uniform_average')
svr_mae = mean_absolute_error(y_test,svr_predictions,multioutput='uniform_average')

print("TRAIN ERROR")
print("rf_r2 ",svr_train_score,"rf_mse ",svr_train_mse,"rf_mae ",svr_train_mae)
print("TEST ERROR")
print("rf_r2 ",svr_score,"rf_mse ",svr_mse,"rf_mae ",svr_mae)


TRAIN ERROR
rf_r2  0.2199786571160182 rf_mse  11241125.234130554 rf_mae  124.93723306669055
TEST ERROR
rf_r2  0.2206270738356772 rf_mse  8276734.672657772 rf_mae  122.29774824929082


### Stacking Ensemble of Regressors

This technique will use the top three regressors from the previous experiment and then will implement a GradientBooster to find the best option to choose given an specific instance

In [35]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
rf=RandomForestRegressor(warm_start = True,n_estimators = 150)#random_forest_model
gp = GaussianProcessRegressor()#GP_reg_model#
xgb = XGBRegressor(warm_start = True)#xgb_model#
estimators = [('rf',rf),('gp',gp),('xgb',xgb)]
final_estimator = GradientBoostingRegressor(warm_start = True,n_estimators=200,subsample=1, min_samples_leaf=25,random_state=42,criterion='mae')

stacked_f_reg = MultiOutputRegressor(StackingRegressor(estimators=estimators,final_estimator=final_estimator,n_jobs=-1, cv=5)).fit(X_train,y_train)

In [36]:
#predict train data
stacked_f_reg_predictions = stacked_f_reg.predict(X_train)
#predict train data
stacked_f_reg_predictions_test = stacked_f_reg.predict(X_test)


#scoring metrics training data 
stacked_train_score = r2_score(y_train,stacked_f_reg_predictions,multioutput='uniform_average')
stacked_train_mse= mean_squared_error(y_train,stacked_f_reg_predictions,multioutput='uniform_average')
stacked_train_mae = mean_absolute_error(y_train,stacked_f_reg_predictions,multioutput='uniform_average')

#scoring metrics test data
stacked_score = r2_score(y_test,stacked_f_reg_predictions_test,multioutput='uniform_average')
stacked_mse= mean_squared_error(y_test,stacked_f_reg_predictions_test,multioutput='uniform_average')
stacked_mae = mean_absolute_error(y_test,stacked_f_reg_predictions_test,multioutput='uniform_average')

print("TRAIN ERROR")
print("rf_r2 ",stacked_train_score,"rf_mse ",stacked_train_mse,"rf_mae ",stacked_train_mae)
print("TEST ERROR")
print("rf_r2 ",stacked_score,"rf_mse ",stacked_mse,"rf_mae ",stacked_mae)

TRAIN ERROR
rf_r2  0.5039255708142446 rf_mse  3223502.583870516 rf_mae  21.008128522706436
TEST ERROR
rf_r2  0.4989274090862527 rf_mse  655816.5031451004 rf_mae  21.600725856856776


### Voting Ensemble

In [46]:
#SUM PREDICTIONS AND AVERAGE THE RESULT OF THE MODELS TO GENERATE A STACKED VERSION

sum_train_predictions = np.add(random_forest_train_predictions, GP_predictions_train,xgb_predictions_train)
avg_train_predictions = sum_train_predictions / 3

sum_predictions = np.add(random_forest_predictions, GP_predictions,xgb_predictions)
avg_predictions = sum_predictions / 3



#print("train r2 is:",stack_train_score,"test r2 is:",stack_score)


#scoring metrics training data 
voting_train_score = r2_score(y_train,avg_train_predictions,multioutput='uniform_average')
voting_train_mse= mean_squared_error(y_train,avg_train_predictions,multioutput='uniform_average')
voting_train_mae = mean_absolute_error(y_train,avg_train_predictions,multioutput='uniform_average')

#scoring metrics test data
voting_score = r2_score(y_test,avg_predictions,multioutput='uniform_average')
voting_mse= mean_squared_error(y_test,avg_predictions,multioutput='uniform_average')
voting_mae = mean_absolute_error(y_test,avg_predictions,multioutput='uniform_average')
print("TRAIN ERROR")
print("rf_r2 ",voting_train_score,"rf_mse ",voting_train_mse,"rf_mae ",voting_train_mae)
print("TEST ERROR")
print("rf_r2 ",voting_score,"rf_mse ",voting_mse,"rf_mae ",voting_mae)

TRAIN ERROR
rf_r2  0.8581664257024147 rf_mse  1557955.1 rf_mae  47.85888
TEST ERROR
rf_r2  0.27350019484352994 rf_mse  4464548.0 rf_mae  93.11902


In [49]:
from sklearn.datasets import make_classification
from sklearn.model_selection import learning_curve
import matplotlib as plt
def evaluate(g,x,y):
    train_sizes, train_scores, valid_scores = learning_curve(g,x,y, scoring = 'accuracy', cv =10)
    plt.figure()
    plt.xlabel("training size")
    plt.ylabel("accuracy")
    plt.plot(train_sizes,  np.mean(train_scores, axis=1), 'o-', color="r",
                     label="Training score")
    plt.plot(train_sizes,  np.mean(valid_scores, axis=1), 'o-', color="g",
                     label="Cross-validation score")
    plt.legend(loc="best")
    return plt.show()





In [50]:
evaluate(random_forest_model,X,y_a)

TypeError: 'module' object is not callable

In [None]:
evaluate(KNN_reg_model,X,y_a)

In [None]:
evaluate(GP_reg_model,X,y_a)

In [None]:
evaluate(xgb_model,X,y_a)

In [None]:
evaluate(svr_model,X,y_a)

In [None]:

#save classifier

from joblib import dump, load

dump(random_forest_model,'random_forest_clf.joblib')
dump(GP_reg_model,'gaussian_processReg_model.joblib')
dump(xgb_model,'xgb_clf.joblib')

