### Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn import feature_selection as fs
from sklearn import preprocessing as pp
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from boruta import BorutaPy

### Feature selection functions

In [2]:
def read_and_get_data():
    train_data = pd.read_csv('./training.csv')
    test_data = pd.read_csv('./testing.csv')

    x_train = train_data.values[:,2:28]
    y_train = train_data['Appliances'].values

    x_test = test_data.values[:,2:28]
    y_test = test_data['Appliances'].values
    return x_train, y_train, x_test, y_test

def low_variance_selector(var_per):
    return fs.VarianceThreshold(threshold=(var_per*(1-var_per)))

def univariant_selector(univariant_per):
    return fs.SelectPercentile(fs.mutual_info_regression, percentile=univariant_per)

def RFE_selector(num):
    return fs.RFE(estimator=RandomForestRegressor(max_depth=100), n_features_to_select=num, step = 1)

def boruta_selector():
    return BorutaPy(RandomForestRegressor(max_depth=100), n_estimators='auto', verbose=1, random_state=1)

def min_max_scaler():
    return pp.MinMaxScaler(feature_range = (-1,1))

### Pipelines

In [3]:
def gen_linear_pipeline(vp, up, num, degree=2):
    return make_pipeline(low_variance_selector(vp),
                         univariant_selector(up),
                         RFE_selector(num),
                         pp.PolynomialFeatures(degree),
                         LinearRegression()
                        )

def gen_rf_pipeline(vp, up, num, md, ne, degree=2):
    return make_pipeline(low_variance_selector(vp),
                         univariant_selector(up),
                         RFE_selector(num),
                         pp.PolynomialFeatures(degree),
                         RandomForestRegressor(max_depth=md, n_estimators=ne)
                        )

def gen_nn_pipeline(vp, up, num, hls, al=0.0001, degree=2):
    return make_pipeline(low_variance_selector(vp),
                         univariant_selector(up),
                         RFE_selector(num),
                         min_max_scaler(),
                         pp.PolynomialFeatures(degree),
                         MLPRegressor(hidden_layer_sizes=hls, alpha=al)
                        )

### Test pipelines

In [4]:
def test_regressor(x_train,x_test,y_train,y_test,degree):
    s1 = low_variance_selector(0.8)
    s2 = univariant_selector(90)
    s3 = RFE_selector(15)
    #s3 = boruta_selector()
    s4 = pp.PolynomialFeatures(degree)
    s1.fit(x_train, y_train)
    x_transform = s1.transform(x_train)
    s2.fit(x_transform, y_train)
    x_transform = s2.transform(x_transform)
    s3.fit(x_transform, y_train)
    x_transform = s3.transform(x_transform)
    s4.fit(x_transform, y_train)
    x_transform = s4.transform(x_transform)
    rf = RandomForestRegressor(max_depth = 100)
    rf.fit(x_transform,y_train)

    test_transform = s1.transform(x_test)
    test_transform = s2.transform(test_transform)
    test_transform = s3.transform(test_transform)
    test_transform = s4.transform(test_transform)
    y_pred = rf.predict(test_transform)
    return y_pred

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def cal_errors(y_test,y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    rms = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print('MAE = {}, RMS = {}, R2 = {}, MAPE = {}'.format(mae,rms,r2,mape))

def draw_plot(y_test,y_pred):
    plt.figure(figsize=(10,10))
    plt.plot(np.arange(len(y_pred)), np.sort(y_pred), color='blue', linewidth=1)

    plt.plot(np.arange(len(y_test)), np.sort(y_test), color='red', linewidth=1)

    plt.xticks(())
    plt.yticks(())

    plt.show()

In [None]:
degrees = {0,1,2,3,4,5}
results = list()
for degree in degrees:
    x_train, y_train, x_test, y_test = read_and_get_data()
    y_pred = test_regressor(x_train,x_test,y_train,y_test,degree)
    print('testing for degree :{}'.format(degree))
    cal_errors(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    results.append(mape)
print(results)



testing for degree :0
MAE = 59.75010391955467, RMS = 10325.315341667474, R2 = -0.00019220599252389725, MAPE = 76.10710955159414




### Validation_curve for different pipelines(to find the best degree)

In [None]:
degree = np.arange(0, 21)
train_score, val_score = validation_curve(gen_rf_pipeline(0.8,5,25,), x_train, y_train,
                                          'polynomialfeatures__degree', scoring='precision' degree, cv=5)

plt.plot(degree, np.median(train_score, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');

In [None]:
degree = np.arange(0, 5)
train_score, val_score = validation_curve(gen_rf_pipeline(vp=0.8,up=5,num=25,md=100,ne=50), x_train, y_train,
                                          'polynomialfeatures__degree', scoring = 'neg_mean_squared_log_error', param_range = degree, cv=5)

plt.plot(degree, np.mean(train_score), color='blue', label='training score')
plt.plot(degree, np.mean(val_score), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');

### Test

In [None]:
x_train, y_train, x_test, y_test = read_and_get_data()
nn = gen_linear_pipeline(0.8,90,15,degree=1)
nn.fit(x_train,y_train)
y_pred = nn.predict(x_test)
cal_errors(y_pred,y_test)

### Cross validation

In [None]:
score_result = cross_val_score(nn, x_test, y_test, cv = 10)

In [None]:
score_result