In [1]:
import numpy as np
from sklearn import ensemble
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import seaborn as sns
import pickle


In [2]:
# #############################################################################
# Load data
#data = pd.read_csv('../../RealData/RealData.csv')
data = pd.read_csv('../../SynData/data/GeneratedData.csv')



In [3]:
# #############################################################################
# Training and Testing Data
X = data[['throughput']]
y = data['CPU']
X = X.fillna(X.mean())
scaler = MinMaxScaler(feature_range=(0, 100))
X = scaler.fit_transform(X)
#X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=101)

n = len(X)
kf = KFold(n_splits=5)
fold = 0
for train_index, test_index in kf.split(X):
    fold += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [4]:
# #############################################################################
# Fit regression model
params = {'n_estimators': [50, 100, 200], 'max_depth':[2, 4, 6], 'min_samples_split': [2, 4, 12],
          'learning_rate': [0.1, 0.01, 0.001], 'loss': ['ls', 'lad', 'huber', 'quantile']}
gb = ensemble.GradientBoostingRegressor()
clf = GridSearchCV(gb, params, verbose=1, n_jobs=-1)

clf.fit(X_train, y_train)
print(clf.best_params_)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 595 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 957 out of 972 | elapsed:   26.3s remaining:    0.4s


{'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 4, 'min_samples_split': 12, 'n_estimators': 100}


[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed:   26.8s finished


In [5]:
prediction = clf.predict(X_test)
y_test = np.array(list(y_test))
prediction= np.array(prediction)
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': prediction.flatten()})
df

Unnamed: 0,Actual,Predicted
0,0.592078,0.588265
1,0.500662,0.507837
2,0.301197,0.300803
3,0.20963,0.208351
4,0.112262,0.112862
5,0.948188,0.949943
6,0.123777,0.128
7,0.368485,0.363392
8,0.659974,0.661505
9,0.227223,0.227826


In [6]:
# Plot training deviance

# compute test set deviance
test_score = np.zeros((100,), dtype=np.float64)

for i, y_pred in enumerate(clf.predict(X_test)):
    test_score[i] = clf.score(y_test, y_pred)

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')


ValueError: Expected 2D array, got 1D array instead:
array=[0.59207803 0.5006619  0.30119744 0.20962974 0.11226249 0.94818765
 0.12377669 0.36848524 0.6599742  0.22722273 0.11611299 0.20724559
 0.7484624  0.6077644  0.40753    0.87997526 0.6066824  0.78111887
 0.50150794 0.58542675 0.9334464  0.55443877 0.3047778  0.51456964
 0.493627   0.86918604 0.7592706  0.11919948 0.8828949  0.6130486
 0.7434831  0.3012217  0.09883588 0.05128798 0.23127033 0.3762537
 0.5229857  0.16743849 0.35976914 0.45143753 0.99036545 0.04931847
 0.21001355 0.33338794 0.48077703 0.54632765 0.9842923  0.7111902
 0.85504055 0.71394295 0.09617222 0.39796665 0.2759692  0.30598286
 0.8407216  0.75385654 0.23901713 0.94291764 0.6385752  0.9123476 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
#Model Evaluation
print('MAE:', metrics.mean_absolute_error(y_test, prediction))
print('MSE:', metrics.mean_squared_error(y_test, prediction))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, prediction)))



In [None]:
sns.distplot(y_test-prediction,bins=30)

In [None]:
# save the model to disk
filename = '../models/gradientBoostingReg_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [None]:
clf.predict([[0]])