In [1]:
%matplotlib notebook

from scipy.io import loadmat

 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

mat = loadmat("Project Datasets/powerTrainData.mat")

# X and y correspond to a training set that your model will learn on.
Xtrain = mat["powerTrainInput"]
ytrain = mat["powerTrainOutput"]

print('shape of cnTrainX', Xtrain.shape)
print('shape of cnTrainy', ytrain.shape)

Xtrain = Xtrain.transpose()
ytrain = ytrain.transpose()

print('Xtrain shape:', Xtrain.shape)
print('ytrain shape:', ytrain.shape)


print("Xtrain")
print(Xtrain[:5,:5])
print("ytrain")
print(ytrain[:5])


shape of cnTrainX (15, 844)
shape of cnTrainy (1, 844)
Xtrain shape: (844, 15)
ytrain shape: (844, 1)
Xtrain
[[3240.  2546.7 2438.3 3240.  3240. ]
 [1871.  1793.8 1709.7 2094.  2094. ]
 [1929.  1873.  1839.3 2177.  2315. ]
 [1837.  1468.6 1551.5 1837.  1956. ]
 [2209.  1777.8 1860.8 2209.  2284. ]]
ytrain
[[3359]
 [1868]
 [1943]
 [1900]
 [2252]]


In [2]:
# Standardizing the Data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(Xtrain)

# Apply transform to both the training set and the validation set.
Xtrain = scaler.transform(Xtrain)

# print("Xtrain")
print(Xtrain[:5,:5])

[[ 1.53530519  1.59593531  1.41013676  1.50372851  1.16726513]
 [-1.05522651 -0.47134155 -0.68542807 -0.78149918 -1.00026294]
 [-0.94547425 -0.25387798 -0.31267869 -0.61598967 -0.58226669]
 [-1.11956404 -1.36426018 -1.14043542 -1.29398043 -1.26127418]
 [-0.41563578 -0.51527359 -0.25084141 -0.55217878 -0.64089965]]


In [3]:
# using PCA for Data Visualization
from sklearn.decomposition import PCA

pca_plot =PCA(n_components=2)

pca_plot.fit(Xtrain)

Xtrain_pca_plot = pca_plot.transform(Xtrain)

print("original shape:   ", Xtrain.shape)
print("transformed shape:", Xtrain_pca_plot.shape)

print('Variance ratio corresponds to the data with PCA:')
print(pca_plot.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.scatter(Xtrain_pca_plot[:, 0], Xtrain_pca_plot[:, 1], color="green", marker="o")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
print("end")


original shape:    (844, 15)
transformed shape: (844, 2)
Variance ratio corresponds to the data with PCA:
[0.71291343 0.08305948]


<IPython.core.display.Javascript object>

end


In [4]:
# using PCA for Training the model
from sklearn.decomposition import PCA

print("original shape:", Xtrain.shape)

pca =PCA(n_components=10)

pca.fit(Xtrain)

Xtrain_pca = pca.transform(Xtrain)

print("transformed shape:", Xtrain_pca.shape)

print('Variance retained corresponds to the data after reducing with PCA:')
print(pca.explained_variance_ratio_)

original shape: (844, 15)
transformed shape: (844, 10)
Variance retained corresponds to the data after reducing with PCA:
[0.71291343 0.08305948 0.06661705 0.04012179 0.03533515 0.02463563
 0.01303532 0.00938495 0.00611938 0.00322591]


In [5]:
# using Feature Selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

print("original shape:", Xtrain.shape)

# Removing constant features
constant_filter = VarianceThreshold(threshold=0)

constant_filter.fit(Xtrain)

Xtrain = constant_filter.transform(Xtrain)

# Removing quasi features
quasi_filter = VarianceThreshold(threshold=0.01)

quasi_filter.fit(Xtrain)

Xtrain = quasi_filter.transform(Xtrain)

print("transformed shape:", Xtrain.shape)

print("transformed shape:", Xtrain.shape)

original shape: (844, 15)
transformed shape: (844, 15)
transformed shape: (844, 15)


In [6]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# X and y data as traing and test set 

Xtrain, Xtest, ytrain, ytest = train_test_split(Xtrain_pca, ytrain, test_size=0.2, random_state=0)

print('shapes of training and validation data after splitiing',Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)

# using KNN Regression from sklearn

# Create KNN regression object
regression_model = KNeighborsRegressor()

# Train the model using the training sets
regression_model.fit(Xtrain, ytrain)

# Make predictions using the testing set
ypred = regression_model.predict(Xtest)

print("ytest\n",ytest[:5])
print("ypred\n",ypred[:5])



shapes of training and validation data after splitiing (675, 10) (169, 10) (675, 1) (169, 1)
ytest
 [[2762]
 [1960]
 [3494]
 [2566]
 [2007]]
ypred
 [[2823.2]
 [1910.6]
 [3432.2]
 [2620.8]
 [1968.2]]


In [7]:
print(Xtrain.shape) 
print(ytrain.shape) 
print(Xtest.shape)  
print(ytest.shape) 
print(ypred.shape)

(675, 10)
(675, 1)
(169, 10)
(169, 1)
(169, 1)


In [8]:
#Calculating the errors
from sklearn import metrics
print("mean_squared_error:",metrics.mean_squared_error(ytest,ypred))
print("mean_absolute_error:",metrics.mean_absolute_error(ytest,ypred))
print("root_mean_squared_error:",np.sqrt(metrics.mean_squared_error(ytest,ypred)))


mean_squared_error: 12380.366390532545
mean_absolute_error: 84.4165680473373
root_mean_squared_error: 111.26709482381817


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

#Hyper Parameters Set
n_neighbors = list(range(1,30))
params = {'n_neighbors':n_neighbors, 'weights':['uniform', 'distance']}

#Making models with hyper parameters sets
model_knn = GridSearchCV(regression_model, param_grid=params, cv=10)

#Learning
model_knn.fit(Xtrain,ytrain)

#The best hyper parameters set
print("Best Hyper Parameters with GridSearchCV:\n",model_knn.best_params_)

#Prediction
ypred=model_knn.predict(Xtest)
print(ytest[:5])
print(ypred[:5])

# Using Cross Validation To Evaluate Model
CV_Result = cross_val_score(model_knn, Xtrain, ytrain, cv=10, n_jobs=-1)
print(); print('cross_val_Result:',CV_Result)
print(); print('cross_val_Mean:', CV_Result.mean())
print(); print('cross_val_Std:', CV_Result.std())    

Best Hyper Parameters with GridSearchCV:
 {'n_neighbors': 4, 'weights': 'distance'}
[[2762]
 [1960]
 [3494]
 [2566]
 [2007]]
[[2828.66820543]
 [1939.35455495]
 [3443.36633828]
 [2528.71063457]
 [1958.12128332]]

cross_val_Result: [0.96669395 0.96764519 0.94519145 0.95524873 0.95701781 0.96688229
 0.9477346  0.96919076 0.96022246 0.94397931]

cross_val_Mean: 0.9579806543554357

cross_val_Std: 0.009231628647937721


In [10]:
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from math import sqrt

print ("Cross Validation:")
k_values = list(range(1,30))
cv_scores ={}

# perform 10-fold cross validation
for k in k_values:
    knn = KNeighborsRegressor(n_neighbors = k)
    scores = cross_val_score(knn, Xtrain, ytrain, cv=10, scoring='neg_mean_squared_error')
    rmse= np.sqrt(abs(scores))
    cv_scores.update({k:(rmse.mean())})
print ("cv_scores:\n", cv_scores)


for i,j in cv_scores.items():
    if j==min(cv_scores.values()):
        print ("\nThe best K value is ",i," and the error  is ",j)

# plot
fig, ax = plt.subplots()

# plot the "K" vs "RMSE"
ax.plot(list(cv_scores.keys()), list(cv_scores.values()))

ax.set_xlabel("K")
ax.set_ylabel("RMS Error")
ax.grid()
fig.show()

Cross Validation:
cv_scores:
 {1: 131.17220563761165, 2: 113.28558221560192, 3: 109.06024246092878, 4: 108.69577789208574, 5: 110.83959447725951, 6: 113.32527181291857, 7: 112.95726674099924, 8: 113.6138330519475, 9: 115.1934659830622, 10: 116.02466131830806, 11: 116.53866316757016, 12: 118.06273856471262, 13: 119.38889247949268, 14: 120.12876767081666, 15: 121.20652765389696, 16: 122.03266329935461, 17: 122.63805906694809, 18: 123.33819837829519, 19: 124.5296033392062, 20: 125.42436983054044, 21: 125.68136082696276, 22: 126.94892818685403, 23: 127.4812608533801, 24: 128.1330343636355, 25: 128.8819678282388, 26: 129.6132315929906, 27: 129.62595370789376, 28: 130.053695391486, 29: 131.16672046920036}

The best K value is  4  and the error  is  108.69577789208574


<IPython.core.display.Javascript object>

In [11]:
# Training KNN Regression with the best hyper parameters found through cross-validation
regression_best_knn = KNeighborsRegressor(n_neighbors=8)

regression_best_knn.fit(Xtrain,ytrain)

print(Xtrain.shape)
#Prediction
ypred = regression_best_knn.predict(Xtest)

from sklearn.metrics import mean_squared_error, mean_absolute_error

# calculate mean_squared_error
mean = mean_squared_error(ytest,ypred)
print("mean_squared_error:",format(mean))
# calculate mean_absolute_error
absolute = mean_absolute_error(ytest,ypred)
print("mean_absolute_error:",format(absolute))
rmse = np.sqrt(mean_squared_error(ytest,ypred))
print("root_mean_sqrt_error:",format(rmse))

(675, 10)
mean_squared_error: 12174.186482988165
mean_absolute_error: 82.81139053254438
root_mean_sqrt_error: 110.3366959945247


In [12]:
# using the test data powerTestInput correspond to a test set.
XtestSep = mat["powerTestInput"]
print("Xtest shape",XtestSep.shape)
XtestSep = np.transpose(XtestSep)
print("Xtest shape after reshape",XtestSep.shape)
print(XtestSep[:5,:5])

XtestSep = scaler.transform(XtestSep)
from sklearn.decomposition import PCA
pca =PCA(n_components=10)
pca.fit(XtestSep)
Xtest_pca=pca.transform(XtestSep)
print("Xtest shape after PCA:", Xtest_pca.shape)

print('Variance retained corresponds to the test data after reducing with PCA:')
print(pca.explained_variance_ratio_.sum())

#Prediction of test data with the best KNN Regression parameters
powerTestoutput=regression_best_knn.predict(Xtest_pca)
print("Final output of the test data (powerTestInput):")
print(powerTestoutput[:5])

Xtest shape (15, 115)
Xtest shape after reshape (115, 15)
[[3336.  2820.2 2661.3 3459.  3529. ]
 [3195.  2539.8 2583.3 3195.  3445. ]
 [3329.  2774.  2693.  3446.  3446. ]
 [3272.  2322.5 2552.4 3272.  3459. ]
 [2933.  2449.6 2394.  3060.  3375. ]]
Xtest shape after PCA: (115, 10)
Variance retained corresponds to the test data after reducing with PCA:
0.991183711841375
Final output of the test data (powerTestInput):
[[2168.375]
 [2451.25 ]
 [2181.125]
 [2698.875]
 [2428.   ]]


In [13]:
from sklearn import tree
from sklearn.metrics import mean_squared_error

print(Xtrain.shape) 
print(ytrain.shape) 
regression_model = tree.DecisionTreeClassifier()

# Train the model using the training sets
regression_model.fit(Xtrain, ytrain)

# Make predictions using the testing set
ypred = regression_model.predict(Xtest)



(675, 10)
(675, 1)


In [14]:
print(Xtrain.shape) 
print(ytrain.shape) 
print(Xtest.shape)  
print(ytest.shape) 
print(ypred.shape)

(675, 10)
(675, 1)
(169, 10)
(169, 1)
(169,)


In [15]:
#Calculating the errors
from sklearn import metrics
print("mean_squared_error:",metrics.mean_squared_error(ytest,ypred))
print("mean_absolute_error:",metrics.mean_absolute_error(ytest,ypred))
print("root_mean_squared_error:",np.sqrt(metrics.mean_squared_error(ytest,ypred)))


mean_squared_error: 20969.13017751479
mean_absolute_error: 38750.94082840237
root_mean_squared_error: 144.8072172839282


In [16]:
from sklearn.model_selection import GridSearchCV

#Hyper Parameters Set
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [8, 9, 10, 11],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'criterion': ['gini', 'entropy']
}

#Making models with hyper parameters sets
model = GridSearchCV(regression_model, param_grid = param_grid, 
                          cv = 3, n_jobs=1, verbose = 2)

#Learning
model.fit(Xtrain,ytrain)

#The best hyper parameters set
print("Best Hyper Parameters with GridSearchCV:\n",model.best_params_)

#Prediction
ypred=model.predict(Xtest)
print(ytest[:5])
print(ypred[:5])

# Using Cross Validation To Evaluate Model
CV_Result = cross_val_score(model, Xtrain, ytrain, cv=10, n_jobs=-1)
print(); print('cross_val_Result:',CV_Result)
print(); print('cross_val_Mean:', CV_Result.mean())
print(); print('cross_val_Std:', CV_Result.std())    

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=8 
[CV]  criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=8 
[CV]  criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=8 
[CV]  criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=10 
[CV]  criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=10, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=10 
[CV]  criterion=gini, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=10, total=   0.0s
[CV] criter

[CV]  criterion=gini, max_depth=8, max_features=3, min_samples_leaf=4, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=3, min_samples_leaf=4, min_samples_split=12 
[CV]  criterion=gini, max_depth=8, max_features=3, min_samples_leaf=4, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=3, min_samples_leaf=4, min_samples_split=12 
[CV]  criterion=gini, max_depth=8, max_features=3, min_samples_leaf=4, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8 
[CV]  criterion=gini, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8 
[CV]  criterion=gini, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8 
[CV]  crit

[CV]  criterion=gini, max_depth=9, max_features=3, min_samples_leaf=3, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=10 
[CV]  criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=10, total=   0.0s
[CV] criterion=gini, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=10 
[CV]  crite

[CV]  criterion=gini, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=10, total=   0.0s
[CV] criterion=gini, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12 
[CV]  criterion=gini, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12 
[CV]  criterion=gini, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12 
[CV]  criterion=gini, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=10, max_features=3, min_samples_leaf=3, min_samples_split=8 
[CV]  criterion=gini, max_depth=10, max_features=3, min_samples_leaf=3, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=10, max_features=3, min_samples_leaf=3, min_samples_split=8

[CV]  criterion=gini, max_depth=11, max_features=2, min_samples_leaf=4, min_samples_split=12, total=   0.0s
[CV] criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=8 
[CV]  criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=8 
[CV]  criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=8 
[CV]  criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=8, total=   0.0s
[CV] criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=10, total=   0.0s
[CV] criterion=gini, max_depth=11, max_features=2, min_samples_leaf=5, min_samples_split=10 
[

[CV]  criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=10 
[CV]  criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=12 
[CV]  criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=12 
[CV]  criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=12 
[CV]  criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=2, min_samples_leaf=

[CV]  criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8 
[CV]  criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8 
[CV]  criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=8, max_features=3, min_samples_leaf=5, mi

[CV]  criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=3, min_samples_split=12 
[CV]  criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=3, min_samples_split=12 
[CV]  criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=4, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=9, max_features=3, min_samples_leaf=4, m

[CV]  criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12 
[CV]  criterion=entropy, max_depth=10, max_features=2, min_samples_leaf=5, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=10, max_features=2, min_samp

[CV]  criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=3, min_samples_split=12 
[CV]  criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=3, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=4, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=4, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=4, min_samples_split=8 
[CV]  criterion=entropy, max_depth=11, max_features=2, min_samples_leaf=4, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=2, min_samples_l

[CV]  criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=8, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=10 
[CV]  criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=10, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=12 
[CV]  criterion=entropy, max_depth=11, max_features=3, min_samples_leaf=5, min_samples_split=12, total=   0.1s
[CV] criterion=entropy, max_depth=11, max_features=3, min_samp

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:   24.6s finished


ValueError: n_splits=10 cannot be greater than the number of members in each class.

In [17]:
# Training Decision Tree Regression with the best hyper parameters found through cross-validation
regression_best_dt = tree.DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=2, min_samples_leaf=3)

regression_best_dt.fit(Xtrain,ytrain)

#Prediction
ypred=regression_best_dt.predict(Xtest)

from sklearn.metrics import mean_squared_error, mean_absolute_error

# calculate mean_squared_error
mean = mean_squared_error(ytest,ypred)
print("mean_squared_error:",format(mean))
# calculate mean_absolute_error
absolute = mean_absolute_error(ytest,ypred)
print("mean_absolute_error:",format(absolute))
rmse = np.sqrt(mean_squared_error(ytest,ypred))
print("root_mean_sqrt_error:",format(rmse))

mean_squared_error: 24545.71597633136
mean_absolute_error: 53213.60946745562
root_mean_sqrt_error: 156.6707246946007


In [18]:
# using the test data powerTestInput correspond to a test set.
XtestSep = mat["powerTestInput"]
print("Xtest shape",XtestSep.shape)
XtestSep = np.transpose(Xtest)
print("Xtest shape after reshape",XtestSep.shape)
print(Xtest[:5,:5])

# Xtrain = scaler.transform(Xtrain)
from sklearn.decomposition import PCA
pca =PCA(n_components=10)
pca.fit(XtestSep)
Xtest_pca=pca.transform(XtestSep)
print("Xtest shape after PCA:", Xtest_pca.shape)

print('Variance retained corresponds to the test data after reducing with PCA:')
print(pca.explained_variance_ratio_.sum())

#Prediction of test data with the best KNN Regression parameters
powerTestoutput=regression_best_dt.predict(Xtest_pca)
print("Final output of the test data (powerTestInput):")
print(powerTestoutput[:5])

Xtest shape (15, 115)
Xtest shape after reshape (10, 169)
[[ 3.0696777  -0.66438469  0.40252818 -0.6095858   0.64105257]
 [-2.95976734 -0.2972308   0.07852385  1.29929822  2.17532333]
 [ 5.44433427  0.55138435  0.12274693 -0.39431794  0.40240631]
 [ 1.31826811 -1.2356806   0.9732071   0.28153152 -0.55364157]
 [-2.57799862 -0.38110799 -0.51934324 -1.03969219  1.25509452]]
Xtest shape after PCA: (10, 10)
Variance retained corresponds to the test data after reducing with PCA:
1.0
Final output of the test data (powerTestInput):
[2752 1915 2536 1915 1915]


In [19]:
Final_Output = regression_best_knn.predict(Xtest_pca)
print(Final_Output)

[[3732.375]
 [1887.75 ]
 [2049.625]
 [1851.5  ]
 [1792.125]
 [1901.375]
 [1814.375]
 [1713.5  ]
 [1821.25 ]
 [1696.25 ]]
