## Project 1 - Task II

In [1]:
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from keras.layers.core import Dense, Flatten, Dropout
from keras.layers import LayerNormalization

2022-11-26 00:53:09.727061: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
#Importing results from Task I
# Executes the Task I notebook inside this kernel.
# NOTE(review): names used below but never defined in this notebook
# (X_KV_new, X_VoD_new, Y_KV, Y_VoD and the np / pd / plt / sns aliases)
# presumably come from this run -- confirm against Task_I.ipynb.
%run Task_I.ipynb

### Task 2.1 
Model Training: Train three models $M_1$, $M_2$, $M_3$ on the training set (produced in Task I-3) using the methods linear regression, random forest regression, and neural network regression.

In [None]:
# M1 (Task 2.1): least-squares linear model with an intercept, one per target column.
reg_KV = LinearRegression(fit_intercept=True).fit(
    X=X_KV_new,
    y=Y_KV['ReadsAvg'],
)
reg_VoD = LinearRegression(fit_intercept=True).fit(
    X=X_VoD_new,
    y=Y_VoD['DispFrames'],
)

In [None]:
# Random-forest hyperparameter search for dataset 1 (target 'ReadsAvg').
# Candidates: 1-49 trees and a maximum tree depth of 1-9; 20 random
# combinations are evaluated by RandomizedSearchCV.
dist_rand_forest = {
    'n_estimators': range(1, 50),
    'max_depth': range(1, 10),
}
rand_forest = RandomForestRegressor()

rand_forest_hyp_KV = RandomizedSearchCV(
    estimator=rand_forest,
    param_distributions=dist_rand_forest,
    n_iter=20,
)
hyp_KV = rand_forest_hyp_KV.fit(X_KV_new, Y_KV['ReadsAvg'])
# Last expression of the cell: displays the best combination found.
hyp_KV.best_params_

In [None]:
# Same random search as for dataset 1, now for dataset 2 (target 'DispFrames').
# Reuses the estimator and the search space defined in the previous cell.
rand_forest_hyp_VoD = RandomizedSearchCV(
    estimator=rand_forest,
    param_distributions=dist_rand_forest,
    n_iter=20,
)
hyp_VoD = rand_forest_hyp_VoD.fit(X_VoD_new, Y_VoD['DispFrames'])
hyp_VoD.best_params_

In [None]:
# M2 (Task 2.1): random forests with hard-coded hyperparameters
# (recorded by the authors as the best values found by the search).
rand_forest_KV = RandomForestRegressor(
    n_estimators=39,
    max_depth=8,
).fit(X_KV_new, Y_KV['ReadsAvg'])

rand_forest_VoD = RandomForestRegressor(
    n_estimators=45,
    max_depth=4,
).fit(X_VoD_new, Y_VoD['DispFrames'])

In [None]:
#Train NN using hyperparameters obtained from trial and error

#Define a base model:
def baseline_model():
    """Build and compile the feed-forward regressor used in Task 2.1.

    Architecture: 16 input features -> Dense(32, relu) -> Dense(16, relu)
    -> Dense(8, relu) -> Dense(1).  The model is compiled with MSE loss,
    the Adam optimizer (learning rate 1e-5) and an MSE metric.

    Note: ``baseline_model`` is defined again further down in this notebook
    with a different signature; whichever definition ran last is the one
    in effect.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Dense(units=32, input_shape=(16,), activation='relu'),
        Dense(units=16, activation='relu'),
        Dense(units=8, activation='relu'),
        Dense(1),
    ]
    model = Sequential(layers)

    optimizer = keras.optimizers.Adam(learning_rate=1e-5)
    model.compile(
        loss='mse',
        optimizer=optimizer,
        metrics=[tf.keras.metrics.MeanSquaredError()],
    )
    return model

# Early stopping on convergence: a run ends once the training loss ('loss')
# has not improved by at least 0.005 for 500 consecutive epochs. The callback
# only looks at the training loss, not at any validation data.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.005, patience=500)

# M3 for dataset 1 (target 'ReadsAvg').
NN_KV_ReadsAvg = KerasRegressor(model=baseline_model(), epochs=5000, batch_size=300, callbacks=[callback])
NN_KV_ReadsAvg.fit(X_KV_new, Y_KV['ReadsAvg'])

# M3 for dataset 2 (target 'DispFrames').
# Fix: train on X_VoD_new, the feature matrix every other VoD model in this
# notebook is fitted on. The original line referenced `features_VoD_new`,
# a name that appears nowhere else in the visible notebook.
NN_VoD = KerasRegressor(model=baseline_model(), epochs=5000, batch_size=300, verbose=0, callbacks=[callback])
NN_VoD.fit(X_VoD_new, Y_VoD['DispFrames'])

### Task 2.2
Train and test your models Mi with the so-called validation-set technique. This technique entails that you split the set of observations into two parts: the training set for computing the model Mi and the test set for evaluating the accuracy of Mi. From the complete set of observations, you select uniformly at random 70% of the observations to form the training set and then assign the remaining 30% to the test set.

In [4]:
# Validation-set split: 70% training / 30% test, fixed seed (random_state=42).

# Wrap the reduced feature matrices in DataFrames so that every split part
# carries a row index it can be sorted by.
X_KV_new = pd.DataFrame(X_KV_new)
X_VoD_new = pd.DataFrame(X_VoD_new)

# Each of the four parts returned by train_test_split is sorted by its own index.
# NOTE(review): features and targets stay row-aligned after sorting only if
# their indices order the rows identically -- confirm against Task I.
X_KV_train, X_KV_test, Y_KV_train, Y_KV_test = (
    part.sort_index(axis=0)
    for part in train_test_split(X_KV_new, Y_KV['ReadsAvg'], test_size=0.3, random_state=42)
)

X_VoD_train, X_VoD_test, Y_VoD_train, Y_VoD_test = (
    part.sort_index(axis=0)
    for part in train_test_split(X_VoD_new, Y_VoD['DispFrames'], test_size=0.3, random_state=42)
)

In [5]:
# M1 (Task 2.2): fit on the training split, then predict the test split.

# Dataset 1 (target 'ReadsAvg')
reg_KV = LinearRegression(fit_intercept=True).fit(X=X_KV_train, y=Y_KV_train)
reg_KV_pred = reg_KV.predict(X=X_KV_test)

# Dataset 2 (target 'DispFrames')
reg_VoD = LinearRegression(fit_intercept=True).fit(X=X_VoD_train, y=Y_VoD_train)
reg_VoD_pred = reg_VoD.predict(X=X_VoD_test)

In [None]:
# Random-forest hyperparameter search on the training split of dataset 1.
# n_iter is not passed here, so RandomizedSearchCV uses its default number of draws.
rand_forest_hyp_KV = RandomizedSearchCV(
    estimator=rand_forest,
    param_distributions=dist_rand_forest,
)
hyp_KV = rand_forest_hyp_KV.fit(X_KV_train, Y_KV_train)
hyp_KV.best_params_

In [None]:
# Random-forest hyperparameter search on the training split of dataset 2.
rand_forest_hyp_VoD = RandomizedSearchCV(estimator = rand_forest, param_distributions = dist_rand_forest)
# Fix: fit the VoD search object created on the line above. The original
# called rand_forest_hyp_KV.fit(...), which re-fitted (and overwrote) the
# dataset-1 search and left rand_forest_hyp_VoD unfitted.
hyp_VoD = rand_forest_hyp_VoD.fit(X_VoD_train, Y_VoD_train)
hyp_VoD.best_params_

In [6]:
# M2 (Task 2.2): random forests with hard-coded hyperparameters, fitted on
# the training split and evaluated on the test split.

# Dataset 1 (target 'ReadsAvg')
rand_forest_KV = RandomForestRegressor(
    n_estimators=15,
    max_depth=6,
).fit(X_KV_train, Y_KV_train)
rand_forest_KV_pred = rand_forest_KV.predict(X_KV_test)

# Dataset 2 (target 'DispFrames')
rand_forest_VoD = RandomForestRegressor(
    n_estimators=16,
    max_depth=9,
).fit(X_VoD_train, Y_VoD_train)
rand_forest_VoD_pred = rand_forest_VoD.predict(X_VoD_test)

In [None]:
# Neural-network hyperparameter search for dataset 1, with scikit-learn's
# MLPRegressor as the estimator inside RandomizedSearchCV.
NN = MLPRegressor(solver='adam', tol=2, max_iter=100)

# Search space: every three-hidden-layer layout with 1-63 / 1-31 / 1-15 units,
# combined with three candidate activation functions.
hidden_layer_sizes = [
    (first, second, third)
    for first in range(1, 64)
    for second in range(1, 32)
    for third in range(1, 16)
]
activation = ['logistic', 'tanh', 'relu']
dist_NN = dict(activation=activation, hidden_layer_sizes=hidden_layer_sizes)

NN_KV = RandomizedSearchCV(estimator=NN, param_distributions=dist_NN)
hyp_KV = NN_KV.fit(X_KV_train, Y_KV_train)
hyp_KV.best_params_

# Result recorded by the authors:
#Output = {'hidden_layer_sizes': (63, 26, 14), 'activation': 'relu'}

In [None]:
# Neural-network hyperparameter search for dataset 2 (same estimator and
# search space as for dataset 1).
NN_VoD = RandomizedSearchCV(estimator = NN, param_distributions = dist_NN)
# Fix: fit and report the VoD search. The original fitted NN_KV on the VoD
# data (overwriting the dataset-1 search and leaving NN_VoD unfitted) and
# then displayed hyp_KV.best_params_ instead of hyp_VoD.best_params_.
hyp_VoD = NN_VoD.fit(X_VoD_train, Y_VoD_train)
hyp_VoD.best_params_

# Result recorded by the authors:
#Output = {'hidden_layer_sizes': (13, 14, 10), 'activation': 'logistic'}

In [None]:
#Train NN for target score ReadsAvg

# Hyperparameters come from the formal search plus manual trial and error.
def baseline_model(lr):
    """Build and compile the feed-forward regressor for the 'ReadsAvg' target.

    Architecture: 16 input features -> Dense(32, relu) -> Dense(16, relu)
    -> Dense(8, relu) -> Dense(1).  Compiled with MAE loss, an MAE metric
    and the Adam optimizer.

    Note: ``baseline_model`` is defined more than once in this notebook;
    whichever definition ran last is the one in effect.

    Args:
        lr: learning rate handed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Dense(units=32, input_shape=(16,), activation='relu'),
        Dense(units=16, activation='relu'),
        Dense(units=8, activation='relu'),
        Dense(1),
    ]
    model = Sequential(layers)

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(
        loss='mae',
        optimizer=optimizer,
        metrics=[tf.keras.metrics.MeanAbsoluteError()],
    )
    return model

# Early stopping: end training once the training loss ('loss') has not
# improved by at least 0.005 for 20 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.005,
    patience=20,
)

# M3 for dataset 1: fit on the training split (learning rate 1e-4), then
# predict the test split.
NN_KV_ReadsAvg = KerasRegressor(
    model=baseline_model(1e-4),
    epochs=5000,
    batch_size=300,
    callbacks=[callback],
).fit(X_KV_train, Y_KV_train)
NN_KV_ReadsAvg_pred = NN_KV_ReadsAvg.predict(X_KV_test)

In [None]:
#Train NN for target score DispFrames

def baseline_model(lr):
    """Build and compile the feed-forward regressor for the 'DispFrames' target.

    Architecture: 16 input features -> Dense(13, relu) -> Dense(14, tanh)
    -> Dense(10, tanh) -> Dense(1).  Compiled with MAE loss, an MAE metric
    and the Adam optimizer.

    Note: ``baseline_model`` is defined more than once in this notebook;
    whichever definition ran last is the one in effect.

    Args:
        lr: learning rate handed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Dense(units=13, input_shape=(16,), activation='relu'),
        Dense(units=14, activation='tanh'),
        Dense(units=10, activation='tanh'),
        Dense(1),
    ]
    model = Sequential(layers)

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(
        loss='mae',
        optimizer=optimizer,
        metrics=[tf.keras.metrics.MeanAbsoluteError()],
    )
    return model

# Early stopping: end training once the training loss ('loss') has not
# improved by at least 0.01 for 50 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.01,
    patience=50,
)

# M3 for dataset 2: fit on the training split (learning rate 1e-3), then
# predict the test split.
NN_VoD = KerasRegressor(
    model=baseline_model(1e-3),
    epochs=5000,
    batch_size=300,
    callbacks=[callback],
).fit(X_VoD_train, Y_VoD_train)
NN_VoD_pred = NN_VoD.predict(X_VoD_test)

### Task 2.3
Model Accuracy: Compute the estimation error of the models $M_i$ on the test set. We define the estimation error as the Normalized Mean Absolute Error (NMAE) $= \frac{1}{\bar{y}} \left( \frac{1}{m} \sum_{j=1}^{m} |y_j - \hat{y}_j| \right)$, whereby $\hat{y}_j$ is the model estimation for the measured service metric $y_j$, and $\bar{y}$ is the average of the observations $y_j$ of the test set. Note that $\hat{y}_j = M_i(x_j)$.

In [7]:
# Model 1 (linear regression).
# NMAE = mean absolute error on the test set, scaled by 1 / (mean of the test targets).

# Dataset 1
NMAE_KV_reg = (1 / Y_KV_test.mean()) * mean_absolute_error(
    y_true=Y_KV_test,
    y_pred=reg_KV_pred,
)
print(NMAE_KV_reg)

# Dataset 2
NMAE_VoD_reg = (1 / Y_VoD_test.mean()) * mean_absolute_error(
    y_true=Y_VoD_test,
    y_pred=reg_VoD_pred,
)
print(NMAE_VoD_reg)

0.019578755791714635
0.11128603229548305


In [8]:
# Model 2 (random forest).
# NMAE = mean absolute error on the test set, scaled by 1 / (mean of the test targets).

# Dataset 1
NMAE_KV_rand_forest = (1 / Y_KV_test.mean()) * mean_absolute_error(
    y_true=Y_KV_test,
    y_pred=rand_forest_KV_pred,
)
print(NMAE_KV_rand_forest)

# Dataset 2
NMAE_VoD_rand_forest = (1 / Y_VoD_test.mean()) * mean_absolute_error(
    y_true=Y_VoD_test,
    y_pred=rand_forest_VoD_pred,
)
print(NMAE_VoD_rand_forest)

0.017444648428940827
0.08205953150784659


In [None]:
# Model 3 (neural network).
# NMAE = mean absolute error on the test set, scaled by 1 / (mean of the test targets).

# Dataset 1
NMAE_KV_NN = (1 / Y_KV_test.mean()) * mean_absolute_error(
    y_true=Y_KV_test,
    y_pred=NN_KV_ReadsAvg_pred,
)
print(NMAE_KV_NN)

# Dataset 2
NMAE_VoD_NN = (1 / Y_VoD_test.mean()) * mean_absolute_error(
    y_true=Y_VoD_test,
    y_pred=NN_VoD_pred,
)
print(NMAE_VoD_NN)

### Task 2.5
As a baseline for $M_i$, use a naïve method which relies on Y values only. For each $x \in X$ it predicts a constant value $\bar{y}$ which is the mean of the samples $y_j$ in the training set. Compute $\bar{y}$ for the naïve method for the training set and compute the NMAE for the test set.


In [9]:
# Naive baseline: for every test sample, predict the mean target value of the
# training set, then score that constant prediction with the NMAE on the test set.

#For target score ReadsAvg of Dataset 1
# Constant prediction vector: one entry per test sample, all equal to the training mean.
mean_KV_ReadsAvg = np.full(shape=X_KV_test.shape[0], fill_value=np.mean(Y_KV_train))
NMAE_KV_ReadsAvg = (1/np.mean(Y_KV_test))*mean_absolute_error(Y_KV_test, mean_KV_ReadsAvg)
# Report the value, as the NMAE cells for the three models do.
print(NMAE_KV_ReadsAvg)

#For target score DispFrames of Dataset 2
mean_VoD_DispFrames = np.full(shape=X_VoD_test.shape[0], fill_value=np.mean(Y_VoD_train))
NMAE_VoD_DispFrames = (1/np.mean(Y_VoD_test))*mean_absolute_error(Y_VoD_test, mean_VoD_DispFrames)
print(NMAE_VoD_DispFrames)

### Task 2.6
Choose one method (either linear regression, random forest, or neural network) and produce a time series plot that shows both the measurements and the model estimations for the target on the test set. Show also the prediction of the naïve method (see Figure 1). For this plot choose a time interval with 1,000 samples of the test set.

In [None]:
#Time series for estimations of the method Linear Regression

def _plot_time_series(measured, estimated, naive, ylabel, ylim, n_samples=1000, save_as=None):
    """Plot measured values, model estimates and the naive estimate over time.

    The first ``n_samples`` entries of each input are selected by position
    (not by index label). If an input holds fewer entries, the window is
    shortened so that all three curves have the same length.

    Args:
        measured: measured target values of the test set (Series or array).
        estimated: model predictions for the same samples.
        naive: naive (constant) predictions for the same samples.
        ylabel: label of the y axis (name of the target).
        ylim: (lower, upper) limits of the y axis.
        n_samples: maximum number of leading test samples to draw.
        save_as: optional file name; when given, the figure is also saved.
    """
    curves = [np.asarray(values).ravel()[:n_samples] for values in (measured, estimated, naive)]
    n_shown = min(len(values) for values in curves)
    time_index = range(n_shown)

    # (legend label, line colour, line width) for the three curves, in order.
    styles = [
        ('Measured', 'r', 0.8),
        ('Estimated', 'blue', 1.2),
        ('Naive Estimation', 'black', 1.5),
    ]

    fig, ax = plt.subplots(figsize=(16, 3))
    for values, (label, color, width) in zip(curves, styles):
        ax.plot(time_index, values[:n_shown], linewidth=width, color=color, label=label)
    ax.set_ylim(ylim)
    ax.set_xlabel('Time Index', fontsize=16)
    ax.set_ylabel(ylabel, fontsize=16)
    ax.legend(fontsize=12)
    if save_as is not None:
        fig.savefig(save_as, bbox_inches='tight')
    plt.show()


# For ReadsAvg Values (dataset JNSM_KV_flashcrowd_1).
# Pass save_as='time_series_KV' to also write the figure to disk.
_plot_time_series(Y_KV_test, reg_KV_pred, mean_KV_ReadsAvg, ylabel='ReadsAvg', ylim=(50, 70))

#For DispFrames Values (dataset JNSM_VoD_flashcrowd_1).
# Pass save_as='time_series_VoD' to also write the figure to disk.
_plot_time_series(Y_VoD_test, reg_VoD_pred, mean_VoD_DispFrames, ylabel='DispFrames', ylim=(0, 30))

### Task 2.7
Produce a density plot and a histogram for the target values on the test set. Set the bin size of the histogram to 1 frame for Video Frame Rate or 1ms for Response Time.

In [None]:
# Histogram + density (KDE) of the target values on the test set.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the styling keywords (hist_kws / kde_kws) keep working unchanged.

def _unit_bin_edges(values):
    """Return histogram bin edges of width 1 that cover every entry of ``values``.

    The task asks for a bin size of 1 (1 ms for the response time, 1 frame
    for the frame rate), so the edges run over whole numbers from
    floor(min) to ceil(max); at least one bin is always produced.
    """
    lower = np.floor(np.min(values))
    upper = max(np.ceil(np.max(values)), lower + 1)
    return np.arange(lower, upper + 1)


#For target score ReadsAvg
hist_KV_ReadsAvg = sns.distplot(Y_KV_test, hist=True, kde=True,
             bins=_unit_bin_edges(Y_KV_test), color = 'lightblue',
             hist_kws={'edgecolor':'red', 'linewidth': 3},
             kde_kws={'color': 'black','linewidth': 2, 'label': 'ReadsAvg' })

plt.xlabel("ReadsAvg",fontsize=16)
plt.ylabel("Density",fontsize=16)
#plt.legend()
#plt.title("Density for values 'ReadsAvg' on the test set")
plt.xlim([50, 65])
#plt.savefig('hist_y_KV',bbox_inches='tight')
# plt.show() takes no figure/axes argument; the original passed the Axes object.
plt.show()


#For target score DispFrames
hist_VoD_DispFrames = sns.distplot(Y_VoD_test, hist=True, kde=True,
             bins=_unit_bin_edges(Y_VoD_test), color = 'lightblue',
             hist_kws={'edgecolor':'red', 'linewidth': 3},
             kde_kws={'color': 'black','linewidth': 2, 'label': 'DispFrames' })

plt.xlabel('DispFrames',fontsize=16)
plt.ylabel("Density",fontsize=16)
#plt.legend()
#plt.title("Density for values 'DispFrames' on the test set")
plt.xlim([10, 26])
#plt.savefig('hist_y_VoD',bbox_inches='tight')
plt.show()

### Task 2.8
Produce a density plot of the estimation errors $y_j - \hat{y}_j$ in the test set.

In [None]:
#For target values ReadsAvg

# Density of the absolute estimation errors |y_j - y_hat_j| on the test set,
# one figure per model.
# NOTE(review): the task text asks for the errors y_j - y_hat_j; this cell
# plots their absolute value.
#
# Targets and predictions are flattened to 1-D before subtracting. The
# original subtracted an (n, 1) column (pred.reshape(-1, 1)) from the (n,)
# target vector for the neural network, which broadcasts to an (n, n) matrix
# and cannot be stored in a single-column DataFrame.

reg_KV_ReadsAvg_err = pd.DataFrame(np.absolute(Y_KV_test.values - np.ravel(reg_KV_pred)), columns = ['ReadsAvg'])
reg_KV_ReadsAvg_err.plot(kind="density")
plt.title("Linear Regression",fontsize=26)
#plt.xlabel("Absolute error")
plt.ylabel("Density")
plt.legend(fontsize=16)
plt.xlim([-5, 30])
plt.savefig('Reg1', bbox_inches='tight')

rand_forest_KV_ReadsAvg_err = pd.DataFrame(np.absolute(Y_KV_test.values - np.ravel(rand_forest_KV_pred)), columns = ['ReadsAvg'])
rand_forest_KV_ReadsAvg_err.plot.density()
plt.title("Random Forest", fontsize=26)
#plt.xlabel("Absolute error")
plt.ylabel("Density")
plt.legend(fontsize=16)
plt.xlim([-5, 30])
plt.savefig('Rand1', bbox_inches='tight')

NN_KV_ReadsAvg_err = pd.DataFrame(np.absolute(Y_KV_test.values - np.ravel(NN_KV_ReadsAvg_pred)), columns = ['ReadsAvg'])
NN_KV_ReadsAvg_err.plot.density()
# Title typo fixed ("Neuran" -> "Neural").
plt.title("Estimation error using Neural Network")
plt.xlabel("Absolute error")
plt.ylabel("Density")
plt.legend()
plt.xlim([0, 30])

plt.show()

In [None]:
#For DispFrames

# Density of the absolute estimation errors |y_j - y_hat_j| on the test set,
# one figure per model.
# NOTE(review): the task text asks for the errors y_j - y_hat_j; this cell
# plots their absolute value.
#
# Targets and predictions are flattened to 1-D before subtracting. The
# original subtracted an (n, 1) column (pred.reshape(-1, 1)) from the (n,)
# target vector for the neural network, which broadcasts to an (n, n) matrix
# and cannot be stored in a single-column DataFrame.

reg_VoD_err = pd.DataFrame(np.absolute(Y_VoD_test.values - np.ravel(reg_VoD_pred)), columns = ['DispFrames'])
reg_VoD_err.plot.density()
plt.title("Linear Regression",fontsize=26)
#plt.xlabel("Absolute error")
plt.ylabel("Density")
plt.legend(fontsize=16)
plt.xlim([-5, 30])
plt.savefig('Reg3', bbox_inches='tight')
plt.show()

rand_forest_VoD_err = pd.DataFrame(np.absolute(Y_VoD_test.values - np.ravel(rand_forest_VoD_pred)), columns = ['DispFrames'])
rand_forest_VoD_err.plot.density()
plt.title("Random Forest", fontsize=26)
#plt.xlabel("Absolute error")
plt.ylabel("Density")
plt.legend(fontsize=16)
plt.xlim([-5, 30])
plt.savefig('Rand3', bbox_inches='tight')
plt.show()

NN_VoD_err = pd.DataFrame(np.absolute(Y_VoD_test.values - np.ravel(NN_VoD_pred)), columns = ['DispFrames'])
NN_VoD_err.plot.density()
# Title typo fixed ("Neuran" -> "Neural").
plt.title("Estimation error using Neural Network")
plt.xlabel("Absolute error")
plt.ylabel("Density")
plt.legend()
plt.xlim([-5, 30])
plt.show()