In [93]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, defaultdict
from sklearn.metrics import roc_curve, auc
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

In [29]:
# Load the backup log into a DataFrame. The path is relative, so the CSV is
# looked up in the current working directory of the kernel.
dataset = pd.read_csv('network_backup_dataset.csv')

In [64]:
# Display the column labels of the loaded frame.
dataset.columns

Index(['Week #', 'Day of Week', 'Backup Start Time - Hour of Day',
       'Work-Flow-ID', 'File Name', 'Size of Backup (GB)',
       'Backup Time (hour)'],
      dtype='object')

In [79]:
def getX_Y_from_dataset(dataset):
    """Split the backup dataset into a feature frame and a target vector.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains a 'Size of Backup (GB)' column.

    Returns
    -------
    X : pandas.DataFrame
        New frame holding every column of ``dataset`` except
        'Size of Backup (GB)'. ``dataset`` itself is not modified.
    Y : numpy.ndarray
        Values of the 'Size of Backup (GB)' column.

    Raises
    ------
    KeyError
        If ``dataset`` has no 'Size of Backup (GB)' column.
    """
    target_column = 'Size of Backup (GB)'
    # ``as_matrix()`` was removed in pandas 1.0; ``to_numpy()`` is the
    # supported way to get the underlying array.
    Y = dataset[target_column].to_numpy()
    X = dataset.drop(columns=[target_column])
    return X, Y

In [62]:
def one_hot_encode(dataset):
    """Return a numeric matrix with object-dtype columns one-hot encoded.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame mixing object-dtype (categorical) and non-object columns.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input row. The non-object columns come
        first, in their original order. They are followed by one indicator
        column per distinct value of each object column: object columns
        keep their original order, and values are sorted within each
        column.
    """
    categorical = dataset.select_dtypes(include=[object])
    numeric = dataset.select_dtypes(exclude=[object])
    # get_dummies emits the indicators grouped by source column with the
    # categories sorted, which is the layout the previous
    # LabelEncoder -> OneHotEncoder round trip produced. dtype=float keeps
    # the result floating point, as the dense OneHotEncoder output was.
    indicators = pd.get_dummies(
        categorical, columns=list(categorical.columns), dtype=float
    )
    # ``as_matrix()`` was removed in pandas 1.0; ``to_numpy()`` replaces it.
    return np.concatenate((numeric.to_numpy(), indicators.to_numpy()), axis=1)
    

In [96]:
def perform_10fold(X,y,regressor):
    """Run 10-fold cross-validation and report the RMSE of every fold.

    The folds are consecutive, unshuffled slices of the data. For each fold
    the regressor is fitted on the training part and evaluated on both the
    training and the held-out part; both RMSE values are printed.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support indexing with an integer index array
        (e.g. a NumPy array).
    y : array-like
        Target values, indexable the same way as ``X``.
    regressor : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold.

    Returns
    -------
    tr_l : list of float
        Training RMSE for each fold, in fold order.
    ts_l : list of float
        Test RMSE for each fold, in fold order.
    """
    # ``random_state`` only matters when ``shuffle=True``. Passing it with
    # the default ``shuffle=False`` raises ValueError on scikit-learn >= 0.24.
    # The splits were never shuffled, so dropping it keeps the same folds.
    kf = KFold(n_splits=10)
    tr_l = []
    ts_l = []
    for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print("Fold : ",i)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regressor.fit(X_train,y_train)
        # Compute each RMSE once and reuse it for both the log and the result.
        train_rmse = np.sqrt(mean_squared_error(y_train,regressor.predict(X_train)))
        test_rmse = np.sqrt(mean_squared_error(y_test,regressor.predict(X_test)))
        tr_l.append(train_rmse)
        ts_l.append(test_rmse)
        print("Training RMSE : ",train_rmse)
        print("Test RMSE : ",test_rmse)
    return tr_l,ts_l

In [88]:
# Separate the target column ('Size of Backup (GB)') from the feature columns.
X,Y= getX_Y_from_dataset(dataset)

In [89]:
# Replace the object-dtype feature columns by one-hot indicators.
# NOTE(review): X is rebound from a DataFrame to the ndarray returned by
# one_hot_encode, so re-running this cell on its own fails (an ndarray has no
# select_dtypes); re-run the previous cell first.
X =one_hot_encode(X)

In [90]:
# MLP regressor with hidden_layer_sizes=1; all other settings are defaults.
# NOTE(review): no random_state is passed, so the fitted weights and the
# RMSEs reported further down are presumably not reproducible between runs —
# confirm whether a fixed seed is wanted.
mlp_reg = MLPRegressor(hidden_layer_sizes=1)

In [91]:
# Fit once on the complete dataset. perform_10fold() calls fit() on this same
# object again for every training split.
mlp_reg.fit(X,Y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=1, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [None]:
# Cross-validate the MLP; the two lists hold the per-fold training and test
# RMSE values in fold order.
train_rmses,test_rmses=perform_10fold(X,Y,mlp_reg)

Fold :  1
Training RMSE :  0.104223629304
Test RMSE :  0.107425109179
Fold :  2
Training RMSE :  0.104509635921
Test RMSE :  0.100696507812
Fold :  3
Training RMSE :  0.104154273774
Test RMSE :  0.10752319127
Fold :  4
Training RMSE :  0.0719043625662
Test RMSE :  0.0676922993777
Fold :  5
Training RMSE :  0.0710226353073
Test RMSE :  0.0753779882999
Fold :  6
Training RMSE :  0.0719763600554
Test RMSE :  0.0682230049709
Fold :  7
Training RMSE :  0.103980760482
Test RMSE :  0.107579877228
Fold :  8
Training RMSE :  0.0720445684561
Test RMSE :  0.0685736620261
Fold :  9
Training RMSE :  0.107366275619
Test RMSE :  0.118365669163
Fold :  10
