In [193]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import numpy as np
fpath = './network_backup_dataset.csv'
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
import pickle
from tqdm import tqdm
import plotly as py
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)


# Question 1 

## Load Dataset 1

In [194]:
'''load data'''
backup = pd.read_csv(fpath)
#print(backup.shape) #(18588, 7)
#print(backup[:5])
'''grouping data'''
backup_group = backup.groupby([backup.columns[0], backup.columns[1], backup.columns[3]], sort=False).sum().iloc[:,1:2]
pre_gt = backup['Size of Backup (GB)']

In [195]:
workflowsizes = [[],[],[],[],[]]
length=0
for index, row in backup_group.iterrows():
    workflowsizes[int(index[2][-1])].append(row[0])

In [196]:

def draw_lines(y, n, title=''):
    data = []
    
    for i in range(5):
        trace = go.Scatter(x=np.arange(0,n+1,1), y=y[i][:n], mode='lines', 
                           line=dict(width=2), name=('work_flow_'+str(i)))
        data.append(trace)

    layout = go.Layout(title=title,
                       xaxis=dict(title='Day', dtick=7),
                       yaxis=dict(title='Backup sizes', ticks='outside'))

    fig = go.Figure(data=data, layout=layout)
    py.offline.iplot(fig)


In [197]:
draw_lines(workflowsizes, 20, 'Daily backup sizes for every workflows (20 days)')
draw_lines(workflowsizes, 105, 'Daily backup sizes for every workflows (105 days)')

# Question 2

## Converting categorical vars into numerical values

In [198]:
#converting categorical vars into numerical values
import calendar
days = dict(zip(calendar.day_name,range(1,8)))


backup['Day of Week'] = backup['Day of Week'].apply(lambda s: days[s])

#dropping back up time as asked in question
backup.drop('Backup Time (hour)', axis=1, inplace=True)

#fetching only the numeric part of each work_flow_id and file_name_number
f = lambda s: int(s[s.rindex('_')+1:])

backup['Work-Flow-ID'] = backup['Work-Flow-ID'].apply(f)
backup['File Name'] = backup['File Name'].apply(f)
#backup.head()

## Grouping by work flow ids

In [199]:
#grouping by work flow ids. 
#all the rows corresponding to workflow id '0' are in one group
#all rows correspoding to '1' are in another group etc...
#preparing for linear regression
    #-all independent vars except size of backup are put into Xs
    #-dependent variable- size of backup is put into ys
Xs, Ys = [], []
for key, value in backup.groupby('Work-Flow-ID').groups.items():
    group = backup.loc[value].drop('Work-Flow-ID', axis=1).reset_index(drop=True)
    
    Xs.append(group.drop('Size of Backup (GB)', axis=1))
    Ys.append(group[['Size of Backup (GB)']]) 


## Scalar encoding

In [200]:
scaler_encdata = backup # scaler encoding data
print(set(scaler_encdata['Week #']))
print(set(scaler_encdata['Backup Start Time - Hour of Day']))
print(set(scaler_encdata['Day of Week']))
print(set(scaler_encdata))

#print(set(scaler_encdata['Size of Backup (GB)']))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
{1, 5, 9, 13, 17, 21}
{1, 2, 3, 4, 5, 6, 7}
{'Backup Start Time - Hour of Day', 'Size of Backup (GB)', 'File Name', 'Day of Week', 'Work-Flow-ID', 'Week #'}


## Question 2 a)

In [201]:
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from math import sqrt


def linreg_model_2a(dataset, allpre_res): 
    train_RMSE = []
    test_RMSE = []

    kf = KFold(n_splits = 10, random_state = None, shuffle = False)
    # Create linear regression object
    regr = linear_model.LinearRegression()
    for train_index, test_index in kf.split(dataset):

        X_train, X_test = dataset[train_index], dataset[test_index]
        y_train, y_test = pre_gt[train_index], pre_gt[test_index]

        # Train the model using the training sets
        regr.fit(X_train, y_train)
        # Make predictions using the training set
        train_pred_y = regr.predict(X_train)
        # Make predictions using the testing set
        test_pred_y = regr.predict(X_test)
        
        for i in range(len(test_index)):
            allpre_res[test_index[i]] = test_pred_y[i]

   
        # RMSE in train set
        train_RMSE.append(mean_squared_error(y_train, train_pred_y))
        # RMSE in test set
        test_RMSE.append(mean_squared_error(y_test, test_pred_y))

    
    return sqrt(np.mean(train_RMSE)), sqrt(np.mean(test_RMSE))  



In [202]:
# basic linear model 
predictions = [None for i in range(len(backup))]
#print(len(predictions))
print("train RMSE, test RMSE",linreg_model_2a(scaler_encdata.drop(columns=["Size of Backup (GB)"]).values,predictions))


train RMSE, test RMSE (0.10358539364277801, 0.1036758476759903)


In [203]:
    x = np.arange(0, 1000,1)
    y1 = predictions 
    y2 = pre_gt  
    residual = [pre_gt_i - predictions_i for pre_gt_i, predictions_i in zip(pre_gt, predictions)]
    y3 = residual
    y1=y1
    y2=y2
    y3=y3
    # basic model plot
    # fitted value and ground truth

    trace0 = go.Scatter(
    x = x,
    y = y1,
    mode = 'markers',
    name = 'fitted value'
    )
    trace1 = go.Scatter(
    x = x,
    y = y2,
    mode = 'markers',
    name = 'ground truth'
    )

data = [trace0, trace1]
py.offline.iplot(data)
  # fitted value and residual
    trace2 = go.Scatter(
    x = x,
    y = y1,
    mode = 'markers',
    name = 'fitted value'
    )
    trace3 = go.Scatter(
    x = x,
    y = y3,
    mode = 'markers',
    name = 'residual'
    )

data2 = [trace2, trace3]

py.offline.iplot(data2)

In [204]:
#Linear regression with cross validation
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

def linearreg(model, X, y):
    kf = KFold(n_splits=10)
    rmse_train, rmse_test = 0, 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        model.fit(X_train, y_train)
        rmse_train += mean_squared_error(y_train, model.predict(X_train))
        rmse_test += mean_squared_error(y_test, model.predict(X_test))

    return sqrt(rmse_train / 10), sqrt(rmse_test / 10)

## Question 2(d)i

In [205]:
# Question 2(d)i: 
for i in range(len(Ys)):
    rmse_train, rmse_test = linearreg(LinearRegression(), Xs[i], Ys[i])
    print('work_flow_'+str(i)+'   rmse_train '+ str(rmse_train)+'    rmse_test  '+str(rmse_test))

work_flow_0   rmse_train 0.035835520779861095    rmse_test  0.035886970248931206
work_flow_1   rmse_train 0.14876603056260168    rmse_test  0.14891860201393803
work_flow_2   rmse_train 0.04290932063907724    rmse_test  0.0430669058478793
work_flow_3   rmse_train 0.0072438788738825345    rmse_test  0.007260894242099694
work_flow_4   rmse_train 0.08592193679327194    rmse_test  0.08599061411565447


## Question 2(d)ii

In [206]:
# Question 2(d)ii
# High degrees can cause overfitting
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

degrees = np.arange(2,10,1)
rmses_train, rmses_test = [], []
for i in range(len(Ys)):
    rmses_train.append([])
    rmses_test.append([])
    for degree in degrees:
        model = make_pipeline(PolynomialFeatures(degree, include_bias=False), LinearRegression())
        rmse_train, rmse_test = linearreg(model, Xs[i], Ys[i])
        rmses_train[i].append(rmse_train)
        rmses_test[i].append(rmse_test)

In [223]:

data = []
    
for i in range(len(rmses_train)):
    trace = go.Scatter(x=x, 
                       y=rmses_train[i], 
                       mode='lines', 
                    line=dict(width=2), 
                       name=('work_flow_'+str(i)))
    data.append(trace)

layout = go.Layout(title='RMSE_train VS Degree',
                       xaxis=dict(title='Degree', dtick=1),
                       yaxis=dict(title='rmse train'))

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)
  

In [210]:
data = []
    
for i in range(len(rmses_test)):
    trace = go.Scatter(x=x, 
                       y=rmses_train[i], 
                       mode='lines', 
                        line=dict(width=2), 
                       name=('work_flow_'+str(i)))
    data.append(trace)

layout = go.Layout(title='rmse test VS Degree',
                       xaxis=dict(title='Degree', dtick=1),
                       yaxis=dict(title='rmse test'))

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [211]:
rmses_test_sum = [[ sum(x) for x in zip(*rmses_test) ]]
#Plot avg test RMSE against degree 
data = []
    
for i in range(len(rmses_test_sum)):
        trace = go.Scatter(x=degrees, 
                           y=rmses_test_sum[i], 
                           mode='lines', 
                           line=dict(width=2), 
                           name=('work_flow_'+str(i)))
data.append(trace)

layout = go.Layout(title='RMSE_test_all VS Degree',
                       xaxis=dict(title='Degree', dtick=1),
                       yaxis=dict(title='RMSE_test_all'))
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [212]:
#getting ground truths and predictions 
X = backup.drop('Size of Backup (GB)', axis=1)
Y = backup[['Size of Backup (GB)']]
model = make_pipeline(PolynomialFeatures(degree=7, include_bias=False), LinearRegression())
kf = KFold(n_splits=10)
predictions, ground_truth = [], []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = Y.loc[train_index], Y.loc[test_index]
    model.fit(X_train, y_train)
    for n in model.predict(X_test).tolist():
        predictions.extend(n)
    for n in y_test.values.tolist():
        ground_truth.extend(n)

In [213]:
    x = np.arange(0, 1000,1)
    y1 = predictions 
    y2 = ground_truth  
    residual = [ground_truth_i - predictions_i for ground_truth_i, predictions_i in zip(ground_truth, predictions)]
    y3 = residual
    y1=y1[0:1000]
    y2=y2[0:1000]
    y3=y3[0:1000]
    # basic model plot
    # fitted value and ground truth

    trace0 = go.Scatter(
    x = x,
    y = y1,
    mode = 'markers',
    name = 'fitted value'
    )
    trace1 = go.Scatter(
    x = x,
    y = y2,
    mode = 'markers',
    name = 'ground truth'
    )

data = [trace0, trace1]
py.offline.iplot(data)
  # fitted value and residual
    trace2 = go.Scatter(
    x = x,
    y = y1,
    mode = 'markers',
    name = 'fitted value'
    )
    trace3 = go.Scatter(
    x = x,
    y = y3,
    mode = 'markers',
    name = 'residual'
    )

data2 = [trace2, trace3]

py.offline.iplot(data2)




## Question 2 e)

In [214]:
#question 2e
import warnings
warnings.filterwarnings("ignore")
rmse_train = []
rmse_test = []
rmse_plot_train = []
rmse_plot_test = []
rmse_gd = []
plot_gd_all = []
plot_test_all = []
min_test = 100
min_neigh = 0
folds=10
training_data = pd.DataFrame(backup).astype('float64')
training_label = np.asarray(backup['Size of Backup (GB)'].values, dtype=np.float)
for neighborsize in range(1, 30):
    for rd in range(folds):
        # polynomial_features = PolynomialFeatures(degree=6, include_bias=False)
        clf = KNeighborsRegressor(n_neighbors=neighborsize)
        X_train, X_test, y_train, y_test = train_test_split(training_data, training_label, train_size=0.9)
        pipeline = Pipeline([ ("KNN", clf)])
        pipeline.fit(X_train, y_train)
        train_predict = pipeline.predict(X_train)
        test_predict = pipeline.predict(X_test)
        # clf.fit(onehot_encoded_data, training_label)
        # print standard_data.shape
        # print training_label.shape
        rmse_train.append(np.sqrt(mean_squared_error(y_train, train_predict)))
        rmse_test.append(np.sqrt(mean_squared_error(y_test, test_predict)))
    print ('rmse_train: ', np.mean(rmse_train), 'rmse_test:', np.mean(rmse_test))
    if min_test > np.mean(rmse_test):
        min_test = np.mean(rmse_test)
        min_neigh = neighborsize
    rmse_plot_train.append(np.mean(rmse_train))
    rmse_plot_test.append(np.mean(rmse_test))
    rmse_train = []
    rmse_test = []
print( "min test rmse: ", min_test, "at neighbour size: ", min_neigh)

rmse_train:  0.0 rmse_test: 0.007913164028754307
rmse_train:  0.003928415286991359 rmse_test: 0.010745045772640238
rmse_train:  0.006912012955769267 rmse_test: 0.013569987722437588
rmse_train:  0.011433164265744053 rmse_test: 0.01991426507240188
rmse_train:  0.016369204105343132 rmse_test: 0.02382596431547275
rmse_train:  0.020441121786289887 rmse_test: 0.024756234432773208
rmse_train:  0.02187253655061742 rmse_test: 0.027517920598894373
rmse_train:  0.023621510261708386 rmse_test: 0.029410262337051185
rmse_train:  0.026133258420070964 rmse_test: 0.03147862620939981
rmse_train:  0.029339800442021953 rmse_test: 0.032411447010771643
rmse_train:  0.032891041075129135 rmse_test: 0.038347677183909495
rmse_train:  0.03604178774686018 rmse_test: 0.04076673296798818
rmse_train:  0.03944793457076546 rmse_test: 0.04676733984854359
rmse_train:  0.04156265428904547 rmse_test: 0.04475293504860893
rmse_train:  0.04372523045878933 rmse_test: 0.04521807858523969
rmse_train:  0.04445795387945354 rmse_t

In [217]:
import warnings
warnings.filterwarnings("ignore")
for rd in range(folds):
    # polynomial_features = PolynomialFeatures(degree=6, include_bias=False)
    clf = KNeighborsRegressor(n_neighbors=min_neigh)
    X_train, X_test, y_train, y_test = train_test_split(training_data, training_label, train_size=0.9)
    pipeline = Pipeline([("KNN", clf)])
    pipeline.fit(X_train, y_train)
    train_predict = pipeline.predict(X_train)
    test_predict = pipeline.predict(X_test)
    # clf.fit(onehot_encoded_data, training_label)
    # print standard_data.shape
    # print training_label.shape
    rmse_train.append(np.sqrt(mean_squared_error(y_train, train_predict)))
    rmse_test.append(np.sqrt(mean_squared_error(y_test, test_predict)))
    plot_gd_all.extend(test_predict)
    plot_test_all.extend(y_test)




In [218]:
    x = np.arange(0, 1000,1)
    y1 = plot_test_all 
    y2 = plot_gd_all  
    residual = [ground_truth_i - predictions_i for ground_truth_i, predictions_i in zip(plot_gd_all, plot_test_all)]
    y3 = residual
    y1=y1[0:1000]
    y2=y2[0:1000]
    y3=y3[0:1000]
    # basic model plot
    # fitted value and ground truth

    trace0 = go.Scatter(
    x = x,
    y = y1,
    mode = 'markers',
    name = 'fitted value'
    )
    trace1 = go.Scatter(
    x = x,
    y = y2,
    mode = 'markers',
    name = 'ground truth'
    )

data = [trace0, trace1]
py.offline.iplot(data)
  # fitted value and residual
    trace2 = go.Scatter(
    x = x,
    y = y1,
    mode = 'markers',
    name = 'fitted value'
    )
    trace3 = go.Scatter(
    x = x,
    y = y3,
    mode = 'markers',
    name = 'residual'
    )

data2 = [trace2, trace3]

py.offline.iplot(data2)
