In [109]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [110]:
def file_process(data_file, info=False):
    input_file = data_file
    df = pd.read_csv(input_file, header = 0)
    #df.dropna(inplace = True) 
    df.fillna(1.0/500.0) #the corruption rate for the cache failures is all set to be "NaN"
    df['corrupt_label']=df['corrupt_label'].str.replace('-','.')
    df['flow'] = df['submit_host']+'-'+df['execution_host']
    df['FM']=df['transfer_success']+df['checksum_success']
    
    df_failure = df[df.checksum_success==False]
    df_dummy = pd.get_dummies(df_failure[['submit_host', 'execution_host','flow','transfer_success', 'checksum_success','src_label','dst_label','bytes','corrupt_rate','corrupt_label']], prefix=['submit_host', 'execution_host','src_label','dst_label','flow'], columns=['submit_host', 'execution_host','src_label','dst_label','flow'])
    #df_dummy['bytes']=df_dummy['bytes'].str[0:-4]
    df_dummy['bytes']=pd.to_numeric(df_dummy['bytes'], errors='coerce')
    df_dummy['corrupt_rate']=pd.to_numeric(df_dummy['corrupt_rate'], errors='coerce')
    df_dummy = df_dummy.replace(np.nan, 0, regex=True)
    y_complete=df_dummy['corrupt_label']
    X_complete=df_dummy.drop(['corrupt_label'],axis=1)
    
    if info:
        original_headers = list(df.columns.values)
        print("Original shape:")
        print(original_headers)
        print("Original shape:"+str(df.shape))
        dummy_headers = list(df_dummy.columns.values)
        print("Encoded shape:")
        print(dummy_headers)
        print(str(df_dummy.shape))
    
    return df,df_dummy,X_complete,y_complete
    
    #return df,df_dummy

In [111]:
#input_file = "../data/exogeni/var/iris_results/01v2_02_20210115_0445AM/01v2_02_20210115_0445AM_full.csv"
input_file = "../data/exogeni/var2/iris_results/01v2_02_20210125_1336PM/01v2_02_20210125_1336PM_full.csv"
df_ori,df_dummy,X_complete, y_complete=file_process(input_file,True)
#df_ori,df_dummy=file_process(input_file,False)
y_complete=df_dummy['corrupt_label']
X_complete=df_dummy.drop(['corrupt_label'],axis=1)
df_training=[df_dummy]

training_dataset={"complete":{"X":X_complete,
                 "y":y_complete}}

Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(45291, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

  op=op_str, alt_op=unsupported[op_str]


In [112]:
#test_file = "../data/exogeni/var2/iris_results/01v2_02_20210125_1336PM/01v2_02_20210125_1336PM_full.csv"
test_file = "../data/exogeni/var/iris_results/01v2_02_20210115_0445AM/01v2_02_20210115_0445AM_full.csv"
df_t_ori,df_t_dummy,X_test, y_test=file_process(test_file,True)
#df_t_ori,df_t_dummy=file_process(test_file,True)
#y_test=df_t_dummy['corrupt_label']
#X_test=df_t_dummy.drop(['corrupt_label'],axis=1)

Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(44849, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

In [113]:
def missing_feature(X_complete, X_test):
    print("X_complete:"+str(len(X_complete.columns)))
    print("X_test:" + str(len(X_test.columns)))
    c_t_missing=[]
    for c_c in X_complete.columns:
        if c_c not in X_test.columns:
            #print(c_c+" c_c not in test!")
            c_t_missing.append(c_c)
            
    c_c_missing=[]
    for c_t in X_test.columns:
        if c_t not in X_complete.columns:
            #print(c_t+" c_t not in complete!")
            c_c_missing.append(c_t)

    for c_c in c_t_missing:
        #print(c_t+" c_t dropping!")
        X_test[c_c]='0'
        
    for c_t in c_c_missing:
        #print(c_t+" c_t dropping!")
        X_test=X_test.drop(columns=c_t)
    
    print("After imputation:"+str(len(X_test.columns)))
    
    return X_test

In [114]:
X_test = missing_feature(X_complete, X_test)
df_t_dummy = missing_feature(df_dummy, df_t_dummy)
df_t=[df_t_dummy]
testing_dataset={"testing":{"X":X_test,
                 "y":y_test}}

X_complete:55
X_test:52
After imputation:55
X_complete:56
X_test:53
After imputation:56


In [116]:
def accuracy(model, test_data, classes, class_label, k):
    if(k==0): return null,null
    num = classes.size
    correct_class={}
    correct=0
    for label in classes:
        label_pred=[]
        isCorrect=False
        test_data_1=test_data[test_data[class_label]==label]
        if test_data_1.shape[0]==0:
            print("No match")
            correct=correct+1
            correct_class[label]=1
            continue
        x_test_1=test_data_1.drop([class_label],axis=1)
        test_prob_1 = model.predict_proba(x_test_1)
        label_array=test_prob_1.mean(axis=0)
        #label_array=np.nanmean(np.where(test_rf_prob_1!=0,test_rf_prob_1,np.nan),0)
        label_index = label_array.argmax()
        label_index_sort = label_array.argsort()
        #label_pred[0] = classes[label_index]
        for j in range(0,k): 
            #print(f'j={j}, label={label_index}')
            if(j==0):
                label_pred.append(classes[label_index])
            else:
                label_pred.append(classes[label_index_sort[(-1)*j-1]])
            #print(j,label_pred[j],label)
            if(label==label_pred[j]):
                isCorrect=True
                break
               
        #print(label_pred)
        #print(isCorrect)
        if isCorrect:
            correct=correct+1
            correct_class[label]=1
        else:
            correct_class[label]=0
            print("wrong label:"+label)
            for j in range(0,k): 
                print("predicted label "+str(j)+":"+label_pred[j])
    return correct_class, correct, correct/num 

In [117]:
def train_dt(classifier, df, training_dataset,class_label):
    i=0
    for k, d in training_dataset.items():
        X=d["X"]
        y=d['y']
        clf = classifier
        clf_model=clf.fit(X,y)
        pred=clf.predict(X)
        balanced_accu=balanced_accuracy_score(y, pred)
        f1=f1_score(y, pred,average='weighted')
        print(str(k)+ "label_size:" + str(clf.classes_.size) + ":balanced_accu="+str(balanced_accu)+":F1-Score="+str(f1))
        for j in range(1,4):
            c,correct,accu=accuracy(clf_model, df[i], clf.classes_, class_label, j)
            print("Top-"+str(j)+" Accu="+str(accu)) 
            
    return clf_model

In [118]:
def test_dt(clf_model,df_t, testing_dataset,class_label):
    i=0
    for k, d in testing_dataset.items():
        y_t = d['y']
        X_t = d['X']
        pred=clf_model.predict(X_t)
        balanced_accu=balanced_accuracy_score(y_t, pred)
        f1=f1_score(y_t, pred,average='weighted')
        print(":balanced_accu="+str(balanced_accu)+":F1-Score="+str(f1))
        for j in range(1,4):
            c,correct,accu=accuracy(clf_model, df_t[i], clf_model.classes_,class_label,j)
            print("Top-"+str(j)+" Accu="+str(accu)) 
        i=i+1

In [119]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
import graphviz 
from sklearn.calibration import CalibratedClassifierCV

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import multilabel_confusion_matrix,balanced_accuracy_score,f1_score

# 1. Try the top-k classification with the corrupted flows

In [120]:
clf_model = train_dt(RandomForestClassifier(max_depth=20, random_state=0),df_training,training_dataset, 'corrupt_label')

completelabel_size:26:balanced_accu=0.5267422484650454:F1-Score=0.5385219099058715
wrong label:cenic.Link9 
predicted label 0:starlight.Link24 
wrong label:esnet.Link7 
predicted label 0:starlight.Link24 
wrong label:internet2.Link2 
predicted label 0:unl.router.n1.Link14 
wrong label:starlight.Link6 
predicted label 0:syr.compute.c0.Link26 
wrong label:starlight.Link7 
predicted label 0:esnet.Link22 
wrong label:starlight.Link9 
predicted label 0:syr.router.n2.Link24 
wrong label:ucsd.compute.c1.Link13 
predicted label 0:starlight.Link24 
wrong label:ucsd.router.n3.Link22 
predicted label 0:esnet.Link1 
wrong label:unl.compute.c1.Link16 
predicted label 0:unl.router.n1.Link14 
Top-1 Accu=0.6538461538461539
wrong label:cenic.Link9 
predicted label 0:starlight.Link24 
predicted label 1:uc.router.n0.Link12 
wrong label:starlight.Link6 
predicted label 0:syr.compute.c0.Link26 
predicted label 1:syr.router.n2.Link24 
wrong label:starlight.Link7 
predicted label 0:esnet.Link22 
predicted la

In [121]:
test_dt(clf_model,df_t,testing_dataset, 'corrupt_label')

:balanced_accu=0.3770600275569455:F1-Score=0.3462134573288588
wrong label:cenic.Link9 
predicted label 0:starlight.Link24 
No match




No match
No match
wrong label:starlight.Link6 
predicted label 0:syr.router.n2.Link24 
No match
No match
No match
wrong label:uc.compute.c1.Link4 
predicted label 0:uc.router.n0.Link12 
No match
wrong label:ucsd.compute.c1.Link13 
predicted label 0:ucsd.cache 
wrong label:ucsd.router.n3.Link22 
predicted label 0:internet2.Link14 
wrong label:unl.compute.c1.Link16 
predicted label 0:unl.router.n1.Link14 
Top-1 Accu=0.7692307692307693
wrong label:cenic.Link9 
predicted label 0:starlight.Link24 
predicted label 1:uc.router.n0.Link12 
No match
No match
No match
wrong label:starlight.Link6 
predicted label 0:syr.router.n2.Link24 
predicted label 1:syr.compute.c0.Link26 
No match
No match
No match
wrong label:uc.compute.c1.Link4 
predicted label 0:uc.router.n0.Link12 
predicted label 1:esnet.Link22 
No match
wrong label:ucsd.compute.c1.Link13 
predicted label 0:ucsd.cache 
predicted label 1:ucsd.compute.c0.Link23 
wrong label:ucsd.router.n3.Link22 
predicted label 0:internet2.Link14 
predict

# 2. Let's balance the data via oversampling

In [122]:
from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN,SMOTENC
from imblearn.combine import SMOTEENN, SMOTETomek

def train_over_sampling(classifier,df,training_dataset,class_label,test=False):
    i=0
    over_sampling = [RandomOverSampler(random_state=0),
        #SMOTE(random_state=0),
        #SMOTEENN(random_state=0),
        #SMOTETomek(random_state=0),
        #SMOTENC(categorical_features=[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43], random_state=0)

    ]
    for k, d in training_dataset.items():
        for s in range(0,len(over_sampling)):
            X=d["X"]
            y=d['y']
            print("sampling:"+str(s)+":"+str(over_sampling[s]))
            os = over_sampling[s]
            X_resampled, y_resampled = os.fit_resample(X, y)
            #clf = RandomForestClassifier(max_depth=20, random_state=0)
            clf = classifier
            clf_model=clf.fit(X_resampled, y_resampled)
            
            pred=clf.predict(X)
            balanced_accu=balanced_accuracy_score(y, pred)
            f1=f1_score(y, pred,average='weighted')
            print(str(k)+":balanced_accu="+str(balanced_accu)+":F1-Score="+str(f1))
            
            for j in range(1,4):
                if test:
                    c,correct,accu=accuracy(clf_model, df_t[i], clf.classes_,class_label,j)
                else:
                    c,correct,accu=accuracy(clf_model, df[i], clf.classes_,class_label,j)
                print("Top-"+str(j)+" Accu="+str(accu)) 
        i=i+1
        
    return clf_model

In [123]:
#df_ori,df_dummy=file_process(input_file,True)
#y_complete=df_dummy['corrupt_label']
#X_complete=df_dummy.drop(['corrupt_label'],axis=1)

df_ori,df_dummy,X_complete, y_complete=file_process(input_file,True)
df=[df_dummy]

training_dataset={"complete":{"X":X_complete,
                 "y":y_complete}}

clf_model = train_over_sampling(RandomForestClassifier(max_depth=20, random_state=0),df,training_dataset, 'corrupt_label')

  op=op_str, alt_op=unsupported[op_str]


Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(45291, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

In [124]:
test_dt(clf_model,df_t,testing_dataset, 'corrupt_label')

:balanced_accu=0.38526381956451944:F1-Score=0.28174397494575776
No match




No match
No match
wrong label:starlight.Link6 
predicted label 0:syr.router.n2.Link24 
No match
No match
No match
wrong label:uc.compute.c1.Link4 
predicted label 0:uc.router.n0.Link12 
No match
wrong label:ucsd.compute.c1.Link13 
predicted label 0:ucsd.compute.c0.Link23 
wrong label:ucsd.router.n3.Link22 
predicted label 0:ucsd.compute.c0.Link23 
Top-1 Accu=0.8461538461538461
No match
No match
No match
wrong label:starlight.Link6 
predicted label 0:syr.router.n2.Link24 
predicted label 1:internet2.Link14 
No match
No match
No match
No match
wrong label:ucsd.compute.c1.Link13 
predicted label 0:ucsd.compute.c0.Link23 
predicted label 1:ucsd.cache 
wrong label:ucsd.router.n3.Link22 
predicted label 0:ucsd.compute.c0.Link23 
predicted label 1:internet2.Link14 
Top-2 Accu=0.8846153846153846
No match
No match
No match
wrong label:starlight.Link6 
predicted label 0:syr.router.n2.Link24 
predicted label 1:internet2.Link14 
predicted label 2:syr.compute.c0.Link26 
No match
No match
No match
N

# 3. Try the coarser per-site classification

In [125]:
#count the number of labels in the data and the number of labels that actually caused errors.
def num_label(df, df_dummy, site):
    site_ori = df[site]
    counter_site = Counter(site_ori)
    print("original num_label:" + str(len(counter_site)))
    #for k,v in counter_site.items():
    #    print(k)
    
    site_dummy = df_dummy[site]
    counter_site_dummy = Counter(site_dummy)
    print("recorded num_label:" + str(len(counter_site_dummy)))

    site_name = df[site].str.split('.', n=1, expand = True)
    site_dummy_name = df[site].str.split('.', n=1, expand = True)
    
    return counter_site, counter_site_dummy,site_name,site_dummy_name

In [127]:
counter_site,counter_site_dummy,site_name,site_dummy_name=num_label(df_ori,df_dummy,'corrupt_label')
df_ori['site'] = site_name[0]
df_dummy['site'] = site_dummy_name[0]
df_dummy=df_dummy.drop(['corrupt_label'],axis=1)
y_complete=df_dummy['site']
X_complete=df_dummy.drop(['site'],axis=1)

df=[df_dummy]
    
training_dataset={"complete":{"X":X_complete,
                 "y":y_complete}}

site_model = train_dt(RandomForestClassifier(max_depth=20, random_state=0),df,training_dataset, 'site')

original num_label:49
recorded num_label:26
completelabel_size:8:balanced_accu=0.6093384531968332:F1-Score=0.6106753029121078
Top-1 Accu=1.0
Top-2 Accu=1.0
Top-3 Accu=1.0


In [130]:
counter_site_t,counter_site_t_dummy,site_t_name,site_t_dummy_name=num_label(df_t_ori,df_t_dummy,'corrupt_label')
df_t_ori['site'] = site_t_name[0]
df_t_dummy['site'] = site_t_dummy_name[0]
df_t_dummy=df_t_dummy.drop(['corrupt_label'],axis=1)
y_t=df_t_dummy['site']
X_t=df_t_dummy.drop(['site'],axis=1)

df_t=[df_t_dummy]
    
testing_dataset={"complete":{"X":X_t,
                 "y":y_t}}
test_dt(site_model,df_t,testing_dataset, 'site')

original num_label:49
recorded num_label:23
:balanced_accu=0.5880701023451244:F1-Score=0.4487391398386769
wrong label:internet2
predicted label 0:uc
Top-1 Accu=0.875
Top-2 Accu=1.0
Top-3 Accu=1.0


# 4. try the probability based model

In [131]:
def generate_prob_dict(df, target_name):
    # summarize the class distribution
    #target = df.values[:,-5]
    target = df[target_name]
    counter = Counter(target)
    label_prob_dict={}
    for k,v in counter.items():
        per = v / len(target) * 100
        #print('Class=%s, Count=%d, Percentage=%.3f%%' % (k, v, per))
        df_c=df[(df[target_name]==k)]
        target_flow=df_c['flow']
        counter_flow = Counter(target_flow)
        flow_prob_dict={}
        for i,j in counter_flow.items():
            #print(i+":"+str(j)+":"+str(len(target_flow)))
            per_flow = j / len(target_flow) * 100
            #print('flow=%s, Count=%d, Per__flow=%.3f%%' % (i, j, per_flow))
            df_flow=df_c[(df_c['flow']==i)]
            flow_count=df_flow['checksum_success'].value_counts(normalize=True)
            #print(flow_count.index.tolist())
            #print(flow_count.values.tolist())
            flow_count_dict=flow_count.to_dict()
            #print(flow_count_dict)
            if 0 in flow_count_dict:
                flow_prob_dict[i]=flow_count_dict[0]
            else:
                flow_prob_dict[i]=0
        label_prob_dict[k]=flow_prob_dict
    return label_prob_dict

In [152]:
df_ori,df_dummy,X_complete, y_complete=file_process(input_file,True)
label_prob_dict=generate_prob_dict(df_ori,"corrupt_label")
df_prob = pd.DataFrame.from_dict(label_prob_dict, orient='index')
df_prob = df_prob.fillna(0)
df_prob = df_prob[(df_prob.T !=0).any()] #drop rows with all zeros: no corruption labels
X=df_prob.to_numpy()
X=np.nan_to_num(X)
#print(X)
y=df_prob.index
#print(y.shape)

  op=op_str, alt_op=unsupported[op_str]


Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(45291, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

In [153]:
clf_prob = RandomForestClassifier(max_depth=20, random_state=0)
clf_prob.fit(X,y)
clf_prob.score(X,y)

1.0

In [154]:
df_t_ori,df_t_dummy,X_t_complete, y_t_complete=file_process(test_file,True)
df_t_ori = missing_feature(df_ori, df_t_ori)
df_t_dummy = missing_feature(df_dummy, df_t_dummy)
label_prob_dict_test=generate_prob_dict(df_t_ori,"corrupt_label")
df_prob_test = pd.DataFrame.from_dict(label_prob_dict_test, orient='index')
df_prob_test = df_prob_test.fillna(0)
df_prob_test = df_prob[(df_prob_test.T !=0).any()] #drop rows with all zeros: no corruption labels
X_t=df_prob_test.to_numpy()
print(X_t.shape)
X_t=np.nan_to_num(X_t)
print(X_t.shape)
y_t=df_prob_test.index
#print(y.shape)
clf_prob.score(X_t,y_t)

  op=op_str, alt_op=unsupported[op_str]


Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(44849, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

  import sys


1.0

In [162]:
clf_prob.predict([X_t[0]])

array(['unl.cache '], dtype=object)

In [163]:
y_t[0]

'unl.cache '

In [None]:
## not important data 

In [141]:
df_prob_test=df_prob[(df_prob['ucsd-submit-syr-compute-c0']!=0) | (df_prob['ucsd-submit-ucsd-compute-c1']!=0) | (df_prob['ucsd-submit-unl-compute-c0']!=0)]

In [142]:
X_test=df_prob_test.to_numpy()
X_test=np.nan_to_num(X_test)
print(X_test)
y_test=df_prob_test.index

[[0.         0.         0.19269103 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.06       0.         0.         0.15714286 0.
  0.45238095 0.28571429 0.         0.52380952 0.         0.3877551
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.03265306 0.         0.         0.         0.         0.
  0.         0.         0.         0.60714286 0.         0.
  0.21580547 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.13928571 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.12857143 0.         0.         0.         0.
  0.         0.06015038 0.         0.         0.         0.
  0.         0.         0.08571429 0.09243697 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.45454545 0.         0.57142857 0.
  0.         0.      

In [146]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [147]:
kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)

In [148]:
score=gpc_rbf_isotropic.score(X,y)
print(score)

0.2857142857142857


In [149]:
gpc_rbf_isotropic.score(X, y)

0.2857142857142857

In [161]:
df_c=df_ori[(df_ori['corrupt_label']=="syr-cache ")]
display(df_c)

Unnamed: 0,root_xwf_id,job_id,start_time,end_time,submit_host,submit_user,execution_host,execution_user,job_type,job_exit_code,...,checksum_success,actual_checksum,expected_checksum,scenario,corrupt_label,corrupt_start,corrupt_end,corrupt_rate,flow,FM


In [159]:
df_c=df_ori[(df_ori['checksum_success']==False)&(df_ori['execution_host']=='syr-compute-c0')&(df_ori['corrupt_label']=='cenic.Link12 ')]
print(len(df))
print(len(df_c))
#df_c=df[(df['corrupt_label']=="unl-compute-c1")]
display(df_c)

1
69


Unnamed: 0,root_xwf_id,job_id,start_time,end_time,submit_host,submit_user,execution_host,execution_user,job_type,job_exit_code,...,checksum_success,actual_checksum,expected_checksum,scenario,corrupt_label,corrupt_start,corrupt_end,corrupt_rate,flow,FM
26412,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000014,1610729088,1610729092,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,e4112749f170fc8ab108d93f2795aa1c415dc16745fb5e...,2416276f6d923db58f9a998346cfd386c4c33188b92461...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
26413,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000014,1610729088,1610729092,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,f637cd2bbbc74fffc19841c625956c60a248b74207b6a0...,66a1e5e51e526d106f1d9cd00a6c2cd05ba536c698b6c6...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
26414,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000014,1610729088,1610729092,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,16df662f7bc1813cdf1e0ffe17979da2c7e130b7379eaf...,0bab203b629d88b4ce760122fddeecc39879f31389e1a7...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
26415,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000014,1610729088,1610729092,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,aa889a05a5c9c0d56373a3053eb3ede5aa11f1e865195b...,bf239774fe1bcf1895e2c3c550c1f4f0be5edc223ebaee...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
26416,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000014,1610729088,1610729092,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,993b0c4e7e46085567af2f4f7762931bda5b2fed111e63...,096a0681b644e28d0082f26ac153459ed15484ba26201f...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27645,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000029,1610730835,1610730839,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,7fb8b57ff5accc24887b567f4b1c9ff735b57abd4c9b4f...,7902fe2217fc7ac847e40d8b7fdec6bf820520918be642...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
27655,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000009,1610730854,1610730858,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,1b15d9fe9b2e43e9027450d154e8c45476c9a7b64fc591...,66a1e5e51e526d106f1d9cd00a6c2cd05ba536c698b6c6...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
27656,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000009,1610730854,1610730858,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,10bac0ab7f34d814cff569c059098e3a331746d7c327b5...,0bab203b629d88b4ce760122fddeecc39879f31389e1a7...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True
27657,fb62a7b5-6065-47db-87b7-4911e6a6c8db,job_sh_ID0000009,1610730854,1610730858,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,...,False,e51caf24845513a3c946af5623a6ebecc5f4ec586338bd...,bf239774fe1bcf1895e2c3c550c1f4f0be5edc223ebaee...,bypass,cenic.Link12,1.610729e+09,1.610731e+09,0.02,uc-submit-syr-compute-c0,True


In [151]:
gpc_rbf_isotropic.classes_

array(['cenic.Link0 ', 'cenic.Link1 ', 'cenic.Link12 ', 'cenic.Link9 ',
       'esnet.Link2 ', 'esnet.Link22 ', 'internet2.Link0 ',
       'internet2.Link14 ', 'internet2.Link6 ', 'starlight.Link24 ',
       'starlight.Link6 ', 'syr.compute.c0.Link26 ',
       'syr.router.n2.Link24 ', 'uc.compute.c0.Link3 ',
       'uc.compute.c1.Link4 ', 'uc.router.n0.Link12 ',
       'ucsd.compute.c0.Link23 ', 'ucsd.compute.c1.Link13 ',
       'ucsd.router.n3.Link22 ', 'unl.compute.c1.Link16 ',
       'unl.router.n1.Link14 '], dtype=object)