In [317]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [318]:
def file_process(data_file, info=False):
    input_file = data_file
    df = pd.read_csv(input_file, header = 0)
    #df.dropna(inplace = True) 
    df.fillna(1.0/500.0) #the corruption rate for the cache failures is all set to be "NaN"
    df['corrupt_label']=df['corrupt_label'].str.replace('-','.').str.strip()
    df['flow'] = df['submit_host']+'-'+df['execution_host']
    df['FM']=df['transfer_success']+df['checksum_success']
    
    df_failure = df[df.checksum_success==False]
    df_dummy = pd.get_dummies(df_failure[['submit_host', 'execution_host','flow','transfer_success', 'checksum_success','src_label','dst_label','bytes','corrupt_rate','corrupt_label']], prefix=['submit_host', 'execution_host','src_label','dst_label','flow'], columns=['submit_host', 'execution_host','src_label','dst_label','flow'])
    #df_dummy['bytes']=df_dummy['bytes'].str[0:-4]
    df_dummy['bytes']=pd.to_numeric(df_dummy['bytes'], errors='coerce')
    df_dummy['corrupt_rate']=pd.to_numeric(df_dummy['corrupt_rate'], errors='coerce')
    df_dummy = df_dummy.replace(np.nan, 0, regex=True)
    y_complete=df_dummy['corrupt_label']
    X_complete=df_dummy.drop(['corrupt_label'],axis=1)
    
    if info:
        original_headers = list(df.columns.values)
        print("Original shape:")
        print(original_headers)
        print("Original shape:"+str(df.shape))
        dummy_headers = list(df_dummy.columns.values)
        print("Encoded shape:")
        print(dummy_headers)
        print(str(df_dummy.shape))
    
    return df,df_dummy,X_complete,y_complete
    
    #return df,df_dummy

In [319]:
input_file = "../data/exogeni/var/iris_results/01v2_02_20210115_0445AM/01v2_02_20210115_0445AM_full.csv"
#input_file = "../data/exogeni/var2/iris_results/01v2_02_20210125_1336PM/01v2_02_20210125_1336PM_full.csv"
df_ori,df_dummy,X_complete, y_complete=file_process(input_file,True)
#df_ori,df_dummy=file_process(input_file,False)
y_complete=df_dummy['corrupt_label']
X_complete=df_dummy.drop(['corrupt_label'],axis=1)
df_training=[df_dummy]

training_dataset={"complete":{"X":X_complete,
                 "y":y_complete}}

Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(44849, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_



In [320]:
test_file = "../data/exogeni/var2/iris_results/01v2_02_20210125_1336PM/01v2_02_20210125_1336PM_full.csv"
#test_file = "../data/demo-test/01v2_02_20210901_1241PM_full_uc-router-n0-link.csv"
#test_file = "../data/demo-test/01v2_02_20210831_1741PM_full_syr-cache.csv"
df_t_ori,df_t_dummy,X_test, y_test=file_process(test_file,True)
#df_t_ori,df_t_dummy=file_process(test_file,True)
#y_test=df_t_dummy['corrupt_label']
#X_test=df_t_dummy.drop(['corrupt_label'],axis=1)

Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(45291, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_



In [321]:
df_ori[(df_ori['corrupt_label']=="syr.cache") & (df_ori['checksum_success']==False) ]
#df_t_ori

Unnamed: 0,root_xwf_id,job_id,start_time,end_time,submit_host,submit_user,execution_host,execution_user,job_type,job_exit_code,bytes,lfn,src_label,src_url,src_proto_host,dst_label,dst_url,dst_proto_host,transfer_success,checksum_success,actual_checksum,expected_checksum,scenario,corrupt_label,corrupt_start,corrupt_end,corrupt_rate,flow,FM
1698,c7b37f70-b371-439b-a290-fb9e11d52076,job_sh_ID0000010,1610689063,1610689067,ucsd-submit,ericafu,syr-compute-c0,ericafu,compute,256,1573048.0,Ulysses_by_James_Joyce.txt,ucsd,http://ucsd-staging.data-plane/~ericafu/inputs...,http://ucsd-staging.data-plane,syr,file:///var/lib/condor/execute/dir_13183/pegas...,file://,True,False,2c7792b2f9502a9e8b84d51b8f12071dd94f38cb8528af...,096a0681b644e28d0082f26ac153459ed15484ba26201f...,bypass,syr.cache,1610689000.0,1610689000.0,,ucsd-submit-syr-compute-c0,True
1706,8170f2ba-2d26-458b-85e0-fc686c85f36b,job_sh_ID0000011,1610689084,1610689087,unl-submit,ericafu,syr-compute-c0,ericafu,compute,256,195257.0,Visual_Signaling_By_Signal_Corps_United_States...,unl,http://unl-staging.data-plane/~ericafu/inputs/...,http://unl-staging.data-plane,syr,file:///var/lib/condor/execute/dir_13423/pegas...,file://,True,False,b81b17321eb926c76204a3621c8266af089f06e2c2758e...,7902fe2217fc7ac847e40d8b7fdec6bf820520918be642...,bypass,syr.cache,1610689000.0,1610689000.0,,unl-submit-syr-compute-c0,True


In [322]:
def missing_feature(X_complete, X_test):
    print("X_complete:"+str(len(X_complete.columns)))
    print("X_test:" + str(len(X_test.columns)))
    c_t_missing=[]
    for c_c in X_complete.columns:
        if c_c not in X_test.columns:
            #print(c_c+" c_c not in test!")
            c_t_missing.append(c_c)
            
    c_c_missing=[]
    for c_t in X_test.columns:
        if c_t not in X_complete.columns:
            #print(c_t+" c_t not in complete!")
            c_c_missing.append(c_t)

    for c_c in c_t_missing:
        #print(c_t+" c_t dropping!")
        X_test[c_c]='0'
        
    for c_t in c_c_missing:
        #print(c_t+" c_t dropping!")
        X_test=X_test.drop(columns=c_t)
        
    X_complete, X_test = X_complete.align(X_test, join='inner', axis=1)
    
    print("After imputation:"+str(len(X_test.columns)))
    
    return X_test

In [323]:
X_test = missing_feature(X_complete, X_test)
df_t_dummy = missing_feature(df_dummy, df_t_dummy)

#df_dummy,df_t_dummy = df_dummy.align(df_t_dummy, join='inner', axis=1)
#X_complete, X_test = X_complete.align(X_test, join='inner', axis=1)

df_t=[df_t_dummy]
testing_dataset={"testing":{"X":X_test,
                 "y":y_test}}

X_complete:52
X_test:55
After imputation:52
X_complete:53
X_test:56
After imputation:53


In [324]:
def accuracy(model, test_data, classes, class_label, k):
    if(k==0): return null,null
    num = classes.size
    correct_class={}
    correct=0
    for label in classes:
        label_pred=[]
        isCorrect=False
        test_data_1=test_data[test_data[class_label]==label]
        if test_data_1.shape[0]==0:
            print("No match")
            correct=correct+1
            correct_class[label]=1
            continue
        x_test_1=test_data_1.drop([class_label],axis=1)
        test_prob_1 = model.predict_proba(x_test_1)
        label_array=test_prob_1.mean(axis=0)
        #label_array=np.nanmean(np.where(test_rf_prob_1!=0,test_rf_prob_1,np.nan),0)
        label_index = label_array.argmax()
        label_index_sort = label_array.argsort()
        #label_pred[0] = classes[label_index]
        for j in range(0,k): 
            #print(f'j={j}, label={label_index}')
            if(j==0):
                label_pred.append(classes[label_index])
            else:
                label_pred.append(classes[label_index_sort[(-1)*j-1]])
            #print(j,label_pred[j],label)
            if(label==label_pred[j]):
                isCorrect=True
                break
               
        #print(label_pred)
        #print(isCorrect)
        if isCorrect:
            print("predicted label:"+label)
            correct=correct+1
            correct_class[label]=1
        else:
            correct_class[label]=0
            print("wrong label:"+label)
            for j in range(0,k): 
                print("predicted label "+str(j)+":"+label_pred[j])
    return correct_class, correct, correct/num 

In [325]:
def train_dt(classifier, df, training_dataset,class_label):
    i=0
    for k, d in training_dataset.items():
        X=d["X"]
        y=d['y']
        clf = classifier
        clf_model=clf.fit(X,y)
        pred=clf.predict(X)
        balanced_accu=balanced_accuracy_score(y, pred)
        f1=f1_score(y, pred,average='weighted')
        print(str(k)+ "label_size:" + str(clf.classes_.size) + ":balanced_accu="+str(balanced_accu)+":F1-Score="+str(f1))
        for j in range(1,4):
            c,correct,accu=accuracy(clf_model, df[i], clf.classes_, class_label, j)
            print("Top-"+str(j)+" Accu="+str(accu)) 
            
    return clf_model

In [326]:
def test_dt(clf_model,df_t, testing_dataset,class_label):
    i=0
    for k, d in testing_dataset.items():
        y_t = d['y']
        X_t = d['X']
        pred=clf_model.predict(X_t)
        balanced_accu=balanced_accuracy_score(y_t, pred)
        #f1=f1_score(y_t, pred,average='weighted')
        #print(":balanced_accu="+str(balanced_accu)+":F1-Score="+str(f1))
        for j in range(1,4):
            c,correct,accu=accuracy(clf_model, df_t[i], clf_model.classes_,class_label,j)
            print("Top-"+str(j)+" Accu="+str(accu)) 
        i=i+1

In [327]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
#import graphviz 
from sklearn.calibration import CalibratedClassifierCV

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import multilabel_confusion_matrix,balanced_accuracy_score,f1_score

# 1. Try the top-k classification with the corrupted flows

In [328]:
clf_model = train_dt(RandomForestClassifier(max_depth=20, random_state=0),df_training,training_dataset, 'corrupt_label')

completelabel_size:23:balanced_accu=0.4909882019667372:F1-Score=0.5056110778454401
wrong label:cenic.Link0
predicted label 0:internet2.Link14
predicted label:cenic.Link1
predicted label:cenic.Link12
wrong label:cenic.Link9
predicted label 0:starlight.Link24
predicted label:esnet.Link2
predicted label:esnet.Link22
wrong label:internet2.Link0
predicted label 0:cenic.Link12
predicted label:internet2.Link14
wrong label:internet2.Link6
predicted label 0:unl.router.n1.Link14
predicted label:starlight.Link24
wrong label:starlight.Link6
predicted label 0:internet2.Link14
predicted label:syr.cache
predicted label:syr.compute.c0.Link26
predicted label:syr.router.n2.Link24
predicted label:uc.compute.c0.Link3
predicted label:uc.compute.c1.Link4
predicted label:uc.router.n0.Link12
predicted label:ucsd.compute.c0.Link23
predicted label:ucsd.compute.c1.Link13
wrong label:ucsd.router.n3.Link22
predicted label 0:internet2.Link14
predicted label:unl.cache
wrong label:unl.compute.c1.Link16
predicted labe

In [329]:
test_dt(clf_model,df_t,testing_dataset, 'corrupt_label')



No match
No match
predicted label:cenic.Link12
wrong label:cenic.Link9
predicted label 0:starlight.Link24
predicted label:esnet.Link2
predicted label:esnet.Link22
No match
predicted label:internet2.Link14
No match
predicted label:starlight.Link24
wrong label:starlight.Link6
predicted label 0:internet2.Link14
wrong label:syr.cache
predicted label 0:syr.compute.c0.Link26
predicted label:syr.compute.c0.Link26
predicted label:syr.router.n2.Link24
predicted label:uc.compute.c0.Link3
wrong label:uc.compute.c1.Link4
predicted label 0:uc.compute.c0.Link3
predicted label:uc.router.n0.Link12
predicted label:ucsd.compute.c0.Link23
wrong label:ucsd.compute.c1.Link13
predicted label 0:starlight.Link24
wrong label:ucsd.router.n3.Link22
predicted label 0:cenic.Link12
predicted label:unl.cache
wrong label:unl.compute.c1.Link16
predicted label 0:unl.router.n1.Link14
predicted label:unl.router.n1.Link14
Top-1 Accu=0.6956521739130435
No match
No match
predicted label:cenic.Link12
predicted label:cenic.Li

# 2. Let's balance the data via oversampling

In [330]:
from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN,SMOTENC
from imblearn.combine import SMOTEENN, SMOTETomek

def train_over_sampling(classifier,df,training_dataset,class_label,test=False):
    i=0
    over_sampling = [RandomOverSampler(random_state=0),
        #SMOTE(random_state=0),
        #SMOTEENN(random_state=0),
        #SMOTETomek(random_state=0),
        #SMOTENC(categorical_features=[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43], random_state=0)

    ]
    for k, d in training_dataset.items():
        for s in range(0,len(over_sampling)):
            X=d["X"]
            y=d['y']
            print("sampling:"+str(s)+":"+str(over_sampling[s]))
            os = over_sampling[s]
            X_resampled, y_resampled = os.fit_resample(X, y)
            #clf = RandomForestClassifier(max_depth=20, random_state=0)
            clf = classifier
            clf_model=clf.fit(X_resampled, y_resampled)
            
            pred=clf.predict(X)
            balanced_accu=balanced_accuracy_score(y, pred)
            f1=f1_score(y, pred,average='weighted')
            print(str(k)+":balanced_accu="+str(balanced_accu)+":F1-Score="+str(f1))
            
            for j in range(1,4):
                if test:
                    c,correct,accu=accuracy(clf_model, df_t[i], clf.classes_,class_label,j)
                else:
                    c,correct,accu=accuracy(clf_model, df[i], clf.classes_,class_label,j)
                print("Top-"+str(j)+" Accu="+str(accu)) 
        i=i+1
        
    return clf_model

In [331]:
#df_ori,df_dummy=file_process(input_file,True)
#y_complete=df_dummy['corrupt_label']
#X_complete=df_dummy.drop(['corrupt_label'],axis=1)

df_ori,df_dummy,X_complete, y_complete=file_process(input_file,True)
df=[df_dummy]

training_dataset={"complete":{"X":X_complete,
                 "y":y_complete}}

clf_model = train_over_sampling(RandomForestClassifier(max_depth=20, random_state=0),df,training_dataset, 'corrupt_label')



Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(44849, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

In [332]:
test_dt(clf_model,df_t,testing_dataset, 'corrupt_label')

No match
No match
predicted label:cenic.Link12
predicted label:cenic.Link9
predicted label:esnet.Link2
wrong label:esnet.Link22
predicted label 0:syr.compute.c0.Link26
No match
wrong label:internet2.Link14
predicted label 0:esnet.Link2
No match
predicted label:starlight.Link24
predicted label:starlight.Link6
predicted label:syr.cache
predicted label:syr.compute.c0.Link26
predicted label:syr.router.n2.Link24
predicted label:uc.compute.c0.Link3




wrong label:uc.compute.c1.Link4
predicted label 0:uc.compute.c0.Link3
predicted label:uc.router.n0.Link12
predicted label:ucsd.compute.c0.Link23
wrong label:ucsd.compute.c1.Link13
predicted label 0:starlight.Link24
wrong label:ucsd.router.n3.Link22
predicted label 0:cenic.Link12
predicted label:unl.cache
predicted label:unl.compute.c1.Link16
predicted label:unl.router.n1.Link14
Top-1 Accu=0.782608695652174
No match
No match
predicted label:cenic.Link12
predicted label:cenic.Link9
predicted label:esnet.Link2
predicted label:esnet.Link22
No match
predicted label:internet2.Link14
No match
predicted label:starlight.Link24
predicted label:starlight.Link6
predicted label:syr.cache
predicted label:syr.compute.c0.Link26
predicted label:syr.router.n2.Link24
predicted label:uc.compute.c0.Link3
predicted label:uc.compute.c1.Link4
predicted label:uc.router.n0.Link12
predicted label:ucsd.compute.c0.Link23
wrong label:ucsd.compute.c1.Link13
predicted label 0:starlight.Link24
predicted label 1:unl.ro

# 3. Try the coarser per-site classification

In [333]:
#count the number of labels in the data and the number of labels that actually caused errors.
def num_label(df, df_dummy, site):
    site_ori = df[site]
    counter_site = Counter(site_ori)
    print("original num_label:" + str(len(counter_site)))
    #for k,v in counter_site.items():
    #    print(k)
    
    site_dummy = df_dummy[site]
    counter_site_dummy = Counter(site_dummy)
    print("recorded num_label:" + str(len(counter_site_dummy)))

    site_name = df[site].str.split('.', n=1, expand = True)
    site_dummy_name = df[site].str.split('.', n=1, expand = True)
    
    return counter_site, counter_site_dummy,site_name,site_dummy_name

In [334]:
counter_site,counter_site_dummy,site_name,site_dummy_name=num_label(df_ori,df_dummy,'corrupt_label')
df_ori['site'] = site_name[0]
df_dummy['site'] = site_dummy_name[0]
df_dummy=df_dummy.drop(['corrupt_label'],axis=1)
y_complete=df_dummy['site']
X_complete=df_dummy.drop(['site'],axis=1)

df=[df_dummy]
    
training_dataset={"complete":{"X":X_complete,
                 "y":y_complete}}

site_model = train_dt(RandomForestClassifier(max_depth=20, random_state=0),df,training_dataset, 'site')

original num_label:49
recorded num_label:23
completelabel_size:8:balanced_accu=0.6288156540879895:F1-Score=0.6311149472628785
predicted label:cenic
predicted label:esnet
predicted label:internet2
predicted label:starlight
predicted label:syr
predicted label:uc
predicted label:ucsd
predicted label:unl
Top-1 Accu=1.0
predicted label:cenic
predicted label:esnet
predicted label:internet2
predicted label:starlight
predicted label:syr
predicted label:uc
predicted label:ucsd
predicted label:unl
Top-2 Accu=1.0
predicted label:cenic
predicted label:esnet
predicted label:internet2
predicted label:starlight
predicted label:syr
predicted label:uc
predicted label:ucsd
predicted label:unl
Top-3 Accu=1.0


In [335]:
counter_site_t,counter_site_t_dummy,site_t_name,site_t_dummy_name=num_label(df_t_ori,df_t_dummy,'corrupt_label')
df_t_ori['site'] = site_t_name[0]
df_t_dummy['site'] = site_t_dummy_name[0]
df_t_dummy=df_t_dummy.drop(['corrupt_label'],axis=1)
y_t=df_t_dummy['site']
X_t=df_t_dummy.drop(['site'],axis=1)

df_t=[df_t_dummy]
    
testing_dataset={"complete":{"X":X_t,
                 "y":y_t}}
test_dt(site_model,df_t,testing_dataset, 'site')

original num_label:49
recorded num_label:26
predicted label:cenic
wrong label:esnet
predicted label 0:cenic
predicted label:internet2
predicted label:starlight
predicted label:syr
predicted label:uc
wrong label:ucsd
predicted label 0:starlight
predicted label:unl
Top-1 Accu=0.75
predicted label:cenic
wrong label:esnet
predicted label 0:cenic
predicted label 1:starlight
predicted label:internet2
predicted label:starlight
predicted label:syr
predicted label:uc
predicted label:ucsd
predicted label:unl
Top-2 Accu=0.875
predicted label:cenic
predicted label:esnet
predicted label:internet2
predicted label:starlight
predicted label:syr
predicted label:uc
predicted label:ucsd
predicted label:unl
Top-3 Accu=1.0


# 4. try the probability based model

In [336]:
def prob_dict(df_c,group):
    target_flow=df_c[group]
    counter_flow = Counter(target_flow)
    flow_prob_dict={}
    for i,j in counter_flow.items():
    #print(i+":"+str(j)+":"+str(len(target_flow)))
        per_flow = j / len(target_flow) * 100
        #print('flow=%s, Count=%d, Per__flow=%.3f%%' % (i, j, per_flow))
        df_flow=df_c[(df_c[group]==i)]
        flow_count=df_flow['checksum_success'].value_counts(normalize=True)
        #print(flow_count.index.tolist())
        #print(flow_count.values.tolist())
        flow_count_dict=flow_count.to_dict()
        #print(flow_count_dict)
        i_group=group+"_"+i
        if 0 in flow_count_dict:
            flow_prob_dict[i_group]=flow_count_dict[0]
        else:
            flow_prob_dict[i_group]=0
        #print("{}:{}".format(i,flow_prob_dict[i]))
    return flow_prob_dict

In [337]:
def generate_prob_dict(df, target_name,groups):
    # summarize the class distribution
    #target = df.values[:,-5]
    target = df[target_name]
    counter = Counter(target)
    label_prob_dict={}
    for k,v in counter.items():
        per = v / len(target) * 100
        #print('Class=%s, Count=%d, Percentage=%.3f%%' % (k, v, per))
        df_c=df[(df[target_name]==k)]
        flow_prob_dict={}
        for group in groups:
            prob=prob_dict(df_c,group)
            flow_prob_dict={**flow_prob_dict,**prob}
        
        label_prob_dict[k]=flow_prob_dict
    return label_prob_dict

In [338]:
df_ori,df_dummy,X_complete, y_complete=file_process(input_file,True)
#label_prob_dict=generate_prob_dict(df_ori,"corrupt_label","src_label")

groups=["flow","dst_label"]
label_prob_dict=generate_prob_dict(df_ori,"corrupt_label",groups)

df_prob = pd.DataFrame.from_dict(label_prob_dict, orient='index')
df_prob = df_prob.fillna(0)
df_prob = df_prob[(df_prob.T !=0).any()] #drop rows with all zeros: no corruption labels
df_prob=df_prob.sort_index()

X=df_prob.to_numpy()
X=np.nan_to_num(X)
#print(X)
y=df_prob.index
#print(y.shape)



Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(44849, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

In [339]:
clf_prob = RandomForestClassifier(max_depth=20, random_state=0)
clf_prob.fit(X,y)
clf_prob.score(X,y)

1.0

In [340]:
df_t_ori,df_t_dummy,X_t_complete, y_t_complete=file_process(test_file,True)
df_t_ori = missing_feature(df_ori, df_t_ori)
df_t_dummy = missing_feature(df_dummy, df_t_dummy)

#label_prob_dict_test=generate_prob_dict(df_t_ori,"corrupt_label","src_label")

groups=["flow","dst_label"]

label_prob_dict_test=generate_prob_dict(df_t_ori,"corrupt_label",groups)

    
df_prob_test = pd.DataFrame.from_dict(label_prob_dict_test, orient='index')

df_prob_test = missing_feature(df_prob, df_prob_test)

df_prob_test = df_prob_test.fillna(0)
df_prob_test = df_prob[(df_prob_test.T !=0).any()] #drop rows with all zeros: no corruption labels

#df_prob,df_prob_test = df_prob.align(df_prob_test, join='inner', axis=1)
df_prob_test = df_prob_test.sort_index()

X_t=df_prob_test.to_numpy()
print(X_t.shape)
X_t=np.nan_to_num(X_t)
print(X_t.shape)
y_t=df_prob_test.index
#print(y.shape)
clf_prob.score(X_t,y_t)



Original shape:
['root_xwf_id', 'job_id', 'start_time', 'end_time', 'submit_host', 'submit_user', 'execution_host', 'execution_user', 'job_type', 'job_exit_code', 'bytes', 'lfn', 'src_label', 'src_url', 'src_proto_host', 'dst_label', 'dst_url', 'dst_proto_host', 'transfer_success', 'checksum_success', 'actual_checksum', 'expected_checksum', 'scenario', 'corrupt_label', 'corrupt_start', 'corrupt_end', 'corrupt_rate', 'flow', 'FM']
Original shape:(45291, 29)
Encoded shape:
['transfer_success', 'checksum_success', 'bytes', 'corrupt_rate', 'corrupt_label', 'submit_host_syr-submit', 'submit_host_uc-submit', 'submit_host_ucsd-submit', 'submit_host_unl-submit', 'execution_host_syr-compute-c0', 'execution_host_syr-compute-c1', 'execution_host_uc-compute-c0', 'execution_host_uc-compute-c1', 'execution_host_ucsd-compute-c0', 'execution_host_ucsd-compute-c1', 'execution_host_unl-compute-c0', 'execution_host_unl-compute-c1', 'src_label_syr', 'src_label_uc', 'src_label_ucsd', 'src_label_unl', 'dst_

  df_prob_test = df_prob[(df_prob_test.T !=0).any()] #drop rows with all zeros: no corruption labels


1.0

In [341]:
clf_prob.predict([X_t[0]])

array(['syr.router.n2.Link24'], dtype=object)

In [342]:
pd.set_option('max_columns', None)

In [343]:
df_t_ori[(df_t_ori['corrupt_label']=="syr.cache") & (df_t_ori['checksum_success']==False) ]

Unnamed: 0,root_xwf_id,job_id,start_time,end_time,submit_host,submit_user,execution_host,execution_user,job_type,job_exit_code,bytes,lfn,src_label,src_url,src_proto_host,dst_label,dst_url,dst_proto_host,transfer_success,checksum_success,actual_checksum,expected_checksum,scenario,corrupt_label,corrupt_start,corrupt_end,corrupt_rate,flow,FM
1974,40047a43-0b05-42c8-ba25-855e9a6884c2,job_sh_ID0000010,1611584594,1611584598,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,167529.0,Alices_Adventures_in_Wonderland_by_Lewis_Carro...,uc,http://uc-staging.data-plane/~ericafu/inputs/A...,http://uc-staging.data-plane,syr,file:///var/lib/condor/execute/dir_9432/pegasu...,file://,True,False,4bdbdd7fe7c838ec0b1dca97e80fa70aaa94e4ccbed832...,2416276f6d923db58f9a998346cfd386c4c33188b92461...,bypass,syr.cache,1611585000.0,1611585000.0,,uc-submit-syr-compute-c0,True
1987,275d2eca-eacc-41b4-88b8-da6e2e33f3c9,job_sh_ID0000030,1611584593,1611584598,ucsd-submit,ericafu,syr-compute-c1,ericafu,compute,256,114.0,job_sh,ucsd,http://ucsd-staging.data-plane/~ericafu/inputs...,http://ucsd-staging.data-plane,syr,file:///var/lib/condor/execute/dir_5673/pegasu...,file://,True,False,4def2b079252a1694f1dbfe5fff07d931df86493822156...,90ae2b216dd62dc6a081e863b957d948ff5ace9c63fd78...,bypass,syr.cache,1611585000.0,1611585000.0,,ucsd-submit-syr-compute-c1,True
1992,3abed86a-5841-43fa-b7df-2f8127b06ae1,job_sh_ID0000011,1611584615,1611584618,syr-submit,ericafu,syr-compute-c0,ericafu,compute,256,1573048.0,Ulysses_by_James_Joyce.txt,syr,http://syr-staging.data-plane/~ericafu/inputs/...,http://syr-staging.data-plane,syr,file:///var/lib/condor/execute/dir_9673/pegasu...,file://,True,False,2c7792b2f9502a9e8b84d51b8f12071dd94f38cb8528af...,096a0681b644e28d0082f26ac153459ed15484ba26201f...,bypass,syr.cache,1611585000.0,1611585000.0,,syr-submit-syr-compute-c0,True
1995,40047a43-0b05-42c8-ba25-855e9a6884c2,job_sh_ID0000011,1611584615,1611584618,uc-submit,ericafu,syr-compute-c1,ericafu,compute,256,167529.0,Alices_Adventures_in_Wonderland_by_Lewis_Carro...,uc,http://uc-staging.data-plane/~ericafu/inputs/A...,http://uc-staging.data-plane,syr,file:///var/lib/condor/execute/dir_5909/pegasu...,file://,True,False,4bdbdd7fe7c838ec0b1dca97e80fa70aaa94e4ccbed832...,2416276f6d923db58f9a998346cfd386c4c33188b92461...,bypass,syr.cache,1611585000.0,1611585000.0,,uc-submit-syr-compute-c1,True
2048,3abed86a-5841-43fa-b7df-2f8127b06ae1,job_sh_ID0000012,1611584674,1611584678,syr-submit,ericafu,syr-compute-c0,ericafu,compute,256,1573048.0,Ulysses_by_James_Joyce.txt,syr,http://syr-staging.data-plane/~ericafu/inputs/...,http://syr-staging.data-plane,syr,file:///var/lib/condor/execute/dir_10223/pegas...,file://,True,False,2c7792b2f9502a9e8b84d51b8f12071dd94f38cb8528af...,096a0681b644e28d0082f26ac153459ed15484ba26201f...,bypass,syr.cache,1611585000.0,1611585000.0,,syr-submit-syr-compute-c0,True
2054,e66885ea-a04e-4696-800c-3ab5e8771584,job_sh_ID0000013,1611584696,1611584699,unl-submit,ericafu,syr-compute-c1,ericafu,compute,256,597298.0,The_Adventures_of_Huckleberry_Finn_by_Mark_Twa...,unl,http://unl-staging.data-plane/~ericafu/inputs/...,http://unl-staging.data-plane,syr,file:///var/lib/condor/execute/dir_6466/pegasu...,file://,True,False,1beda0c696095e25922f93f08b029009a9f2aa556237ea...,bf239774fe1bcf1895e2c3c550c1f4f0be5edc223ebaee...,bypass,syr.cache,1611585000.0,1611585000.0,,unl-submit-syr-compute-c1,True
2058,40047a43-0b05-42c8-ba25-855e9a6884c2,job_sh_ID0000013,1611584696,1611584699,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,167529.0,Alices_Adventures_in_Wonderland_by_Lewis_Carro...,uc,http://uc-staging.data-plane/~ericafu/inputs/A...,http://uc-staging.data-plane,syr,file:///var/lib/condor/execute/dir_10460/pegas...,file://,True,False,4bdbdd7fe7c838ec0b1dca97e80fa70aaa94e4ccbed832...,2416276f6d923db58f9a998346cfd386c4c33188b92461...,bypass,syr.cache,1611585000.0,1611585000.0,,uc-submit-syr-compute-c0,True
2103,e66885ea-a04e-4696-800c-3ab5e8771584,job_sh_ID0000014,1611584754,1611584758,unl-submit,ericafu,syr-compute-c0,ericafu,compute,256,597298.0,The_Adventures_of_Huckleberry_Finn_by_Mark_Twa...,unl,http://unl-staging.data-plane/~ericafu/inputs/...,http://unl-staging.data-plane,syr,file:///var/lib/condor/execute/dir_10709/pegas...,file://,True,False,1beda0c696095e25922f93f08b029009a9f2aa556237ea...,bf239774fe1bcf1895e2c3c550c1f4f0be5edc223ebaee...,bypass,syr.cache,1611585000.0,1611585000.0,,unl-submit-syr-compute-c0,True
2117,e66885ea-a04e-4696-800c-3ab5e8771584,job_sh_ID0000015,1611584775,1611584779,unl-submit,ericafu,syr-compute-c1,ericafu,compute,256,597298.0,The_Adventures_of_Huckleberry_Finn_by_Mark_Twa...,unl,http://unl-staging.data-plane/~ericafu/inputs/...,http://unl-staging.data-plane,syr,file:///var/lib/condor/execute/dir_6718/pegasu...,file://,True,False,1beda0c696095e25922f93f08b029009a9f2aa556237ea...,bf239774fe1bcf1895e2c3c550c1f4f0be5edc223ebaee...,bypass,syr.cache,1611585000.0,1611585000.0,,unl-submit-syr-compute-c1,True
2121,40047a43-0b05-42c8-ba25-855e9a6884c2,job_sh_ID0000015,1611584775,1611584779,uc-submit,ericafu,syr-compute-c0,ericafu,compute,256,167529.0,Alices_Adventures_in_Wonderland_by_Lewis_Carro...,uc,http://uc-staging.data-plane/~ericafu/inputs/A...,http://uc-staging.data-plane,syr,file:///var/lib/condor/execute/dir_10943/pegas...,file://,True,False,4bdbdd7fe7c838ec0b1dca97e80fa70aaa94e4ccbed832...,2416276f6d923db58f9a998346cfd386c4c33188b92461...,bypass,syr.cache,1611585000.0,1611585000.0,,uc-submit-syr-compute-c0,True


In [344]:
df_prob_test

Unnamed: 0,flow_ucsd-submit-syr-compute-c1,flow_ucsd-submit-ucsd-compute-c0,flow_syr-submit-unl-compute-c1,flow_unl-submit-uc-compute-c1,flow_unl-submit-ucsd-compute-c1,flow_uc-submit-unl-compute-c0,flow_ucsd-submit-syr-compute-c0,flow_unl-submit-syr-compute-c1,dst_label_syr,dst_label_ucsd,dst_label_unl,dst_label_uc,flow_uc-submit-uc-compute-c0,flow_syr-submit-syr-compute-c1,flow_ucsd-submit-unl-compute-c0,flow_unl-submit-unl-compute-c1,flow_uc-submit-syr-compute-c0,flow_syr-submit-ucsd-compute-c0,flow_ucsd-submit-uc-compute-c1,flow_unl-submit-syr-compute-c0,flow_uc-submit-syr-compute-c1,flow_syr-submit-unl-compute-c0,flow_uc-submit-unl-compute-c1,flow_syr-submit-syr-compute-c0,flow_unl-submit-uc-compute-c0,flow_syr-submit-uc-compute-c0,flow_uc-submit-ucsd-compute-c0,flow_uc-submit-ucsd-compute-c1,flow_ucsd-submit-ucsd-compute-c1,flow_ucsd-submit-uc-compute-c0,flow_unl-submit-unl-compute-c0,flow_unl-submit-ucsd-compute-c0,flow_uc-submit-uc-compute-c1,flow_syr-submit-ucsd-compute-c1,flow_syr-submit-uc-compute-c1,flow_ucsd-submit-unl-compute-c1,flow_syr-submit-httpgeni-orca,dst_label_httpgeni-orca
syr.router.n2.Link24,0.555556,0.0,0.0,0.0,0.0,0.0,0.428571,0.66,0.332168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,0.0,0.0,0.566667,0.513514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
starlight.Link6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.659091,0.251064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.588235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
syr.compute.c0.Link26,0.0,0.0,0.0,0.0,0.0,0.0,0.563107,0.0,0.327526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0,0.571429,0.0,0.0,0.0,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unl.router.n1.Link14,0.0,0.0,0.58,0.0,0.0,0.551724,0.0,0.0,0.0,0.0,0.528256,0.0,0.0,0.0,0.631579,0.0,0.0,0.0,0.0,0.0,0.0,0.62,0.642857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594937,0.0,0.0
ucsd.compute.c0.Link23,0.0,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.616279,0.0,0.0,0.0,0.0,0.503937,0.0,0.0,0.0,0.0,0.0,0.0
esnet.Link22,0.434783,0.0,0.0,0.0,0.0,0.0,0.538462,0.0,0.183746,0.0,0.06746,0.33237,0.0,0.0,0.714286,0.0,0.0,0.0,0.602564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.583333,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0
cenic.Link1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353791,0.0,0.0,0.0,0.0,0.0,0.0,0.588235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.551724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cenic.Link12,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0,0.25,0.356574,0.394521,0.0,0.0,0.0,0.0,0.0,0.551181,0.0,0.0,0.0,0.5,0.0,0.615385,0.0,0.0,0.0,0.615385,0.58042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unl.compute.c1.Link16,0.0,0.0,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.528926,0.0,0.0
ucsd.compute.c1.Link13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.562016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [345]:
df_prob

Unnamed: 0,flow_ucsd-submit-syr-compute-c1,flow_ucsd-submit-ucsd-compute-c0,flow_syr-submit-unl-compute-c1,flow_unl-submit-uc-compute-c1,flow_unl-submit-ucsd-compute-c1,flow_uc-submit-unl-compute-c0,flow_ucsd-submit-syr-compute-c0,flow_unl-submit-syr-compute-c1,dst_label_syr,dst_label_ucsd,dst_label_unl,dst_label_uc,flow_uc-submit-uc-compute-c0,flow_syr-submit-syr-compute-c1,flow_ucsd-submit-unl-compute-c0,flow_unl-submit-unl-compute-c1,flow_uc-submit-syr-compute-c0,flow_syr-submit-ucsd-compute-c0,flow_ucsd-submit-uc-compute-c1,flow_unl-submit-syr-compute-c0,flow_uc-submit-syr-compute-c1,flow_syr-submit-unl-compute-c0,flow_uc-submit-unl-compute-c1,flow_syr-submit-syr-compute-c0,flow_unl-submit-uc-compute-c0,flow_syr-submit-uc-compute-c0,flow_uc-submit-ucsd-compute-c0,flow_uc-submit-ucsd-compute-c1,flow_ucsd-submit-ucsd-compute-c1,flow_ucsd-submit-uc-compute-c0,flow_unl-submit-unl-compute-c0,flow_unl-submit-ucsd-compute-c0,flow_uc-submit-uc-compute-c1,flow_syr-submit-ucsd-compute-c1,flow_syr-submit-uc-compute-c1,flow_ucsd-submit-unl-compute-c1,flow_syr-submit-httpgeni-orca,dst_label_httpgeni-orca
syr.router.n2.Link24,0.555556,0.0,0.0,0.0,0.0,0.0,0.428571,0.66,0.332168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,0.0,0.0,0.566667,0.513514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
starlight.Link6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.659091,0.251064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.588235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
syr.compute.c0.Link26,0.0,0.0,0.0,0.0,0.0,0.0,0.563107,0.0,0.327526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0,0.571429,0.0,0.0,0.0,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unl.router.n1.Link14,0.0,0.0,0.58,0.0,0.0,0.551724,0.0,0.0,0.0,0.0,0.528256,0.0,0.0,0.0,0.631579,0.0,0.0,0.0,0.0,0.0,0.0,0.62,0.642857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594937,0.0,0.0
ucsd.compute.c0.Link23,0.0,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.616279,0.0,0.0,0.0,0.0,0.503937,0.0,0.0,0.0,0.0,0.0,0.0
esnet.Link22,0.434783,0.0,0.0,0.0,0.0,0.0,0.538462,0.0,0.183746,0.0,0.06746,0.33237,0.0,0.0,0.714286,0.0,0.0,0.0,0.602564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.583333,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0
cenic.Link1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353791,0.0,0.0,0.0,0.0,0.0,0.0,0.588235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.551724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cenic.Link12,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0,0.25,0.356574,0.394521,0.0,0.0,0.0,0.0,0.0,0.551181,0.0,0.0,0.0,0.5,0.0,0.615385,0.0,0.0,0.0,0.615385,0.58042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unl.compute.c1.Link16,0.0,0.0,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.528926,0.0,0.0
ucsd.compute.c1.Link13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.562016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [346]:
y_t[0]

'syr.router.n2.Link24'

In [None]:
## not important data 

In [None]:
df_prob_test=df_prob[(df_prob['ucsd-submit-syr-compute-c0']!=0) | (df_prob['ucsd-submit-ucsd-compute-c1']!=0) | (df_prob['ucsd-submit-unl-compute-c0']!=0)]

In [None]:
X_test=df_prob_test.to_numpy()
X_test=np.nan_to_num(X_test)
print(X_test)
y_test=df_prob_test.index

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [None]:
kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)

In [None]:
score=gpc_rbf_isotropic.score(X,y)
print(score)

In [None]:
gpc_rbf_isotropic.score(X, y)

In [None]:
df_c=df_ori[(df_ori['corrupt_label']=="syr-cache ")]
display(df_c)

In [None]:
df_c=df_ori[(df_ori['checksum_success']==False)&(df_ori['execution_host']=='syr-compute-c0')&(df_ori['corrupt_label']=='cenic.Link12 ')]
print(len(df))
print(len(df_c))
#df_c=df[(df['corrupt_label']=="unl-compute-c1")]
display(df_c)

In [None]:
gpc_rbf_isotropic.classes_