In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
#input_file = "./output_20200219_2125PM.csv"
input_file = "../data/exogeni/output_20210207_1732PM/output_20210207_1732PM.csv" #OSG Big Topology
df = pd.read_csv(input_file, header = 0)

In [3]:
df.shape

(13018740, 15)

In [4]:
df['FLOW'] = df['SRCNODE']+'-'+df['DESTNODE']
df['FM']=df['FAILURE']+df['MISSING']

In [5]:
df_f=df[df['FM']==1]

In [6]:
df_f.shape

(63159, 17)

In [7]:
def generate_prob_dict(df):
    # summarize the class distribution
    #target = df.values[:,-5]
    target = df['LABEL']
    counter = Counter(target)
    label_prob_dict={}
    for k,v in counter.items():
        per = v / len(target) * 100
        #print('Class=%s, Count=%d, Percentage=%.3f%%' % (k, v, per))
        df_c=df[(df['LABEL']==k)]
        target_flow=df_c['FLOW']
        counter_flow = Counter(target_flow)
        flow_prob_dict={}
        for i,j in counter_flow.items():
            #print(i+":"+str(j)+":"+str(len(target_flow)))
            per_flow = j / len(target_flow) * 100
            #print('flow=%s, Count=%d, Per__flow=%.3f%%' % (i, j, per_flow))
            df_flow=df_c[(df_c['FLOW']==i)]
            flow_count=df_flow['FM'].value_counts(normalize=True)
            #print(flow_count.index.tolist())
            #print(flow_count.values.tolist())
            flow_count_dict=flow_count.to_dict()
            #print(flow_count_dict)
            if 1 in flow_count_dict:
                flow_prob_dict[i]=flow_count_dict[1]
            else:
                flow_prob_dict[i]=0
        label_prob_dict[k]=flow_prob_dict
    return label_prob_dict

In [8]:
def generate_prob_sample_df(df, sample_set, frac):
    frames=[]
    for k in range(sample_set):
        df_sample = df.sample(frac=frac, random_state=1)
        label_prob_dict=generate_prob_dict(df_sample)
        #'index': label, the key of the dict, is the row
        df_prob = pd.DataFrame.from_dict(label_prob_dict, orient='index')
        frames.append(df_prob)
    result = pd.concat(frames)
    X=result.to_numpy()
    X=np.nan_to_num(X)
    #print(X)
    y=result.index
    return X,y

In [9]:
def generate_prob_df(df):
    label_prob_dict=generate_prob_dict(df)
    #'index': label, the key of the dict, is the row
    df_prob = pd.DataFrame.from_dict(label_prob_dict, orient='index')
    X=df_prob.to_numpy()
    X=np.nan_to_num(X)
    #print(X)
    y=df_prob.index
    return X,y

In [10]:
# grid search kernel for gaussian process classifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

In [11]:
def GP_cv_learn(X, y):
    # define model
    model = GaussianProcessClassifier()
    # define model evaluation method
    cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=1)
    # define grid
    grid = dict()
    grid['kernel'] = [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]
    # define search
    search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
    #search = GridSearchCV(model, grid, scoring='accuracy')
    # perform the search
    results = search.fit(X, y)
    # summarize best
    print('Best Mean Accuracy: %.3f' % results.best_score_)
    print('Best Config: %s' % results.best_params_)
    # summarize all
    means = results.cv_results_['mean_test_score']
    params = results.cv_results_['params']
    for mean, param in zip(means, params):
        print(">%.3f with: %r" % (mean, param))

    #kernel = 1.0 * RBF([1.0])
    #kernel = 1*DotProduct()
    #gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    #gpc.score(X, y)

In [12]:
def GP_learn(X, y):
    # define model
    model = GaussianProcessClassifier()
    kernel = 1*RationalQuadratic(alpha=1, length_scale=1)
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    score = gpc.score(X, y)
    return score

In [171]:
X,y=generate_prob_sample_df(df, 10, 0.5)

In [159]:
GP_cv_learn(X,y)

Best Mean Accuracy: 0.272
Best Config: {'kernel': 1**2 * RBF(length_scale=1)}
>0.272 with: {'kernel': 1**2 * RBF(length_scale=1)}
>0.100 with: {'kernel': 1**2 * DotProduct(sigma_0=1)}
>0.119 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
>0.186 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.014 with: {'kernel': 1**2 * WhiteKernel(noise_level=1)}


In [16]:
X,y=generate_prob_df(df)

In [17]:
print(y)

Index(['0', 'CacheUChic', 'CacheFNAL', 'OriginUNL', 'CacheUNL', 'OriginFNAL',
       'CacheNYU', 'CacheSyracuse', 'CacheBNL', 'CacheSeattle',
       ...
       'SaltLakeRouter_Link10', 'SaltLakeRouter_Link7',
       'SaltLakeRouter_Link36', 'SaltLakeRouter_Link34',
       'SeattleRouter_Link10', 'SeattleRouter_Link0', 'SeattleRouter_Link8',
       'SeattleRouter_Link11', 'SeattleRouter_Link24', 'OriginCal_Link22'],
      dtype='object', length=140)


In [20]:
score = GP_learn(X,y)
print(score)

0.10714285714285714


In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
import sklearn
import graphviz 
from sklearn.calibration import CalibratedClassifierCV

In [18]:
clf_prob = DecisionTreeClassifier(max_depth = 20)
clf_model=clf_prob.fit(X,y)

In [19]:
score=clf_prob.score(X,y)
print(score)

0.6142857142857143


In [23]:
clf_rf_prob = RandomForestClassifier(max_depth = 20)
clf_rf_model=clf_rf_prob.fit(X,y)

In [24]:
score=clf_rf_prob.score(X,y)
print(score)

0.6285714285714286


In [165]:
from sklearn import svm

In [166]:
linear_svc=svm.LinearSVC(random_state=0)
lin_clf=linear_svc.fit(X, y)

In [167]:
score=linear_svc.score(X,y)
print(score)

0.7246376811594203
