In [2]:
import csv
import pandas as pd
import statistics
import random
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score

In [14]:
def Check_Significant_Energy_Distance(df_1, df_2, threshold=0.24):
    """Flag features whose energy distance between two windows exceeds a threshold.

    For every column of ``df_1`` the one-dimensional energy distance
    (``scipy.stats.energy_distance``) between that column in ``df_1`` and the
    same-named column in ``df_2`` is computed. Every feature whose distance is
    strictly greater than ``threshold`` is reported with ``print`` and collected.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        First window; its columns define which features are compared.
    df_2 : pandas.DataFrame
        Second window; it is indexed with every column label of ``df_1``.
    threshold : float, optional
        Distance above which a feature is considered significantly different.
        Defaults to 0.24, the value that used to be hard-coded.

    Returns
    -------
    tuple of (bool, list)
        ``True`` if at least one feature exceeded the threshold, together with
        the list of those feature labels in column order.
    """
    feature_detected = []

    for feature in df_1.columns:
        distance = stats.energy_distance(df_1[feature], df_2[feature])

        if distance > threshold:
            # str(feature) keeps the report working when column labels are not
            # strings (e.g. the default integer labels of a DataFrame).
            print("Feature Name: " + str(feature) + " Distance: " + str(distance))
            feature_detected.append(feature)

    # The flag is exactly "something was detected".
    return bool(feature_detected), feature_detected

# ElecNorm Evaluation

In [16]:
# Batch-wise evaluation on the ElecNorm stream.
# Two random forests are compared on every full batch:
#   * random_forest_no_change - fitted once on batch 0 and never updated;
#   * random_forest_distance  - refitted on the current batch whenever
#     Check_Significant_Energy_Distance reports a significant distance.
batch_size = 1440

batch_index = 0
batch = []

reference_window = []
current_window = []

batches = []
drifts_detected = []
eval_accuracy_no_change = []
eval_accuracy_distance = []

# Dedicated, seeded generator: the reference-window subsample (and therefore
# the distances and detected drift points) is the same on every run and does
# not depend on the state of the global `random` module.
sampling_rng = random.Random(42)

# newline='' is the csv-module recommended way to open files for csv.reader.
with open('Datasets/Real_ElecNorm.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        batch.append(row)
        if len(batch) != batch_size:
            continue

        # A full batch is available. Rows left over after the last full batch
        # never reach this point and are therefore not evaluated.
        batches.append(batch_index)
        print("Currently working on batch " + str(batch_index))

        if batch_index == 0:
            # Batch 0 seeds the reference window and trains both classifiers.
            reference_window.extend(batch)
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            # Column '8' is used as the label; column '0' is excluded from the features.
            reference_window_x = reference_window_df.drop(['0', '8'], axis=1)
            reference_window_y = reference_window_df['8']

            random_forest_no_change = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_no_change.fit(reference_window_x, reference_window_y)

            random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_distance.fit(reference_window_x, reference_window_y)

            # NOTE: for batch 0 both accuracies are measured on the data the
            # models were just fitted on.
            pred_no_change = random_forest_no_change.predict(reference_window_x)
            accuracy_no_change = accuracy_score(reference_window_y, pred_no_change)
            eval_accuracy_no_change.append(accuracy_no_change)

            pred_distance = random_forest_distance.predict(reference_window_x)
            accuracy_distance = accuracy_score(reference_window_y, pred_distance)
            eval_accuracy_distance.append(accuracy_distance)
        else:
            current_window.extend(batch)
            # Compare against a reference subsample of the same size as the
            # current window.
            sample_reference_window = sampling_rng.sample(reference_window, len(current_window))

            reference_window_df = pd.DataFrame(sample_reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(['0', '8'], axis=1)

            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop(['0', '8'], axis=1)
            current_window_y = current_window_df['8']

            # Test first: both models are scored on the batch before any update.
            pred_no_change = random_forest_no_change.predict(current_window_x)
            accuracy_no_change = accuracy_score(current_window_y, pred_no_change)
            eval_accuracy_no_change.append(accuracy_no_change)

            pred_distance = random_forest_distance.predict(current_window_x)
            accuracy_distance = accuracy_score(current_window_y, pred_distance)
            eval_accuracy_distance.append(accuracy_distance)

            drift_flag, drift_feature = Check_Significant_Energy_Distance(reference_window_x, current_window_x)

            if drift_flag:
                drifts_detected.append(batch_index)
                print('Drift Detected at batch ' + str(batch_index))
                # Restart the reference window from the current batch and
                # refit the adaptive model on it.
                reference_window = list(current_window)
                random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
                random_forest_distance.fit(current_window_x, current_window_y)
            else:
                # No drift: the current batch simply grows the reference window.
                reference_window.extend(current_window)

        current_window = []
        batch = []
        batch_index = batch_index + 1

print(batches)
print(eval_accuracy_no_change)
print(eval_accuracy_distance)
print(drifts_detected)

Currently working on batch 0
Currently working on batch 1
Currently working on batch 2
Currently working on batch 3
Currently working on batch 4
Currently working on batch 5
Currently working on batch 6
Currently working on batch 7
Currently working on batch 8
Currently working on batch 9
Currently working on batch 10
Currently working on batch 11
Currently working on batch 12
Feature Name: 6 Distance: 0.2516744909582866
Feature Name: 7 Distance: 0.30790173391189124
Drift Detected at batch 12
Currently working on batch 13
Currently working on batch 14
Currently working on batch 15
Currently working on batch 16
Currently working on batch 17
Feature Name: 4 Distance: 0.248246323366546
Drift Detected at batch 17
Currently working on batch 18
Currently working on batch 19
Currently working on batch 20
Feature Name: 7 Distance: 0.2906080513062038
Drift Detected at batch 20
Currently working on batch 21
Currently working on batch 22
Currently working on batch 23
Currently working on batch 24

# Phishing Evaluation

In [None]:
# Batch-wise evaluation on the Phishing stream.
# Two random forests are compared on every full batch:
#   * random_forest_no_change - fitted once on batch 0 and never updated;
#   * random_forest_distance  - refitted on the current batch whenever
#     Check_Significant_Energy_Distance reports a significant distance.
batch_size = 1000

batch_index = 0
batch = []

reference_window = []
current_window = []

batches = []
drifts_detected = []
eval_accuracy_no_change = []
eval_accuracy_distance = []

# Dedicated, seeded generator: the reference-window subsample (and therefore
# the distances and detected drift points) is the same on every run and does
# not depend on the state of the global `random` module.
sampling_rng = random.Random(42)

# newline='' is the csv-module recommended way to open files for csv.reader.
with open('Datasets/Real_Phishing.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        batch.append(row)
        if len(batch) != batch_size:
            continue

        # A full batch is available. Rows left over after the last full batch
        # never reach this point and are therefore not evaluated.
        batches.append(batch_index)
        print("Currently working on batch " + str(batch_index))

        if batch_index == 0:
            # Batch 0 seeds the reference window and trains both classifiers.
            reference_window.extend(batch)
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            # Column '46' is used as the label; every other column is a feature.
            reference_window_x = reference_window_df.drop('46', axis=1)
            reference_window_y = reference_window_df['46']

            random_forest_no_change = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_no_change.fit(reference_window_x, reference_window_y)

            random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_distance.fit(reference_window_x, reference_window_y)

            # NOTE: for batch 0 both accuracies are measured on the data the
            # models were just fitted on.
            pred_no_change = random_forest_no_change.predict(reference_window_x)
            accuracy_no_change = accuracy_score(reference_window_y, pred_no_change)
            eval_accuracy_no_change.append(accuracy_no_change)

            pred_distance = random_forest_distance.predict(reference_window_x)
            accuracy_distance = accuracy_score(reference_window_y, pred_distance)
            eval_accuracy_distance.append(accuracy_distance)
        else:
            current_window.extend(batch)
            # Compare against a reference subsample of the same size as the
            # current window.
            sample_reference_window = sampling_rng.sample(reference_window, len(current_window))

            reference_window_df = pd.DataFrame(sample_reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop('46', axis=1)

            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop('46', axis=1)
            current_window_y = current_window_df['46']

            # Test first: both models are scored on the batch before any update.
            pred_no_change = random_forest_no_change.predict(current_window_x)
            accuracy_no_change = accuracy_score(current_window_y, pred_no_change)
            eval_accuracy_no_change.append(accuracy_no_change)

            pred_distance = random_forest_distance.predict(current_window_x)
            accuracy_distance = accuracy_score(current_window_y, pred_distance)
            eval_accuracy_distance.append(accuracy_distance)

            drift_flag, drift_feature = Check_Significant_Energy_Distance(reference_window_x, current_window_x)

            if drift_flag:
                drifts_detected.append(batch_index)
                print('Drift Detected at batch ' + str(batch_index))
                # Restart the reference window from the current batch and
                # refit the adaptive model on it.
                reference_window = list(current_window)
                random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
                random_forest_distance.fit(current_window_x, current_window_y)
            else:
                # No drift: the current batch simply grows the reference window.
                reference_window.extend(current_window)

        current_window = []
        batch = []
        batch_index = batch_index + 1

print(batches)
print(eval_accuracy_no_change)
print(eval_accuracy_distance)
print(drifts_detected)