In [None]:
import csv
import math
import statistics

import eli5
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import pandas as pd
import shap
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
def Check_Significant_Wasserstein_Distance(df_1, df_2, threshold=0.05):
    """Flag features whose value distribution differs between two windows.

    For every column of ``df_1`` the 1-D Wasserstein distance between that
    column in ``df_1`` and the same column in ``df_2`` is computed. A feature
    is reported when its distance is strictly greater than ``threshold``.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Reference window. Its columns decide which features are compared.
    df_2 : pandas.DataFrame
        Current window. Must contain every column of ``df_1``.
    threshold : float, optional
        Distance above which a feature counts as drifted. Defaults to 0.05,
        the value that used to be hard-coded. The distance is not rescaled
        here, so the threshold is expressed in the feature's own units.

    Returns
    -------
    tuple of (bool, list)
        ``True`` when at least one feature exceeded the threshold, and the
        labels of those features in column order.
    """
    feature_detected = []
    for feature in df_1.columns:
        distance = stats.wasserstein_distance(df_1[feature], df_2[feature])

        if distance > threshold:
            # str(feature): column labels are not always strings (a frame
            # built without explicit names has integer labels).
            print("Feature Name: " + str(feature) + " Distance: " + str(distance))
            feature_detected.append(feature)

    return bool(feature_detected), feature_detected

In [None]:
def Check_Significant_Shap(model, reference_x, current_x):
    """Compare per-feature SHAP values of two windows with a t-test.

    SHAP values are computed for both windows with a ``shap.TreeExplainer``
    built from ``model``. For each feature, the mean, sample standard
    deviation and count of its SHAP values in each window are passed to
    ``Two_Sample_T_test``; features it reports as significant are collected.

    Parameters
    ----------
    model
        Fitted tree model accepted by ``shap.TreeExplainer``.
    reference_x, current_x
        Feature matrices of the reference and current window. Each must hold
        at least two rows (``statistics.stdev`` needs two values).

    Returns
    -------
    tuple of (bool, list of int)
        Whether any feature was significant, and the positional indices of
        the significant features.
    """
    print("Checking Significant by Shap values")
    # One explainer serves both windows.
    explainer = shap.TreeExplainer(model)
    # NOTE(review): the trailing [1] assumes shap_values() returns one array
    # per class and that index 1 is the class of interest - confirm against
    # the installed shap version.
    ref_shap_values = explainer.shap_values(reference_x)[1]
    cur_shap_values = explainer.shap_values(current_x)[1]

    num_features = len(ref_shap_values[0])

    update_flag = False
    feature_detected = []
    for i in range(num_features):
        # Each window is walked over its own rows, so the two windows may
        # contain different numbers of instances.
        ref_feature_shap_value = [row[i] for row in ref_shap_values]
        cur_feature_shap_value = [row[i] for row in cur_shap_values]

        ref_mean = statistics.mean(ref_feature_shap_value)
        ref_stdev = statistics.stdev(ref_feature_shap_value)
        ref_size = len(ref_feature_shap_value)

        cur_mean = statistics.mean(cur_feature_shap_value)
        cur_stdev = statistics.stdev(cur_feature_shap_value)
        cur_size = len(cur_feature_shap_value)

        significant, p = Two_Sample_T_test(ref_mean, cur_mean, ref_stdev, cur_stdev, ref_size, cur_size)

        if significant:
            print("Shapley Values Drift Detected at Feature " + str(i))
            print("P-value: " + str(p))
            update_flag = True
            feature_detected.append(i)
    return update_flag, feature_detected

In [None]:
def Check_Significant_LIME(model, lime_explainer, reference_window, current_window, label_column='26'):
    """Compare per-position LIME weight statistics of two windows with a t-test.

    Only rows whose label equals 1.0 are explained. For both windows the label
    column is dropped, ``Get_LIME_Distribution`` summarises the LIME weights,
    and each pair of summaries is passed to ``Two_Sample_T_test``.

    Parameters
    ----------
    model
        Fitted classifier; forwarded to ``Get_LIME_Distribution``.
    lime_explainer
        LIME explainer; forwarded to ``Get_LIME_Distribution``.
    reference_window, current_window : pandas.DataFrame
        Windows that still contain the label column.
    label_column : str, optional
        Name of the label column. Defaults to ``'26'``, the value that used
        to be hard-coded.

    Returns
    -------
    tuple of (bool, list of int)
        Whether any entry was significant, and the indices of the
        significant entries.
    """
    print("Checking Significant by LIME values")
    ref_positive = reference_window.loc[reference_window[label_column].astype(float) == 1.0]
    ref_lime_distribution = Get_LIME_Distribution(ref_positive.drop(label_column, axis=1), lime_explainer, model)

    cur_positive = current_window.loc[current_window[label_column].astype(float) == 1.0]
    cur_lime_distribution = Get_LIME_Distribution(cur_positive.drop(label_column, axis=1), lime_explainer, model)

    update_flag = False
    feature_detected = []
    for i in range(len(ref_lime_distribution)):
        # Each entry is a (mean, stdev, sample count) triple.
        ref_lime_mean, ref_lime_stdev, ref_lime_size = ref_lime_distribution[i]
        cur_lime_mean, cur_lime_stdev, cur_lime_size = cur_lime_distribution[i]

        significant, p = Two_Sample_T_test(ref_lime_mean, cur_lime_mean, ref_lime_stdev, cur_lime_stdev,
                                           ref_lime_size, cur_lime_size)

        if significant:
            print("LIME Values Drift Detected at Feature " + str(i))
            print("P-value: " + str(p))
            update_flag = True
            feature_detected.append(i)
    return update_flag, feature_detected

In [None]:
def Get_LIME_Distribution(df, explainer, model):
    """Summarise LIME weights per feature over every row of ``df``.

    Each row is explained with ``explainer.explain_instance`` using
    ``model.predict_proba`` and as many features as ``df`` has columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows to explain; cast to float before being explained.
    explainer
        Object exposing ``explain_instance(row, predict_fn, num_features=...)``
        whose result exposes ``as_map()``.
    model
        Object exposing ``predict_proba``.

    Returns
    -------
    list of (mean, stdev, count)
        One tuple per column of ``df``, in column order. ``stdev`` is 0 when
        only a single row was explained.
    """
    values = df.astype(float).values
    num_features = len(df.columns)

    # One {feature index: weight} mapping per explained row. as_map() keys the
    # weights by feature index, whereas as_list() is ordered by weight
    # magnitude, so position i in as_list() is not feature i. Label 1 is the
    # label as_list() reads by default.
    weights_per_row = []
    for row in values:
        explanation = explainer.explain_instance(row, model.predict_proba, num_features=num_features)
        weights_per_row.append(dict(explanation.as_map()[1]))

    LIME_distributions = []
    for i in range(num_features):
        feature_LIME_values = [weights[i] for weights in weights_per_row]
        feature_LIME_mean = statistics.mean(feature_LIME_values)
        # stdev needs two data points; fall back to 0 for a single row, the
        # same convention Get_ELI5_Explaination uses.
        if len(feature_LIME_values) > 1:
            feature_LIME_stdev = statistics.stdev(feature_LIME_values)
        else:
            feature_LIME_stdev = 0

        LIME_distributions.append((feature_LIME_mean, feature_LIME_stdev, len(feature_LIME_values)))

    return LIME_distributions

In [None]:
def Check_Significant_ELI5_Local(model, reference_window, current_window, label_column='10'):
    """Compare ELI5 local-explanation weights of two windows with a t-test.

    Only rows whose label equals 1.0 are explained. For both windows the label
    column is dropped and ``Get_ELI5_Explaination`` summarises the weights per
    feature. Features present in both summaries are passed to
    ``Two_Sample_T_test``.

    Parameters
    ----------
    model
        Fitted model; forwarded to ``Get_ELI5_Explaination``.
    reference_window, current_window : pandas.DataFrame
        Windows that still contain the label column.
    label_column : str, optional
        Name of the label column. Defaults to ``'10'``, the value that used
        to be hard-coded; pass the dataset's own label column otherwise.

    Returns
    -------
    list
        Names (as reported by ELI5) of the features found significant.
    """
    print("Checking Significant by ELI5 Local Explaination")
    ref_positive = reference_window.loc[reference_window[label_column].astype(float) == 1.0]
    ref_eli5_distribution = Get_ELI5_Explaination(model, ref_positive.drop(label_column, axis=1))

    cur_positive = current_window.loc[current_window[label_column].astype(float) == 1.0]
    cur_eli5_distribution = Get_ELI5_Explaination(model, cur_positive.drop(label_column, axis=1))

    feature_detected = []
    for feature in ref_eli5_distribution:
        # A feature that only shows up in one window cannot be compared.
        if feature not in cur_eli5_distribution:
            continue

        # Each entry is [mean, stdev, sample count].
        reference_mean, reference_stdev, reference_size = ref_eli5_distribution[feature]
        current_mean, current_stdev, current_size = cur_eli5_distribution[feature]

        eli5_significant, p = Two_Sample_T_test(reference_mean, current_mean, reference_stdev, current_stdev,
                                                reference_size, current_size)

        if eli5_significant:
            print("Feature Name: " + feature)
            print("P-value: " + str(p))
            feature_detected.append(feature)
    return feature_detected

In [None]:
def Get_ELI5_Explaination(model, df):
    """Summarise ELI5 per-prediction feature weights over every row of ``df``.

    Each row (cast to float) is explained with ``eli5.explain_prediction`` and
    converted with ``eli5.format_as_dict``. Only the positively weighted
    features of the first target are read from the result.

    Parameters
    ----------
    model
        Model handed to ``eli5.explain_prediction``.
    df : pandas.DataFrame
        Feature rows to explain.

    Returns
    -------
    dict
        Maps each feature name reported by ELI5 to ``[mean, stdev, count]`` of
        the weights collected for it. ``stdev`` is 0 when a feature was
        collected only once.
    """
    weights_by_feature = {}
    for row in df.astype(float).values:
        explanation = eli5.format_as_dict(eli5.explain_prediction(model, row))
        for entry in explanation['targets'][0]['feature_weights']['pos']:
            weights_by_feature.setdefault(entry['feature'], []).append(entry['weight'])

    feature_distribution = {}
    for feature, collected in weights_by_feature.items():
        average = statistics.mean(collected)
        # statistics.stdev needs at least two observations.
        spread = statistics.stdev(collected) if len(collected) > 1 else 0
        feature_distribution[feature] = [average, spread, len(collected)]
    return feature_distribution

In [None]:
def Check_Significant_Permutation_Importance(model, reference_window, current_window, label_column='10'):
    """Compare permutation importances of two windows with a t-test.

    For each window the label column is split off, an
    ``eli5.sklearn.PermutationImportance`` (``random_state=42``) is fitted on
    the window, and its ``feature_importances_`` / ``feature_importances_std_``
    are read. Each feature's pair of (mean, std) values is passed to
    ``Two_Sample_T_test``.

    Parameters
    ----------
    model
        Model handed to ``PermutationImportance``.
    reference_window, current_window : pandas.DataFrame
        Windows that still contain the label column.
    label_column : str, optional
        Name of the label column. Defaults to ``'10'``, the value that used
        to be hard-coded; pass the dataset's own label column otherwise.

    Returns
    -------
    list of int
        Positional indices of the features found significant.
    """
    print("Checking Significant by Permutation Importance")
    reference_window_x = reference_window.drop(label_column, axis=1)
    reference_window_y = reference_window[label_column]
    ref_pi = eli5.sklearn.PermutationImportance(model, random_state=42).fit(reference_window_x, reference_window_y)
    ref_means = ref_pi.feature_importances_
    ref_stdevs = ref_pi.feature_importances_std_
    # NOTE(review): the sample size handed to the t-test is the number of rows
    # in the window. If the std reported by PermutationImportance is taken
    # over its shuffling rounds rather than over rows, the round count would
    # be the matching sample size - confirm.
    ref_size = len(reference_window_y)

    current_window_x = current_window.drop(label_column, axis=1)
    current_window_y = current_window[label_column]
    cur_pi = eli5.sklearn.PermutationImportance(model, random_state=42).fit(current_window_x, current_window_y)
    cur_means = cur_pi.feature_importances_
    cur_stdevs = cur_pi.feature_importances_std_
    cur_size = len(current_window_y)

    feature_detected = []
    for i in range(len(ref_means)):
        pi_significant, p = Two_Sample_T_test(ref_means[i], cur_means[i], ref_stdevs[i], cur_stdevs[i],
                                              ref_size, cur_size)
        if pi_significant:
            print("Feature Name: " + str(i))
            print("P-value: " + str(p))
            feature_detected.append(i)
    return feature_detected

In [None]:
def Two_Sample_T_test(expected_mean, current_mean, expected_sd, current_sd, expected_size, current_size):
    """Two-sided two-sample t-test from summary statistics.

    The statistic is ``(expected_mean - current_mean)`` divided by
    ``sqrt(expected_sd**2 / expected_size + current_sd**2 / current_size)``,
    evaluated against a Student t distribution with
    ``expected_size + current_size - 2`` degrees of freedom.

    Parameters
    ----------
    expected_mean, current_mean : float
        Sample means of the reference and current sample.
    expected_sd, current_sd : float
        Sample standard deviations.
    expected_size, current_size : int
        Sample sizes; a size of zero raises ``ZeroDivisionError``.

    Returns
    -------
    tuple of (bool, float)
        ``True`` when the two-sided p-value is below 0.05, and the p-value.
    """
    mean_diff = expected_mean - current_mean
    squared_error = math.pow(expected_sd, 2) / expected_size + math.pow(current_sd, 2) / current_size
    if squared_error == 0:
        # Both samples have zero spread: substitute a tiny value so the
        # division below is defined.
        squared_error = 0.0000000001
    t = mean_diff / math.sqrt(squared_error)
    df = expected_size + current_size - 2
    # Two-sided p-value: the tail must be taken at |t|. Using the signed t
    # yields p > 1 whenever the current mean exceeds the expected mean, so
    # such shifts could never be reported. sf() is the upper tail, 1 - cdf.
    p = 2 * stats.t.sf(abs(t), df=df)

    return bool(p < 0.05), p

# Adult Dataset

In [None]:
# Adult dataset
# Chunked drift experiment. Two identically configured random forests are fit
# on the first full chunk: `random_forest_no_change` is never refit, while
# `random_forest_distance` is refit on the current chunk whenever the
# Wasserstein check reports a drifted feature. Both are scored on every full
# chunk and the scores are collected in the *_accuracy lists.
chunk_size = 2500
label_column = '65'  # label column, addressed by its stringified positional name
chunk_num = 0
chunk = []

reference_window = []
current_window = []

drifts_detected = []

chunks = []
no_change_accuracy = []
distance_accuracy = []

# newline='' is how the csv module asks for files to be opened for csv.reader.
with open('Datasets/1_adult.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        chunk.append(row)
        if len(chunk) != chunk_size:
            continue
        # From here on `chunk` holds exactly chunk_size rows. Rows left over at
        # the end of the file never fill a chunk and are therefore not scored.

        if chunk_num == 0:
            reference_window = list(chunk)
            reference_window_df = pd.DataFrame(reference_window)
            # Default integer column names become strings so that columns can
            # be addressed by string labels.
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)
            reference_window_y = reference_window_df[label_column]

            print("Train Initial Classifier")
            random_forest_no_change = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_no_change.fit(reference_window_x, reference_window_y)

            random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_distance.fit(reference_window_x, reference_window_y)

            # The first chunk is scored on the rows the models were just fit on.
            scoring_x, scoring_y = reference_window_x, reference_window_y
        else:
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)

            current_window = list(chunk)
            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop(label_column, axis=1)
            current_window_y = current_window_df[label_column]

            scoring_x, scoring_y = current_window_x, current_window_y

        pred_no_change = random_forest_no_change.predict(scoring_x)
        accuracy_no_change = accuracy_score(scoring_y, pred_no_change)
        no_change_accuracy.append(accuracy_no_change)
        print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

        pred_distance = random_forest_distance.predict(scoring_x)
        accuracy_distance = accuracy_score(scoring_y, pred_distance)
        distance_accuracy.append(accuracy_distance)
        print("Prediction Accuracy - Distance: " + str(accuracy_distance))

        chunks.append(chunk_num)

        if chunk_num > 0:
            significant, distance_feature_detected = Check_Significant_Wasserstein_Distance(reference_window_x, current_window_x)

            if significant:
                # NOTE(review): `features` is not read anywhere in this cell.
                features = reference_window_x.columns.astype(int)

                # NOTE(review): Check_Significant_Permutation_Importance takes
                # its label from column '10' by default, while this dataset's
                # label is column '65' - the importances printed here are
                # scored against the wrong column until the two are aligned.
                pi_feature_detected = Check_Significant_Permutation_Importance(random_forest_distance, reference_window_df, current_window_df)

                drifts_detected.append(chunk_num)
                print('Drift Detected at chunk ' + str(chunk_num))
                print("Updating Model")
                random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
                random_forest_distance.fit(current_window_x, current_window_y)

            # The chunk just scored becomes the reference for the next one.
            reference_window = list(chunk)
            current_window = []

        chunk = []
        chunk_num = chunk_num + 1

# Bank Dataset

In [None]:
# Bank dataset
# Chunked drift experiment. Two identically configured random forests are fit
# on the first full chunk: `random_forest_no_change` is never refit, while
# `random_forest_distance` is refit on the current chunk whenever the
# Wasserstein check reports a drifted feature. Both are scored on every full
# chunk and the scores are collected in the *_accuracy lists.
chunk_size = 2500
label_column = '48'  # label column, addressed by its stringified positional name
chunk_num = 0
chunk = []

reference_window = []
current_window = []

drifts_detected = []

chunks = []
no_change_accuracy = []
distance_accuracy = []

# newline='' is how the csv module asks for files to be opened for csv.reader.
with open('Datasets/2_bank.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        chunk.append(row)
        if len(chunk) != chunk_size:
            continue
        # From here on `chunk` holds exactly chunk_size rows. Rows left over at
        # the end of the file never fill a chunk and are therefore not scored.

        if chunk_num == 0:
            reference_window = list(chunk)
            reference_window_df = pd.DataFrame(reference_window)
            # Default integer column names become strings so that columns can
            # be addressed by string labels.
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)
            reference_window_y = reference_window_df[label_column]

            print("Train Initial Classifier")
            random_forest_no_change = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_no_change.fit(reference_window_x, reference_window_y)

            random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_distance.fit(reference_window_x, reference_window_y)

            # The first chunk is scored on the rows the models were just fit on.
            scoring_x, scoring_y = reference_window_x, reference_window_y
        else:
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)

            current_window = list(chunk)
            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop(label_column, axis=1)
            current_window_y = current_window_df[label_column]

            scoring_x, scoring_y = current_window_x, current_window_y

        pred_no_change = random_forest_no_change.predict(scoring_x)
        accuracy_no_change = accuracy_score(scoring_y, pred_no_change)
        no_change_accuracy.append(accuracy_no_change)
        print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

        pred_distance = random_forest_distance.predict(scoring_x)
        accuracy_distance = accuracy_score(scoring_y, pred_distance)
        distance_accuracy.append(accuracy_distance)
        print("Prediction Accuracy - Distance: " + str(accuracy_distance))

        chunks.append(chunk_num)

        if chunk_num > 0:
            significant, distance_feature_detected = Check_Significant_Wasserstein_Distance(reference_window_x, current_window_x)

            if significant:
                # NOTE(review): `features` is not read anywhere in this cell.
                features = reference_window_x.columns.astype(int)

                # NOTE(review): Check_Significant_ELI5_Local filters on and
                # drops column '10' by default, while this dataset's label is
                # column '48' - the explanations printed here are built from
                # the wrong columns until the two are aligned.
                eli5_feature_detected = Check_Significant_ELI5_Local(random_forest_distance, reference_window_df, current_window_df)

                drifts_detected.append(chunk_num)
                print('Drift Detected at chunk ' + str(chunk_num))
                print("Updating Model")
                random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
                random_forest_distance.fit(current_window_x, current_window_y)

            # The chunk just scored becomes the reference for the next one.
            reference_window = list(chunk)
            current_window = []

        chunk = []
        chunk_num = chunk_num + 1

# Credit Dataset

In [None]:
# Credit dataset
# Chunked drift experiment. Two identically configured random forests are fit
# on the first full chunk: `random_forest_no_change` is never refit, while
# `random_forest_distance` is refit on the current chunk whenever the
# Wasserstein check reports a drifted feature. Both are scored on every full
# chunk and the scores are collected in the *_accuracy lists.
chunk_size = 1500
label_column = '26'  # label column, addressed by its stringified positional name
chunk_num = 0
chunk = []

reference_window = []
current_window = []

drifts_detected = []

chunks = []
no_change_accuracy = []
distance_accuracy = []

# newline='' is how the csv module asks for files to be opened for csv.reader.
with open('Datasets/3_credit.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        chunk.append(row)
        if len(chunk) != chunk_size:
            continue
        # From here on `chunk` holds exactly chunk_size rows. Rows left over at
        # the end of the file never fill a chunk and are therefore not scored.

        if chunk_num == 0:
            reference_window = list(chunk)
            reference_window_df = pd.DataFrame(reference_window)
            # Default integer column names become strings so that columns can
            # be addressed by string labels.
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)
            reference_window_y = reference_window_df[label_column]

            print("Train Initial Classifier")
            random_forest_no_change = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_no_change.fit(reference_window_x, reference_window_y)

            random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_distance.fit(reference_window_x, reference_window_y)

            # Built from the first chunk only. NOTE(review): nothing in this
            # cell passes `lime_explainer` on to a LIME check.
            lime_explainer = lime.lime_tabular.LimeTabularExplainer(reference_window_x.astype(float).values,
                                                                    mode='classification',
                                                                    training_labels=reference_window_y,
                                                                    feature_names=reference_window_x.columns)

            # The first chunk is scored on the rows the models were just fit on.
            scoring_x, scoring_y = reference_window_x, reference_window_y
        else:
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)

            current_window = list(chunk)
            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop(label_column, axis=1)
            current_window_y = current_window_df[label_column]

            scoring_x, scoring_y = current_window_x, current_window_y

        pred_no_change = random_forest_no_change.predict(scoring_x)
        accuracy_no_change = accuracy_score(scoring_y, pred_no_change)
        no_change_accuracy.append(accuracy_no_change)
        print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

        pred_distance = random_forest_distance.predict(scoring_x)
        accuracy_distance = accuracy_score(scoring_y, pred_distance)
        distance_accuracy.append(accuracy_distance)
        print("Prediction Accuracy - Distance: " + str(accuracy_distance))

        chunks.append(chunk_num)

        if chunk_num > 0:
            significant, distance_feature_detected = Check_Significant_Wasserstein_Distance(reference_window_x, current_window_x)

            if significant:
                # NOTE(review): `features` is not read anywhere in this cell.
                features = reference_window_x.columns.astype(int)

                # NOTE(review): Check_Significant_Permutation_Importance takes
                # its label from column '10' by default, while this dataset's
                # label is column '26' - the importances printed here are
                # scored against the wrong column until the two are aligned.
                pi_feature_detected = Check_Significant_Permutation_Importance(random_forest_distance, reference_window_df, current_window_df)

                drifts_detected.append(chunk_num)
                print('Drift Detected at chunk ' + str(chunk_num))
                print("Updating Model")
                random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
                random_forest_distance.fit(current_window_x, current_window_y)

            # The chunk just scored becomes the reference for the next one.
            reference_window = list(chunk)
            current_window = []

        chunk = []
        chunk_num = chunk_num + 1

# Gamma Dataset

In [None]:
# Gamma dataset
# Chunked drift experiment. Two identically configured random forests are fit
# on the first full chunk: `random_forest_no_change` is never refit, while
# `random_forest_distance` is refit on the current chunk whenever the
# Wasserstein check reports a drifted feature. Both are scored on every full
# chunk and the scores are collected in the *_accuracy lists.
chunk_size = 1000
label_column = '10'  # label column, addressed by its stringified positional name
chunk_num = 0
chunk = []

reference_window = []
current_window = []

drifts_detected = []

chunks = []
no_change_accuracy = []
distance_accuracy = []

# newline='' is how the csv module asks for files to be opened for csv.reader.
with open('Datasets/4_gamma.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        chunk.append(row)
        if len(chunk) != chunk_size:
            continue
        # From here on `chunk` holds exactly chunk_size rows. Rows left over at
        # the end of the file never fill a chunk and are therefore not scored.

        if chunk_num == 0:
            reference_window = list(chunk)
            reference_window_df = pd.DataFrame(reference_window)
            # Default integer column names become strings so that columns can
            # be addressed by string labels.
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)
            reference_window_y = reference_window_df[label_column]

            print("Train Initial Classifier")
            random_forest_no_change = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_no_change.fit(reference_window_x, reference_window_y)

            random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_distance.fit(reference_window_x, reference_window_y)

            # The first chunk is scored on the rows the models were just fit on.
            scoring_x, scoring_y = reference_window_x, reference_window_y
        else:
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(label_column, axis=1)

            current_window = list(chunk)
            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop(label_column, axis=1)
            current_window_y = current_window_df[label_column]

            scoring_x, scoring_y = current_window_x, current_window_y

        pred_no_change = random_forest_no_change.predict(scoring_x)
        accuracy_no_change = accuracy_score(scoring_y, pred_no_change)
        no_change_accuracy.append(accuracy_no_change)
        print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

        pred_distance = random_forest_distance.predict(scoring_x)
        accuracy_distance = accuracy_score(scoring_y, pred_distance)
        distance_accuracy.append(accuracy_distance)
        print("Prediction Accuracy - Distance: " + str(accuracy_distance))

        chunks.append(chunk_num)

        if chunk_num > 0:
            significant, distance_feature_detected = Check_Significant_Wasserstein_Distance(reference_window_x, current_window_x)

            if significant:
                # NOTE(review): `features` is not read anywhere in this cell.
                features = reference_window_x.columns.astype(int)

                # Runs against the model as it stands before the refit below.
                pi_feature_detected = Check_Significant_Permutation_Importance(random_forest_distance, reference_window_df, current_window_df)

                drifts_detected.append(chunk_num)
                print('Drift Detected at chunk ' + str(chunk_num))
                print("Updating Model")
                random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
                random_forest_distance.fit(current_window_x, current_window_y)

            # The chunk just scored becomes the reference for the next one.
            reference_window = list(chunk)
            current_window = []

        chunk = []
        chunk_num = chunk_num + 1

# elecNorm Dataset

In [None]:
# elecNorm dataset
# Chunked drift experiment. Two identically configured random forests are fit
# on the first full chunk: `random_forest_no_change` is never refit, while
# `random_forest_distance` is refit on the current chunk whenever the
# Wasserstein check reports a drifted feature. Both are scored on every full
# chunk and the scores are collected in the *_accuracy lists.
chunk_size = 1440
label_column = '8'  # label column, addressed by its stringified positional name
# Columns removed to obtain the feature matrix: the label plus column '0'.
# NOTE(review): the reason column '0' is excluded is not visible here - confirm.
non_feature_columns = [label_column, '0']
chunk_num = 0
chunk = []

reference_window = []
current_window = []

drifts_detected = []

chunks = []
no_change_accuracy = []
distance_accuracy = []

# newline='' is how the csv module asks for files to be opened for csv.reader.
with open('Datasets/5_elecNorm.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        chunk.append(row)
        if len(chunk) != chunk_size:
            continue
        # From here on `chunk` holds exactly chunk_size rows. Rows left over at
        # the end of the file never fill a chunk and are therefore not scored.

        if chunk_num == 0:
            reference_window = list(chunk)
            reference_window_df = pd.DataFrame(reference_window)
            # Default integer column names become strings so that columns can
            # be addressed by string labels.
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(non_feature_columns, axis=1)
            reference_window_y = reference_window_df[label_column]

            print("Train Initial Classifier")
            random_forest_no_change = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_no_change.fit(reference_window_x, reference_window_y)

            random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
            random_forest_distance.fit(reference_window_x, reference_window_y)

            # The first chunk is scored on the rows the models were just fit on.
            scoring_x, scoring_y = reference_window_x, reference_window_y
        else:
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(non_feature_columns, axis=1)

            current_window = list(chunk)
            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop(non_feature_columns, axis=1)
            current_window_y = current_window_df[label_column]

            scoring_x, scoring_y = current_window_x, current_window_y

        pred_no_change = random_forest_no_change.predict(scoring_x)
        accuracy_no_change = accuracy_score(scoring_y, pred_no_change)
        no_change_accuracy.append(accuracy_no_change)
        print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

        pred_distance = random_forest_distance.predict(scoring_x)
        accuracy_distance = accuracy_score(scoring_y, pred_distance)
        distance_accuracy.append(accuracy_distance)
        print("Prediction Accuracy - Distance: " + str(accuracy_distance))

        chunks.append(chunk_num)

        if chunk_num > 0:
            significant, distance_feature_detected = Check_Significant_Wasserstein_Distance(reference_window_x, current_window_x)

            if significant:
                drifts_detected.append(chunk_num)
                print('Drift Detected at chunk ' + str(chunk_num))
                print("Updating Model")
                random_forest_distance = RandomForestClassifier(n_estimators=20, random_state=42)
                random_forest_distance.fit(current_window_x, current_window_y)

            # The chunk just scored becomes the reference for the next one.
            reference_window = list(chunk)
            current_window = []

        chunk = []
        chunk_num = chunk_num + 1

# Phishing Dataset

In [None]:
# Phishing dataset: read the CSV in fixed-size chunks, score a never-retrained forest
# and a drift-aware forest on every full chunk, and refit the drift-aware forest when
# the Wasserstein check reports a significant shift between consecutive chunks.
# Column '46' is used as the target; every other column is a feature.
chunk_size = 1000
chunk_num = 0
chunk = []

reference_window = []
current_window = []

drifts_detected = []

chunks = []
no_change_accuracy = []
distance_accuracy = []

with open('Datasets/6_phishing.csv') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        chunk.append(row)
        if len(chunk) != chunk_size:
            # Keep buffering rows until a full chunk has been collected.
            continue

        if chunk_num == 0:
            # First full chunk: it is both the reference window and the training set.
            reference_window = list(chunk)
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(columns='46')
            reference_window_y = reference_window_df['46']

            print("Train Initial Classifier")
            random_forest_no_change = RandomForestClassifier(
                n_estimators=20, random_state=42
            ).fit(reference_window_x, reference_window_y)
            random_forest_distance = RandomForestClassifier(
                n_estimators=20, random_state=42
            ).fit(reference_window_x, reference_window_y)

            # Both models are scored on the data they were just trained on.
            pred_no_change = random_forest_no_change.predict(reference_window_x)
            accuracy_no_change = accuracy_score(reference_window_y, pred_no_change)
            no_change_accuracy.append(accuracy_no_change)
            print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

            pred_distance = random_forest_distance.predict(reference_window_x)
            accuracy_distance = accuracy_score(reference_window_y, pred_distance)
            distance_accuracy.append(accuracy_distance)
            print("Prediction Accuracy - Distance: " + str(accuracy_distance))
        else:
            # Reference = previous chunk, current = the chunk that was just completed.
            reference_window_df = pd.DataFrame(reference_window)
            reference_window_df.columns = reference_window_df.columns.astype(str)
            reference_window_x = reference_window_df.drop(columns='46')

            # current_window is always empty at this point, so this equals extend().
            current_window = list(chunk)
            current_window_df = pd.DataFrame(current_window)
            current_window_df.columns = current_window_df.columns.astype(str)
            current_window_x = current_window_df.drop(columns='46')
            current_window_y = current_window_df['46']

            # Score both models on the new chunk before any retraining happens.
            pred_no_change = random_forest_no_change.predict(current_window_x)
            accuracy_no_change = accuracy_score(current_window_y, pred_no_change)
            no_change_accuracy.append(accuracy_no_change)
            print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

            pred_distance = random_forest_distance.predict(current_window_x)
            accuracy_distance = accuracy_score(current_window_y, pred_distance)
            distance_accuracy.append(accuracy_distance)
            print("Prediction Accuracy - Distance: " + str(accuracy_distance))

            significant, distance_feature_detected = Check_Significant_Wasserstein_Distance(
                reference_window_x, current_window_x
            )

            if significant:
                drifts_detected.append(chunk_num)
                print('Drift Detected at chunk ' + str(chunk_num))
                print("Updating Model")
                random_forest_distance = RandomForestClassifier(
                    n_estimators=20, random_state=42
                ).fit(current_window_x, current_window_y)

            # Slide the windows: the chunk just processed becomes the next reference.
            reference_window = list(chunk)
            current_window = []

        # Bookkeeping shared by both branches.
        chunks.append(chunk_num)
        chunk = []
        chunk_num += 1

# Test Dataset

In [None]:
# SEA dataset: stream the CSV in fixed-size chunks, score a never-retrained forest and
# a drift-aware forest on every full chunk, and refit the drift-aware forest only when
# the Wasserstein check flags a significant shift between consecutive chunks.
# Column '3' is used as the target; every other column is a feature.
chunk_size = 1000
chunk_num = 0
chunk = []

reference_window = []
current_window = []

drifts_detected = []

chunks = []
no_change_accuracy = []
distance_accuracy = []

# with open('test_shuffled_4.csv') as csv_file:
with open('Datasets/7_SEA.csv') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        chunk.append(row)
        if (len(chunk) == chunk_size):
            if (chunk_num == 0):
                # First full chunk: it is both the reference window and the training set.
                reference_window.extend(chunk)
                reference_window_df = pd.DataFrame(reference_window)
                reference_window_df.columns = reference_window_df.columns.astype(str)

                reference_window_x = reference_window_df.drop('3', axis=1)
                reference_window_y = reference_window_df['3']

                print("Train Initial Classifier")
                random_forest_no_change = RandomForestClassifier(n_estimators=20,random_state=42)
                random_forest_no_change.fit(reference_window_x, reference_window_y)

                random_forest_distance = RandomForestClassifier(n_estimators=20,random_state=42)
                random_forest_distance.fit(reference_window_x, reference_window_y)

                # Both models are scored on the data they were just trained on.
                pred_no_change = random_forest_no_change.predict(reference_window_x)
                accuracy_no_change = accuracy_score(reference_window_y,pred_no_change)
                no_change_accuracy.append(accuracy_no_change)
                print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

                pred_distance = random_forest_distance.predict(reference_window_x)
                accuracy_distance = accuracy_score(reference_window_y, pred_distance)
                distance_accuracy.append(accuracy_distance)
                print("Prediction Accuracy - Distance: " + str(accuracy_distance))

                chunks.append(chunk_num)

                chunk = []
                chunk_num = chunk_num + 1
            else:
                # Reference = previous chunk, current = the chunk that was just completed.
                reference_window_df = pd.DataFrame(reference_window)
                reference_window_df.columns = reference_window_df.columns.astype(str)
                reference_window_x = reference_window_df.drop('3', axis=1)

                current_window.extend(chunk)
                current_window_df = pd.DataFrame(current_window)
                current_window_df.columns = current_window_df.columns.astype(str)
                current_window_x = current_window_df.drop('3', axis=1)
                current_window_y = current_window_df['3']

                # Score both models on the new chunk before any retraining happens.
                pred_no_change = random_forest_no_change.predict(current_window_x)
                accuracy_no_change = accuracy_score(current_window_y,pred_no_change)
                no_change_accuracy.append(accuracy_no_change)
                print("Prediction Accuracy - No Change: " + str(accuracy_no_change))

                pred_distance = random_forest_distance.predict(current_window_x)
                accuracy_distance = accuracy_score(current_window_y, pred_distance)
                distance_accuracy.append(accuracy_distance)
                print("Prediction Accuracy - Distance: " + str(accuracy_distance))

                chunks.append(chunk_num)

                # The check returns (flag, feature_names). Unpack it: testing the tuple
                # itself is always truthy and would report drift on every chunk.
                significant, distance_feature_detected = Check_Significant_Wasserstein_Distance(reference_window_x, current_window_x)

                if significant:
                    # Called for its printed diagnostics; the return value is discarded.
                    Check_Significant_Shap(random_forest_distance, reference_window_x, current_window_x)
                    drifts_detected.append(chunk_num)
                    print('Drift Detected at chunk ' + str(chunk_num))
                    print("Updating Model")
                    random_forest_distance = RandomForestClassifier(n_estimators=20,random_state=42)
                    random_forest_distance.fit(current_window_x,current_window_y)

                # Slide the windows: the chunk just processed becomes the next reference.
                reference_window = []
                reference_window.extend(chunk)
                current_window = []

                chunk = []
                chunk_num = chunk_num + 1

In [None]:
# Chunk numbers at which the Wasserstein check flagged drift in the run above.
print(drifts_detected)

In [None]:
# Accuracy per chunk for the never-retrained model vs. the drift-aware model.
# 'x' markers on the Distance curve are drawn at the indices listed in drifts_detected,
# which coincide with chunk numbers because chunks counts up from 0 in steps of 1.
fig, ax = plt.subplots()
ax.plot(chunks, no_change_accuracy, label='No Change')
ax.plot(chunks, distance_accuracy, '-bx', markevery=drifts_detected, label='Distance')
ax.set_xlabel('Chunk')
ax.set_ylabel('Prediction accuracy')
ax.set_title('Prediction accuracy per chunk (x = drift detected)')
ax.legend()

# Evaluation

In [None]:
# Dump the raw series behind the accuracy plot, one list per line.
for series in (chunks, no_change_accuracy, distance_accuracy, drifts_detected):
    print(series)

In [None]:
# Print the chunk indices, `features`, the drift chunks, and the feature names
# returned by the Wasserstein check.
print(chunks)
# NOTE(review): `features` is not assigned in any cell visible here — confirm an
# earlier cell defines it, otherwise this raises NameError on a fresh kernel.
print(features)
print(drifts_detected)
# Holds only the value from the most recent cell/iteration that assigned it.
print(distance_feature_detected)
# print(shap_feature_detected)
# print(eli5_feature_detected)
# print(pi_feature_detected)