In [None]:
import copy

import numpy as np
import pandas as pd
from river.drift import ADWIN
from river.drift.binary import DDM
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier


In [None]:
# Read the three SQL Server memory-monitoring CSV files, one per load level.
low_load, medium_load, high_load = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        './labeled_dataset/SQL Server/low-load-sqlserver-memory-monitoring.csv',
        './labeled_dataset/SQL Server/medium-load-sqlserver-memory-monitoring.csv',
        './labeled_dataset/SQL Server/high-load-sqlserver-memory-monitoring.csv',
    )
)

In [None]:
# Display the low-load frame as the cell output for a quick visual check.
low_load

In [None]:
# Drop the date/time and trend/seasonal/residual columns when present;
# errors='ignore' makes this cell safe to re-run.
low_load = low_load.drop(
    columns=['Date', 'Time', 'trend', 'seasonal', 'residual'],
    errors='ignore',
)


In [None]:
# Relative frequency of each Aging_Label value in the low-load data.
label_share = low_load['Aging_Label'].value_counts(normalize=True)
print(label_share)

In [None]:
# 5-fold cross-validation of a random forest on the low-load data.
# The seed is fixed so the shuffled fold assignment and the forest itself are
# identical on every Restart & Run All; without it every metric below (and the
# baseline `clf` reused by later cells) changes from run to run.
random_seed = 42

X = low_load.drop(columns=['Aging_Label'])
y = low_load['Aging_Label']

kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf = RandomForestClassifier(random_state=random_seed)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='binary')
    rec = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)

# NOTE: after the loop `clf` is the model fitted on the LAST fold's training
# split only; the later cells reuse this object as the baseline model.

print("Accuracies in each fold:", accuracies)
print("Precisions in each fold:", precisions)
print("Recalls in each fold:", recalls)
print("F1-scores in each fold:", f1_scores)

print("\nAverages:")
print("Average accuracy:", sum(accuracies)/len(accuracies))
print("Average precision:", sum(precisions)/len(precisions))
print("Average recall:", sum(recalls)/len(recalls))
print("Average F1-score:", sum(f1_scores)/len(f1_scores))


# Baseline Model Evaluation by Load

In [None]:
# Results table: one row per (model, evaluated load) combination.
results = pd.DataFrame(
    columns=['Model', 'Predicted Load', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
)

# Average each per-fold metric list from the cross-validation cell.
mean_accuracy, mean_precision, mean_recall, mean_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (accuracies, precisions, recalls, f1_scores)
)

low_load_row = [
    f"{type(clf).__name__}-Low Load",
    "Low",
    mean_accuracy,
    mean_precision,
    mean_recall,
    mean_f1,
]
results.loc[len(results)] = low_load_row


In [None]:
# Evaluate the already-fitted `clf` on the medium-load data.
medium_load = medium_load.drop(
    columns=['Date', 'Time', 'trend', 'seasonal', 'residual'],
    errors='ignore',
)

y2 = medium_load['Aging_Label']
# Keep exactly the training feature columns, in the order of X.
X2 = medium_load.drop(columns=['Aging_Label'], errors='ignore')[X.columns]

# No scaling is performed here; X2_scaled is simply another name for X2.
X2_scaled = X2

y2_pred = clf.predict(X2_scaled)

print("\nEvaluation on df2:")
for metric_label, metric_fn, metric_kwargs in (
    ("Accuracy:", accuracy_score, {}),
    ("Precision:", precision_score, {'average': 'binary'}),
    ("Recall:", recall_score, {'average': 'binary'}),
    ("F1-score:", f1_score, {'average': 'binary'}),
):
    print(metric_label, metric_fn(y2, y2_pred, **metric_kwargs))


In [None]:
# Record the medium-load evaluation of the baseline model in the results table.
mean_accuracy = accuracy_score(y2, y2_pred)
mean_precision = precision_score(y2, y2_pred, average='binary')
mean_recall = recall_score(y2, y2_pred, average='binary')
mean_f1 = f1_score(y2, y2_pred, average='binary')

results.loc[len(results)] = [
    # `clf` was fitted on low-load data, so the model label is "-Low Load" like
    # every other row of the table; the load it is evaluated on belongs in the
    # 'Predicted Load' column (previously this row was mislabeled "-Medium Load").
    f"{clf.__class__.__name__}-Low Load",
    "Medium",
    mean_accuracy,
    mean_precision,
    mean_recall,
    mean_f1
]


In [None]:

# Evaluate the already-fitted `clf` on the high-load data.
high_load = high_load.drop(
    columns=['Date', 'Time', 'trend', 'seasonal', 'residual'],
    errors='ignore',
)

y3 = high_load['Aging_Label']
# Keep exactly the training feature columns, in the order of X.
X3 = high_load.drop(columns=['Aging_Label'], errors='ignore')[X.columns]

# No scaling is performed here; X3_scaled is simply another name for X3.
X3_scaled = X3

y3_pred = clf.predict(X3_scaled)

print("\nAvaliação em df3:")
for metric_label, metric_fn, metric_kwargs in (
    ("Acurácia:", accuracy_score, {}),
    ("Precisão:", precision_score, {'average': 'binary'}),
    ("Revocação:", recall_score, {'average': 'binary'}),
    ("F1-score:", f1_score, {'average': 'binary'}),
):
    print(metric_label, metric_fn(y3, y3_pred, **metric_kwargs))

In [None]:
# Record the high-load evaluation of the baseline model in the results table.
mean_accuracy = accuracy_score(y3, y3_pred)
mean_precision = precision_score(y3, y3_pred, average='binary')
mean_recall = recall_score(y3, y3_pred, average='binary')
mean_f1 = f1_score(y3, y3_pred, average='binary')

high_load_row = [
    f"{type(clf).__name__}-Low Load",
    "High",
    mean_accuracy,
    mean_precision,
    mean_recall,
    mean_f1,
]
results.loc[len(results)] = high_load_row

In [None]:
# Show the results table collected so far as the cell output.
results

In [None]:
# Plain-text dump of the results table collected so far.
print(results)

# Sudden Drift Simulation

In [None]:
# Sudden drift stream: low-load rows with Elapsed_time <= 86145 followed by
# medium-load rows with Elapsed_time >= 86150, re-indexed from 0.
df_low_start = low_load.loc[low_load['Elapsed_time'] <= 86145]
df_medium_end = medium_load.loc[medium_load['Elapsed_time'] >= 86150]
df_low_medium = pd.concat([df_low_start, df_medium_end]).reset_index(drop=True)


## Baseline Model Evaluation in Sudden Drift Scenario (Low-Medium)

In [None]:
# Score the already-fitted `clf` on the low->medium sudden-drift stream.
y_low_medium = df_low_medium['Aging_Label']
# Keep exactly the training feature columns, in the order of X.
X_low_medium = df_low_medium.drop(columns=['Aging_Label'], errors='ignore')[X.columns]

y_low_medium_pred = clf.predict(X_low_medium)

print("Evaluation on Low-Medium Load Dataset:")
for metric_label, metric_fn, metric_kwargs in (
    ("Accuracy:", accuracy_score, {}),
    ("Precision:", precision_score, {'average': 'binary'}),
    ("Recall:", recall_score, {'average': 'binary'}),
    ("F1-score:", f1_score, {'average': 'binary'}),
):
    print(metric_label, metric_fn(y_low_medium, y_low_medium_pred, **metric_kwargs))


In [None]:
# Recompute the low->medium predictions (so this cell does not depend on the
# printing cell having run) and append the metrics to the results table.
y_low_medium = df_low_medium['Aging_Label']
X_low_medium = df_low_medium.drop(columns=['Aging_Label'], errors='ignore')[X.columns]

y_low_medium_pred = clf.predict(X_low_medium)

low_medium_row = [
    f"{type(clf).__name__}-Low Load",
    "Low-Medium",
    accuracy_score(y_low_medium, y_low_medium_pred),
    precision_score(y_low_medium, y_low_medium_pred, average='binary'),
    recall_score(y_low_medium, y_low_medium_pred, average='binary'),
    f1_score(y_low_medium, y_low_medium_pred, average='binary'),
]
results.loc[len(results)] = low_medium_row


In [None]:
# Sudden drift stream: low-load rows with Elapsed_time <= 86145 followed by
# high-load rows with Elapsed_time >= 86150, re-indexed from 0.
df_low_start = low_load.loc[low_load['Elapsed_time'] <= 86145]
df_high_end = high_load.loc[high_load['Elapsed_time'] >= 86150]
df_low_high = pd.concat([df_low_start, df_high_end]).reset_index(drop=True)


## Baseline Model Evaluation in Sudden Drift Scenario (Low-High)

In [None]:
# Score the already-fitted `clf` on the low->high sudden-drift stream.
y_low_high = df_low_high['Aging_Label']
# Keep exactly the training feature columns, in the order of X.
X_low_high = df_low_high.drop(columns=['Aging_Label'], errors='ignore')[X.columns]

y_low_high_pred = clf.predict(X_low_high)

print("Evaluation on Low-High Drift Dataset:")
for metric_label, metric_fn, metric_kwargs in (
    ("Accuracy:", accuracy_score, {}),
    ("Precision:", precision_score, {'average': 'binary'}),
    ("Recall:", recall_score, {'average': 'binary'}),
    ("F1-score:", f1_score, {'average': 'binary'}),
):
    print(metric_label, metric_fn(y_low_high, y_low_high_pred, **metric_kwargs))


In [None]:
# Append the low->high sudden-drift metrics of the baseline model.
low_high_row = [
    f"{type(clf).__name__}-Low Load",
    "Low-High",
    accuracy_score(y_low_high, y_low_high_pred),
    precision_score(y_low_high, y_low_high_pred, average='binary'),
    recall_score(y_low_high, y_low_high_pred, average='binary'),
    f1_score(y_low_high, y_low_high_pred, average='binary'),
]
results.loc[len(results)] = low_high_row

In [None]:
# Plain-text dump of the results table, now including the sudden-drift rows.
print(results)

# Gradual Drift Simulation

In [None]:
stable_period_size = 5000
transition_period_size = 10000
random_seed = 42

# Seed NumPy's global RNG so the random interleaving in the transition phases repeats.
np.random.seed(random_seed)

# One forward-only row iterator per load level; rows are consumed in file order.
iter_low, iter_medium, iter_high = (
    low_load.iterrows(),
    medium_load.iterrows(),
    high_load.iterrows(),
)

stream_rows = []

print("Starting creation of gradual drift stream (time series version)...")

try:
    print(f"Phase 1: Adding {stable_period_size} samples of low load...")
    for _ in range(stable_period_size):
        row_label, row = next(iter_low)
        stream_rows.append(row)

    print(f"Phase 2: Transition from Low -> Medium over {transition_period_size} samples...")
    for i in range(transition_period_size):
        # Chance of drawing the next medium-load row grows linearly from 0 towards 1.
        p_medium = i / transition_period_size
        source_iter = iter_medium if np.random.rand() < p_medium else iter_low
        row_label, row = next(source_iter)
        stream_rows.append(row)

    print(f"Phase 3: Adding {stable_period_size} samples of medium load...")
    for _ in range(stable_period_size):
        row_label, row = next(iter_medium)
        stream_rows.append(row)

    print(f"Phase 4: Transition from Medium -> High over {transition_period_size} samples...")
    for i in range(transition_period_size):
        # Chance of drawing the next high-load row grows linearly from 0 towards 1.
        p_high = i / transition_period_size
        source_iter = iter_high if np.random.rand() < p_high else iter_medium
        row_label, row = next(source_iter)
        stream_rows.append(row)

    print(f"Phase 5: Adding {stable_period_size} samples of high load...")
    for _ in range(stable_period_size):
        row_label, row = next(iter_high)
        stream_rows.append(row)

except StopIteration:
    # A source ran out of rows: stop building and keep whatever was collected so far.
    print("Warning: One of the load dataframes was fully consumed.")

df_gradual_drift_ts = pd.DataFrame(stream_rows).reset_index(drop=True)

# Overwrite Elapsed_time with an evenly spaced sequence 0, 5, 10, ...
df_gradual_drift_ts['Elapsed_time'] = np.arange(len(df_gradual_drift_ts)) * 5

print("\nGradual drift stream (time series) successfully created!")
print(f"Total dataframe size: {df_gradual_drift_ts.shape[0]} samples.")


## Baseline Model Evaluation in Gradual Drift Scenario

In [None]:
# Score the already-fitted `clf` on the gradual-drift stream.
y_gradual_ts = df_gradual_drift_ts['Aging_Label']
# Keep exactly the training feature columns, in the order of X.
X_gradual_ts = df_gradual_drift_ts.drop(columns=['Aging_Label'], errors='ignore')[X.columns]

y_gradual_ts_pred = clf.predict(X_gradual_ts)

print("\nBaseline Model Evaluation in Gradual Drift Scenario (Time Series):")
acc_gradual_ts = accuracy_score(y_gradual_ts, y_gradual_ts_pred)
prec_gradual_ts = precision_score(y_gradual_ts, y_gradual_ts_pred, average='binary')
rec_gradual_ts = recall_score(y_gradual_ts, y_gradual_ts_pred, average='binary')
f1_gradual_ts = f1_score(y_gradual_ts, y_gradual_ts_pred, average='binary')

for metric_label, metric_value in (
    ("Accuracy", acc_gradual_ts),
    ("Precision", prec_gradual_ts),
    ("Recall", rec_gradual_ts),
    ("F1-score", f1_gradual_ts),
):
    print(f"{metric_label}: {metric_value}")


In [None]:
# Append the gradual-drift metrics of the baseline model ("TS" stands for time series).
gradual_row = [
    f"{type(clf).__name__}-Low Load",
    "Gradual Drift (TS)",
    acc_gradual_ts,
    prec_gradual_ts,
    rec_gradual_ts,
    f1_gradual_ts,
]
results.loc[len(results)] = gradual_row

# Print the table with the new row included.
print("\n--- Updated Results Table ---")
print(results)

# Recurring Drift Simulation

In [None]:
period_length = 5800
num_cycles = 2
random_seed = 42
np.random.seed(random_seed)

# One forward-only row iterator per load level; each cycle continues where the
# previous cycle stopped in every source.
iter_low_rec = low_load.iterrows()
iter_medium_rec = medium_load.iterrows()
iter_high_rec = high_load.iterrows()

stream_rows_rec = []

print("Starting creation of the recurring drift stream...")

try:
    for cycle in range(num_cycles):
        print(f"Starting cycle {cycle + 1}/{num_cycles}...")

        # Within a cycle the loads always follow the order low -> medium -> high.
        for load_name, load_iter in (
            ("low", iter_low_rec),
            ("medium", iter_medium_rec),
            ("high", iter_high_rec),
        ):
            print(f"  Adding {period_length} samples of {load_name} load...")
            for _ in range(period_length):
                stream_rows_rec.append(next(load_iter)[1])

except StopIteration:
    # A source ran out of rows: stop building and keep whatever was collected so far.
    print("Warning: One of the load dataframes was fully consumed before completing all cycles.")

df_recurrent_drift = pd.DataFrame(stream_rows_rec).reset_index(drop=True)

# Overwrite Elapsed_time with an evenly spaced sequence 0, 5, 10, ...
df_recurrent_drift['Elapsed_time'] = np.arange(len(df_recurrent_drift)) * 5

print("\nRecurring drift stream successfully created!")
print(f"Total dataframe size: {df_recurrent_drift.shape[0]} samples.")


## Baseline Model Evaluation in Recurring Drift Scenario

In [None]:
# Score the already-fitted `clf` on the recurring-drift stream.
y_recurrent = df_recurrent_drift['Aging_Label']
# Keep exactly the training feature columns, in the order of X.
X_recurrent = df_recurrent_drift.drop(columns=['Aging_Label'], errors='ignore')[X.columns]

y_recurrent_pred = clf.predict(X_recurrent)

print("\nBaseline Model Evaluation in Recurring Drift Scenario:")
acc_recurrent = accuracy_score(y_recurrent, y_recurrent_pred)
prec_recurrent = precision_score(y_recurrent, y_recurrent_pred, average='binary')
rec_recurrent = recall_score(y_recurrent, y_recurrent_pred, average='binary')
f1_recurrent = f1_score(y_recurrent, y_recurrent_pred, average='binary')

for metric_label, metric_value in (
    ("Accuracy", acc_recurrent),
    ("Precision", prec_recurrent),
    ("Recall", rec_recurrent),
    ("F1-score", f1_recurrent),
):
    print(f"{metric_label}: {metric_value}")


In [None]:
# Append the recurring-drift metrics of the baseline model.
recurring_row = [
    f"{type(clf).__name__}-Low Load",
    "Recurring Drift",
    acc_recurrent,
    prec_recurrent,
    rec_recurrent,
    f1_recurrent,
]
results.loc[len(results)] = recurring_row

print("\n--- Final Results Table ---")
print(results)


In [None]:
def run_stream_with_ddm_refit(base_model, df_stream, drift_type, label_col='Aging_Label', window_size=2000):
    """Replay ``df_stream`` row by row, refitting the model whenever DDM signals drift.

    For every row the current model predicts first (test-then-train), the 0/1
    prediction error is fed to a DDM detector, and on a drift signal the model is
    refitted on the most recent ``window_size`` rows (current row included) and
    the detector is replaced by a fresh instance.

    The simulation runs on a deep copy of ``base_model``: the caller's estimator
    is never refitted, so several scenarios can be started from the same fitted
    baseline without one run influencing the next.

    Side effect: appends one row to the module-level ``results`` DataFrame.

    Parameters
    ----------
    base_model : fitted estimator exposing ``predict`` and ``fit``
        Starting model; it is copied, not modified.
    df_stream : pandas.DataFrame
        Stream to replay; must contain ``label_col`` and the feature columns.
    drift_type : str
        Scenario name, stored in the results row as ``"Drift {drift_type}"``.
    label_col : str, default 'Aging_Label'
        Name of the target column.
    window_size : int, default 2000
        Maximum number of most recent rows used for a refit.

    Returns
    -------
    list
        ``[model, results]``: the adapted copy of the model and the module-level
        results table.
    """
    # Work on a private copy so refits do not leak back into the caller's model.
    model = copy.deepcopy(base_model)

    X_stream = df_stream.drop(columns=[label_col])
    y_stream = df_stream[label_col]

    # If the model recorded the column names it was fitted with (scikit-learn does
    # when fitted on a DataFrame), present exactly those columns in that order.
    fitted_columns = getattr(model, 'feature_names_in_', None)
    if fitted_columns is not None:
        X_stream = X_stream[list(fitted_columns)]

    ddm = DDM()

    y_preds = []
    drift_occurrences = []

    print("Starting data stream simulation with drift detection...")

    for i in range(len(df_stream)):
        current_x = X_stream.iloc[[i]]
        true_y = y_stream.iloc[i]

        # Predict before the true label is used for anything.
        y_pred = model.predict(current_x)[0]
        y_preds.append(y_pred)

        ddm.update(int(y_pred != true_y))

        if ddm.drift_detected:
            print(f"\n--- Drift Detected at index: {i} ---")
            drift_occurrences.append(i)

            # Most recent `window_size` rows, ending at (and including) row i.
            window_start = max(0, i + 1 - window_size)
            X_window = X_stream.iloc[window_start:i + 1]
            y_window = y_stream.iloc[window_start:i + 1]

            if len(np.unique(y_window)) > 1:
                model.fit(X_window, y_window)
                print(f"Re-training with {len(X_window)} samples. DDM reset.")
            else:
                # A single-class window cannot train a binary classifier.
                print(f"Data window at index {i} contains only one class. Skipping re-training.")
                print("DDM reset.")

            ddm = DDM()

    acc = accuracy_score(y_stream, y_preds)
    f1 = f1_score(y_stream, y_preds, average='binary')
    prec = precision_score(y_stream, y_preds, average='binary')
    rec = recall_score(y_stream, y_preds, average='binary')

    print("\n--- Simulation Completed ---")
    print(f"Total drifts detected: {len(drift_occurrences)}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")

    results.loc[len(results)] = [
        f"{model.__class__.__name__} - DDM - re_fit - Low Load",
        f"Drift {drift_type}",
        acc,
        prec,
        rec,
        f1
    ]

    return [model, results]


In [None]:
def run_stream_with_adwin_refit(base_model, df_stream, drift_type, label_col='Aging_Label', window_size=2000):
    """Replay ``df_stream`` row by row, refitting the model whenever ADWIN signals drift.

    For every row the current model predicts first (test-then-train), the 0/1
    prediction error is fed to an ADWIN detector, and on a drift signal the model
    is refitted on the most recent ``window_size`` rows (current row included)
    and the detector is replaced by a fresh instance.

    The simulation runs on a deep copy of ``base_model``: the caller's estimator
    is never refitted, so several scenarios can be started from the same fitted
    baseline without one run influencing the next.

    Side effect: appends one row to the module-level ``results`` DataFrame.

    Parameters
    ----------
    base_model : fitted estimator exposing ``predict`` and ``fit``
        Starting model; it is copied, not modified.
    df_stream : pandas.DataFrame
        Stream to replay; must contain ``label_col`` and the feature columns.
    drift_type : str
        Scenario name, stored in the results row as ``"Drift {drift_type}"``.
    label_col : str, default 'Aging_Label'
        Name of the target column.
    window_size : int, default 2000
        Maximum number of most recent rows used for a refit.

    Returns
    -------
    list
        ``[model, results]``: the adapted copy of the model and the module-level
        results table.
    """
    # Work on a private copy so refits do not leak back into the caller's model.
    model = copy.deepcopy(base_model)

    X_stream = df_stream.drop(columns=[label_col])
    y_stream = df_stream[label_col]

    # If the model recorded the column names it was fitted with (scikit-learn does
    # when fitted on a DataFrame), present exactly those columns in that order.
    fitted_columns = getattr(model, 'feature_names_in_', None)
    if fitted_columns is not None:
        X_stream = X_stream[list(fitted_columns)]

    adwin = ADWIN()

    y_preds = []
    drift_occurrences = []

    print(f"Starting data stream simulation with ADWIN for '{drift_type}' drift...")

    for i in range(len(df_stream)):
        current_x = X_stream.iloc[[i]]
        true_y = y_stream.iloc[i]

        # Predict before the true label is used for anything.
        y_pred = model.predict(current_x)[0]
        y_preds.append(y_pred)

        adwin.update(int(y_pred != true_y))

        if adwin.drift_detected:
            print(f"\n--- Drift Detected (ADWIN) at index: {i} ---")
            drift_occurrences.append(i)

            # Most recent `window_size` rows, ending at (and including) row i.
            window_start = max(0, i + 1 - window_size)
            X_window = X_stream.iloc[window_start:i + 1]
            y_window = y_stream.iloc[window_start:i + 1]

            if len(np.unique(y_window)) > 1:
                print(f"Re-training the model with the last {len(X_window)} samples...")
                model.fit(X_window, y_window)
            else:
                # A single-class window cannot train a binary classifier.
                print(f"Data window at index {i} contains only one class. Skipping re-training.")

            adwin = ADWIN()
            print("ADWIN detector reset.")

    acc = accuracy_score(y_stream, y_preds)
    f1 = f1_score(y_stream, y_preds, average='binary')
    prec = precision_score(y_stream, y_preds, average='binary')
    rec = recall_score(y_stream, y_preds, average='binary')

    print("\n--- Simulation Completed (ADWIN) ---")
    print(f"Total drifts detected: {len(drift_occurrences)}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")

    results.loc[len(results)] = [
        f"{model.__class__.__name__} - ADWIN - re_fit - Low Load",
        f"Drift {drift_type}",
        acc,
        prec,
        rec,
        f1
    ]

    return [model, results]


In [None]:
# DDM + refit on every drift scenario.
# Each run receives its own deep copy of the baseline `clf`, so a refit triggered
# in one scenario cannot carry over into the next one (or into later cells).
new_model, results = run_stream_with_ddm_refit(copy.deepcopy(clf), df_gradual_drift_ts, drift_type='Gradual', label_col='Aging_Label', window_size=2000)
new_model, results = run_stream_with_ddm_refit(copy.deepcopy(clf), df_recurrent_drift, drift_type='Recurrent', label_col='Aging_Label', window_size=2000)
new_model, results = run_stream_with_ddm_refit(copy.deepcopy(clf), df_low_high, drift_type='Sudden Drift (Low-High)', label_col='Aging_Label', window_size=2000)
new_model, results = run_stream_with_ddm_refit(copy.deepcopy(clf), df_low_medium, drift_type='Sudden Drift (Low-Medium)', label_col='Aging_Label', window_size=2000)


In [None]:
# ADWIN + refit on every drift scenario.
# Each run receives its own deep copy of the baseline `clf`, so a refit triggered
# in one scenario cannot carry over into the next one. The function appends its
# metrics to `results` itself, so the return values are not needed here.
run_stream_with_adwin_refit(copy.deepcopy(clf), df_gradual_drift_ts, drift_type='Gradual', label_col='Aging_Label')
run_stream_with_adwin_refit(copy.deepcopy(clf), df_recurrent_drift, drift_type='Recurrent', label_col='Aging_Label')
run_stream_with_adwin_refit(copy.deepcopy(clf), df_low_high, drift_type='Sudden Drift (Low-High)', label_col='Aging_Label')
# Trailing semicolon keeps the returned [model, results] list out of the cell output.
run_stream_with_adwin_refit(copy.deepcopy(clf), df_low_medium, drift_type='Sudden Drift (Low-Medium)', label_col='Aging_Label');


In [None]:
# Show the final results table as the cell output.
results

In [None]:
# Plain-text dump of the final results table.
print(results)