# In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import deque
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# In [3]:
# Parse the transactions CSV; `data` holds the parsed file and `df` is the
# DataFrame the rest of the notebook works with.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')
df = pd.DataFrame(data=data)


# In [4]:
def create_datastream_by_time(data, time_interval):
    """Split *data* into consecutive blocks of equal width on its ``Time`` column.

    Block ``n`` holds the rows whose ``Time`` lies in
    ``[start + n * time_interval, start + (n + 1) * time_interval)``, where
    ``start`` is the smallest ``Time`` value truncated to an integer.
    Intervals that contain no rows produce no block, and rows with a missing
    ``Time`` are left out.

    Args:
        data: DataFrame with a numeric ``Time`` column.
        time_interval: Width of one block, in the units of ``Time``.
            Must be positive; it does not have to be an integer.

    Returns:
        A list of DataFrames ordered by time. Each block keeps the original
        index and the original row order of *data*.

    Raises:
        ValueError: If ``time_interval`` is not positive.
    """
    if time_interval <= 0:
        raise ValueError("time_interval must be a positive number")

    times = data['Time']
    # Nothing to split: avoids int(nan) on an empty or all-missing column.
    if data.empty or not times.notna().any():
        return []

    start_time = int(times.min())
    # One pass over the data: number every row with the block it falls into.
    # A plain array (not a Series) is used as the key so that grouping is
    # positional and never re-aligned on the index.
    block_ids = ((times - start_time) // time_interval).to_numpy()
    return [block for _, block in data.groupby(block_ids, sort=True)]
# Cut the full dataset into blocks that each span 3000 units of the `Time`
# column; `S` is the resulting list of blocks (the data stream).
time_interval = 3000
S = create_datastream_by_time(data=df, time_interval=time_interval)

# In [None]:
class DataStreamProcessor:
    """Block-wise random-forest ensemble for a two-class, imbalanced stream.

    Every block trains one random forest, which joins a sliding window of at
    most ``m`` classifiers. Until ``k`` classifiers exist the next block is
    only enlarged with buffered class-1 rows. Afterwards each next block is
    first predicted with the ensemble (test-then-train) and then enlarged with
    the buffered rows and SMOTE samples before it is used for training.

    Blocks are DataFrames whose last column is the label. That column has to
    be named ``'Class'`` and the buffered class is the one labelled ``1``.

    Args:
        m: Maximum number of classifiers kept; older ones are discarded.
        k: Number of classifiers required before prediction starts. Because
            the window never holds more than ``m`` classifiers, nothing is
            predicted when ``k`` is larger than ``m``.
        unique_labels: The two class labels. Index 1 is the label favoured by
            the decision rule in ``_tendency_prediction``.
        minority_buffer_size: Number of most recent class-1 rows that are
            kept for re-injection into later blocks.
        smote_ratio: ``sampling_strategy`` handed to SMOTE.
        reference_block_size: Numerator of the per-block tolerance
            ``(reference_block_size / len(block)) * alpha_factor``.
        alpha_factor: Factor of the per-block tolerance (see above).
        smote_random_state: Seed handed to SMOTE so that runs are repeatable;
            pass ``None`` for unseeded resampling.
    """

    def __init__(self, m, k, unique_labels, minority_buffer_size=40,
                 smote_ratio=0.15, reference_block_size=3000,
                 alpha_factor=0.2, smote_random_state=42):
        self.m = m
        self.k = k
        self.unique_labels = unique_labels
        self.minority_buffer_size = minority_buffer_size
        self.smote_ratio = smote_ratio
        self.reference_block_size = reference_block_size
        self.alpha_factor = alpha_factor
        self.smote_random_state = smote_random_state
        self.C = deque(maxlen=m)  # classifier window, oldest first
        self.pre = []  # one list of predictions per evaluated block
        self.true_labels = []  # one list of true labels per evaluated block
        self.class_1_samples = pd.DataFrame()  # most recent class-1 rows

    def process_data_stream(self, S):
        """Train on every block and predict each following block.

        Args:
            S: Sequence of blocks (DataFrames). The sequence itself is not
                modified; augmented blocks are kept in an internal copy.

        Returns:
            Tuple ``(C, pre, true_labels)``: the classifier window, the
            per-block predictions and the per-block true labels.
        """
        # Shallow copy: augmented blocks replace entries of this list only,
        # so the caller's stream can be processed again unchanged.
        blocks = list(S)
        for i in range(len(blocks) - 1):
            Bi = blocks[i]
            self._update_class_1_samples(Bi)
            classifier = self._train_random_forest_classifier(
                Bi.iloc[:, :-1], Bi.iloc[:, -1]
            )
            self.C.append(classifier)

            if len(self.C) < self.k:
                # Warm-up: too few classifiers to predict, so only enrich
                # the next training block with the buffered class-1 rows.
                blocks[i + 1] = self._with_minority_samples(blocks[i + 1])
                continue

            # Predict the untouched next block before it is resampled.
            Bi_1 = blocks[i + 1]
            self.pre.append(self._predict_block(Bi_1))
            self.true_labels.append(list(Bi_1.iloc[:, -1]))
            self._update_next_block(blocks, i)
        return self.C, self.pre, self.true_labels

    def _update_class_1_samples(self, Bi):
        """Add the class-1 rows of *Bi* to the buffer and keep the newest ones."""
        class_1_new_samples = Bi[Bi['Class'] == 1]
        # Empty frames are left out of the concatenation; if both are empty
        # the buffer simply stays empty.
        parts = [
            frame
            for frame in (self.class_1_samples, class_1_new_samples)
            if not frame.empty
        ]
        if parts:
            merged = pd.concat(parts, ignore_index=True)
            self.class_1_samples = merged.tail(self.minority_buffer_size)

    def _with_minority_samples(self, block):
        """Return *block* followed by the buffered class-1 rows, re-indexed."""
        if self.class_1_samples.empty:
            return block.reset_index(drop=True)
        return pd.concat([block, self.class_1_samples], ignore_index=True)

    def _update_next_block(self, S, i):
        """Replace ``S[i + 1]`` by its augmented, SMOTE-resampled version.

        The buffered class-1 rows are appended first, then SMOTE is applied
        with ``sampling_strategy=self.smote_ratio``. ``S`` is modified in
        place. Errors raised by SMOTE are propagated unchanged.
        """
        updated_block = self._with_minority_samples(S[i + 1])

        X = updated_block.drop(columns=['Class'])
        y = updated_block['Class']

        smote = SMOTE(
            sampling_strategy=self.smote_ratio,
            random_state=self.smote_random_state,
        )
        X_resampled, y_resampled = smote.fit_resample(X, y)
        # Features first, label last: the rest of the class reads the label
        # from the last column.
        S[i + 1] = pd.concat([X_resampled, y_resampled], axis=1)

    def _predict_block(self, Bi):
        """Predict a label for every row of *Bi* (last column = label).

        Returns:
            List with one predicted label per row, in row order.
        """
        if Bi.empty:
            return []
        # Tolerance used when selecting classifiers; it shrinks as the block
        # grows.
        anpha = (self.reference_block_size / len(Bi)) * self.alpha_factor
        # One predict_proba call per classifier for the whole block instead
        # of one call per row and classifier.
        probabilities = self._aligned_probabilities(Bi.iloc[:, :-1])
        block_predictions = []
        for row in range(probabilities.shape[1]):
            selected = self._select_stable_probabilities(
                probabilities[:, row, :], anpha
            )
            block_predictions.append(self._tendency_prediction(selected))
        return block_predictions

    def _train_random_forest_classifier(self, X, y, n_estimators=70, max_depth=None, min_samples_split=2, min_samples_leaf=1, class_weight="balanced", random_state=42):
        """Fit and return a ``RandomForestClassifier`` on ``(X, y)``.

        All keyword arguments are forwarded unchanged to the classifier.
        """
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            class_weight=class_weight,
            random_state=random_state
        )
        rf.fit(X, y)
        return rf

    def _aligned_probabilities(self, X):
        """Return class probabilities of every classifier for every row of *X*.

        Returns:
            Array of shape ``(len(self.C), len(X), len(self.unique_labels))``.
            Entry ``[c, r, j]`` is the probability classifier ``c`` gives row
            ``r`` for ``self.unique_labels[j]``; it is 0 when that classifier
            was not trained on the label.
        """
        aligned = np.zeros((len(self.C), len(X), len(self.unique_labels)))
        column_of = {label: pos for pos, label in enumerate(self.unique_labels)}
        for clf_pos, clf in enumerate(self.C):
            proba = np.asarray(clf.predict_proba(X))
            # Use this classifier's own classes_: a forest trained on a
            # block with fewer classes has fewer probability columns.
            for source_col, label in enumerate(clf.classes_):
                target_col = column_of.get(label)
                if target_col is not None:
                    aligned[clf_pos, :, target_col] = proba[:, source_col]
        return aligned

    def _select_stable_probabilities(self, per_classifier, anpha, min_num=3):
        """Pick, per label, the leading run of mutually consistent probabilities.

        Args:
            per_classifier: Array of shape ``(n_classifiers, n_labels)`` for
                one sample, classifiers ordered oldest first.
            anpha: Maximum allowed deviation from the reference probability.
            min_num: Number of leading classifiers that are always kept.

        Returns:
            One list of probabilities per label, in ``unique_labels`` order.
        """
        probability_list = []
        for label_pos in range(per_classifier.shape[1]):
            li = []
            reference = None
            for current_proba in per_classifier[:, label_pos].tolist():
                if len(li) < min_num:
                    li.append(current_proba)
                    continue
                # The first probability after the mandatory prefix becomes
                # the reference; selection stops at the first classifier
                # that deviates from it by anpha or more.
                # NOTE(review): the reference is never updated - confirm
                # that comparing against this fixed value is intended.
                if reference is None:
                    reference = current_proba
                if abs(current_proba - reference) < anpha:
                    li.append(current_proba)
                else:
                    break
            probability_list.append(li)
        return probability_list

    def _adaptive_ensemble_size(self, sample, anpha, min_num=3):
        """Select the per-label probabilities used to classify one sample.

        Args:
            sample: Feature rows; only the first row is used.
            anpha: Maximum allowed deviation from the reference probability.
            min_num: Number of leading classifiers that are always kept.

        Returns:
            One list of probabilities per label, in ``unique_labels`` order.
        """
        probabilities = self._aligned_probabilities(sample)
        return self._select_stable_probabilities(
            probabilities[:, 0, :], anpha, min_num
        )

    def _tendency_prediction(self, probability_list, epsilon=0.01):
        """Turn the selected probabilities into a predicted label.

        For each label a straight line is fitted through its probabilities,
        one extrapolated value is put in front of them and a weighted mean is
        taken with weights ``1 - position * epsilon`` (position starts at 1).

        Args:
            probability_list: One list of probabilities per label, in
                ``unique_labels`` order; two labels are required.
            epsilon: Weight decrease per list position.

        Returns:
            ``unique_labels[1]`` unless the score of ``unique_labels[0]``
            exceeds the score of ``unique_labels[1]`` by at least 0.5;
            otherwise the label with the highest score.
        """
        predicted_probabilities = []
        for li in probability_list:
            x = np.arange(1, len(li) + 1)
            slope, intercept = self._linear_regression(x, np.array(li))
            # NOTE(review): x runs from 1 to len(li), so the next position
            # would be len(li) + 1; the extrapolation uses len(li) + 2.
            # Kept as is - confirm the intended horizon.
            next_value = slope * (len(li) + 2) + intercept
            extended = [next_value] + list(li)
            weights = 1 - np.arange(1, len(extended) + 1) * epsilon
            predicted_probabilities.append(
                np.dot(extended, weights) / len(extended)
            )
        # Decision rule biased towards unique_labels[1] by a margin of 0.5.
        if predicted_probabilities[0] < predicted_probabilities[1] + 0.5:
            return self.unique_labels[1]
        return self.unique_labels[np.argmax(predicted_probabilities)]

    def _linear_regression(self, x, y):
        """Least-squares fit of ``y = m * x + c``; returns ``(m, c)``."""
        A = np.vstack([x, np.ones(len(x))]).T
        m, c = np.linalg.lstsq(A, y, rcond=None)[0]
        return m, c

    def compute_metrics(self):
        """Compute metrics over all evaluated blocks taken together.

        Returns:
            Tuple ``(precision, recall, f1, accuracy)`` with label ``1`` as
            the positive class, or four ``None`` values when nothing has
            been predicted yet.
        """
        all_predictions = [pred for block in self.pre for pred in block]
        all_true_labels = [label for block in self.true_labels for label in block]
        if not all_predictions or not all_true_labels:
            return None, None, None, None
        precision = precision_score(all_true_labels, all_predictions, average='binary', pos_label=1)
        recall = recall_score(all_true_labels, all_predictions, average='binary', pos_label=1)
        f1 = f1_score(all_true_labels, all_predictions, average='binary', pos_label=1)
        accuracy = accuracy_score(all_true_labels, all_predictions)
        return precision, recall, f1, accuracy

    def block_f1_scores(self):
        """Return the F1 score (positive label ``1``) of every evaluated block.

        Blocks for which F1 is undefined are reported as 0.
        """
        return [
            f1_score(truth, predicted, average='binary', pos_label=1, zero_division=0)
            for truth, predicted in zip(self.true_labels, self.pre)
        ]

    def plot_block_f1_scores(self):
        """Plot the per-block F1 scores in evaluation order."""
        scores = self.block_f1_scores()
        if not scores:
            print("No evaluated blocks available to plot.")
            return
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, len(scores) + 1), scores, marker='o')
        plt.xlabel('Evaluated block')
        plt.ylabel('F1 score')
        plt.title('F1 score per block')
        plt.grid(True)
        plt.show()
# Example usage: evaluate the ensemble over the stream and report the metrics.
m = 15  # maximum number of classifiers kept by the processor
k = 3   # number of classifiers needed before prediction starts
# The processor indexes the labels by position (index 1 is the favoured
# label), so sort them instead of relying on set iteration order.
unique_labels = sorted(set(df.iloc[:, -1]))
processor = DataStreamProcessor(m, k, unique_labels)
C, pre, true_labels = processor.process_data_stream(S)
precision, recall, f1, accuracy = processor.compute_metrics()

if precision is not None:
    print("Total Metrics:")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  Accuracy: {accuracy:.4f}")
else:
    print("No predictions or true labels available to compute metrics.")

# Only plot when the processor actually provides the method; calling a
# missing method would end the run with an AttributeError.
plot_block_f1_scores = getattr(processor, "plot_block_f1_scores", None)
if callable(plot_block_f1_scores):
    plot_block_f1_scores()
else:
    print("plot_block_f1_scores() is not available; skipping the per-block F1 plot.")