In [56]:
import numpy as np
import pandas as pd

def c(size):
    """
    Return the path-length normalisation term for a node holding `size` rows.

    The ensemble adds this value to the depth at which a row reaches a leaf
    (using the leaf's size) and also divides the average path length by
    ``c(sample_size)`` when turning it into an anomaly score.

    * ``size == 2``  -> 1
    * ``size > 2``   -> ``2 * (ln(size - 1) + 0.5772156649) - 2 * (size - 1) / size``
      (0.5772156649 approximates the Euler-Mascheroni constant)
    * anything else  -> 0
    """
    if size == 2:
        return 1
    if not size > 2:
        return 0
    harmonic_estimate = np.log(size - 1) + 0.5772156649
    return 2 * harmonic_estimate - 2 * (size - 1) / size

class LeafNode:
    """
    Terminal node of an isolation tree.

    size -- number of rows that ended up in this leaf
    data -- the rows themselves, as passed in by the tree builder
    """

    def __init__(self, size, data):
        self.size, self.data = size, data

class DecisionNode:
    """
    Internal node of an isolation tree.

    Rows whose value in column `splitAtt` is strictly below `splitVal` belong
    to `left`; all other rows belong to `right`.
    """

    def __init__(self, left, right, splitAtt, splitVal):
        self.left, self.right = left, right
        self.splitAtt, self.splitVal = splitAtt, splitVal

class IsolationTree:
    """
    One randomly grown isolation tree.

    height       -- depth of this (sub)tree's root inside the overall tree
    height_limit -- depth at which splitting stops

    After `fit`:
    root    -- LeafNode or DecisionNode at the top of this (sub)tree
    n_nodes -- total number of nodes (decision + leaf) below and including root
    """

    def __init__(self, height, height_limit):
        self.height = height
        self.height_limit = height_limit

    def fit(self, X: np.ndarray, improved=False):
        """
        Given a 2D matrix of observations, create an isolation tree. Set field
        self.root to the root of that tree and return it.

        Splitting stops (a LeafNode is produced) once the height limit is
        reached or at most two rows remain.  `self.n_nodes` is always set,
        including when the root itself is a leaf.

        `improved` is accepted for interface compatibility and is not used.
        """
        n_rows = X.shape[0]
        if self.height >= self.height_limit or n_rows <= 2:
            self.root = LeafNode(n_rows, X)
            # A lone leaf is a one-node tree; set this so n_nodes always exists.
            self.n_nodes = 1
            return self.root

        # Choose a random split attribute and a random value within its range
        splitAtt = np.random.randint(0, X.shape[1])
        column = X[:, splitAtt]
        splitVal = np.random.uniform(column.min(), column.max())

        X_left = X[column < splitVal]
        X_right = X[column >= splitVal]

        left = IsolationTree(self.height + 1, self.height_limit)
        right = IsolationTree(self.height + 1, self.height_limit)
        left.fit(X_left)
        right.fit(X_right)
        self.root = DecisionNode(left.root, right.root, splitAtt, splitVal)
        # Children already know their own size, so no re-traversal is needed.
        self.n_nodes = 1 + left.n_nodes + right.n_nodes
        return self.root

    def count_nodes(self, root):
        """Return the number of nodes in the tree rooted at `root` (iterative walk)."""
        count = 0
        stack = [root]
        while stack:
            node = stack.pop()
            count += 1
            if isinstance(node, DecisionNode):
                stack.append(node.right)
                stack.append(node.left)
        return count

class IsolationTreeEnsemble:
    """
    A collection of IsolationTree objects used to score observations.

    sample_size -- number of rows drawn (with replacement) to build each tree;
                   also used to normalise path lengths in `anomaly_score`
    n_trees     -- number of trees to build; replaced by ``len(trees)`` when
                   pretrained trees are supplied
    trees       -- optional list of already fitted trees to reuse
    all_trees   -- separate list holding the supplied trees plus any trees
                   built by `fit`
    """

    def __init__(self, sample_size, n_trees=10, trees=None):
        self.sample_size = sample_size
        # If pretrained trees are provided, use them and update n_trees accordingly
        if trees is not None:
            self.trees = trees
            self.n_trees = len(trees)
        else:
            self.n_trees = n_trees
            self.trees = []
        self.all_trees = self.trees[:]  # Initialize with any given pretrained trees

    def fit(self, X: np.ndarray, improved=False):
        """
        Given a 2D matrix of observations, create an ensemble of IsolationTree
        objects and store them in a list: self.trees.  Convert DataFrames to
        ndarray objects.

        If the ensemble already holds trees, nothing is trained and `self` is
        returned unchanged.  `improved` is accepted but not used.
        """
        # If pretrained trees are used, skip training new trees
        if self.trees:
            print("Using pretrained trees")
            return self

        if isinstance(X, pd.DataFrame):
            X = X.values
        n_rows = X.shape[0]
        height_limit = np.ceil(np.log2(self.sample_size))
        for _ in range(self.n_trees):
            # Row indices are drawn with replacement
            data_index = np.random.randint(0, n_rows, self.sample_size)
            tree = IsolationTree(0, height_limit)
            tree.fit(X[data_index])
            self.trees.append(tree)
            self.all_trees.append(tree)
        return self

    @staticmethod
    def _path_length_in_tree(tree, row):
        """Depth at which `row` reaches a leaf of `tree`, plus c(leaf size)."""
        node = tree.root
        length = 0
        while isinstance(node, DecisionNode):
            node = node.left if row[node.splitAtt] < node.splitVal else node.right
            length += 1
        return length + c(node.size)

    def path_length(self, X: np.ndarray) -> np.ndarray:
        """
        Given a 2D matrix of observations, X, compute the average path length
        for each observation in X.

        DataFrames are converted to ndarrays first; iterating a DataFrame
        directly would yield column labels instead of rows.  An empty X
        returns an empty array.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        lengths = np.empty((len(X), len(self.trees)), dtype=float)
        for i, row in enumerate(X):
            for j, tree in enumerate(self.trees):
                lengths[i, j] = self._path_length_in_tree(tree, row)
        return lengths.mean(axis=1)

    def anomaly_score(self, X: pd.DataFrame) -> np.ndarray:
        """
        Given a 2D matrix of observations, X, compute the anomaly score
        for each x_i observation, returning an ndarray of them.

        The score is ``2 ** (-average_path_length / c(sample_size))``.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        avg_length = self.path_length(X)
        return np.power(2.0, -avg_length / c(self.sample_size))

    def predict_from_anomaly_scores(self, scores: np.ndarray, threshold: float) -> np.ndarray:
        """
        Given an array of scores and a score threshold, return an array of
        the predictions: 1 for any score >= the threshold and 0 otherwise.
        """
        return (np.asarray(scores) >= threshold).astype(int)

    def predict(self, X: np.ndarray, threshold: float) -> np.ndarray:
        "A shorthand for calling anomaly_score() and predict_from_anomaly_scores()."
        scores = self.anomaly_score(X)
        return self.predict_from_anomaly_scores(scores, threshold)


In [57]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part1.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees1 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part1.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [59]:
# Show the list of fitted trees from part 1, then how many there are
print(all_built_trees1, len(all_built_trees1), sep="\n")

[<__main__.IsolationTree object at 0x7f67d4bbfd90>, <__main__.IsolationTree object at 0x7f67d4c4e550>, <__main__.IsolationTree object at 0x7f67ceeb2ac0>, <__main__.IsolationTree object at 0x7f67d72ad2e0>, <__main__.IsolationTree object at 0x7f67d56ab7f0>, <__main__.IsolationTree object at 0x7f67d6d87460>, <__main__.IsolationTree object at 0x7f67cff81040>, <__main__.IsolationTree object at 0x7f67d6d5b520>, <__main__.IsolationTree object at 0x7f67c1a13520>, <__main__.IsolationTree object at 0x7f67c1a45040>, <__main__.IsolationTree object at 0x7f67d70c73a0>, <__main__.IsolationTree object at 0x7f67d70cbdc0>, <__main__.IsolationTree object at 0x7f67d6d203a0>, <__main__.IsolationTree object at 0x7f67d718c1c0>, <__main__.IsolationTree object at 0x7f67d6eb4e20>, <__main__.IsolationTree object at 0x7f67d6df5160>, <__main__.IsolationTree object at 0x7f67d00fc760>, <__main__.IsolationTree object at 0x7f67cfed35e0>, <__main__.IsolationTree object at 0x7f67cfe59040>, <__main__.IsolationTree object

In [60]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part2.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees2 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part2.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [61]:
# Show the list of fitted trees from part 2, then how many there are
print(all_built_trees2, len(all_built_trees2), sep="\n")

[<__main__.IsolationTree object at 0x7f67d1af2ee0>, <__main__.IsolationTree object at 0x7f67ceeb20d0>, <__main__.IsolationTree object at 0x7f67d6aa0760>, <__main__.IsolationTree object at 0x7f67c33db670>, <__main__.IsolationTree object at 0x7f67cb010730>, <__main__.IsolationTree object at 0x7f67cc5ecc70>, <__main__.IsolationTree object at 0x7f67cc672730>, <__main__.IsolationTree object at 0x7f67cc67dcd0>, <__main__.IsolationTree object at 0x7f67ce5d5490>, <__main__.IsolationTree object at 0x7f67ce5c4310>, <__main__.IsolationTree object at 0x7f67ce364790>, <__main__.IsolationTree object at 0x7f67ce204af0>, <__main__.IsolationTree object at 0x7f67ce51daf0>, <__main__.IsolationTree object at 0x7f67ce43cf10>, <__main__.IsolationTree object at 0x7f67ce6368b0>, <__main__.IsolationTree object at 0x7f67c8d71970>, <__main__.IsolationTree object at 0x7f67ce23fd90>, <__main__.IsolationTree object at 0x7f67c3de3af0>, <__main__.IsolationTree object at 0x7f67c3384040>, <__main__.IsolationTree object

In [62]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part3.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees3 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part3.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [13]:
# for i in all_built_trees2:
#     all_built_trees1.append(i)
# print(len(all_built_trees1))
# #

In [63]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part4.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees4 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part4.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [64]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part5.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees5 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part5.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [71]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part6.csv")

# Separate features and labels.
# This CSV is overwritten in place both by this cell (Anomaly_Score) and by
# the pretrained-trees cell that adds a `scores_c5` column.  Exclude such
# previously written score columns so they are not used as features.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees6 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part6.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/10c/x_part7.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees7 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/10c/x_part7.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/10c/x_part8.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees8 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/10c/x_part8.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [14]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/10c/x_part9.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees9= ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/10c/x_part9.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [15]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your dataset
data = pd.read_csv(r"/home/saipranav/Desktop/fedl/10c/x_part10.csv")

# Separate features and labels.
# This cell writes its scores back into the same CSV it reads (see to_csv
# below).  Exclude any score columns left by an earlier run so they are not
# fed back in as features when the cell is executed again.
derived_cols = [col for col in data.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X = data.drop(columns=['Class','cent_labels'] + derived_cols).values  # Features

# Parameters
sample_size = len(X)
n_trees = 100
# threshold = 0.5

# Initialize the IsolationTreeEnsemble without prebuilt trees
ensemble = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

# Fit the ensemble to the data
ensemble.fit(X)

# Compute anomaly scores
scores = ensemble.anomaly_score(X)
# Keep the fitted trees so later cells can reuse them
all_built_trees10 = ensemble.all_trees

data['Anomaly_Score'] = scores

# Overwrites the input file, now including the Anomaly_Score column
data.to_csv(r"/home/saipranav/Desktop/fedl/10c/x_part10.csv", index=False)

# Predict anomalies
#predictions = ensemble.predict_from_anomaly_scores(scores, threshold)

# # Evaluation
# accuracy = accuracy_score(y, predictions)
# precision = precision_score(y, predictions)
# recall = recall_score(y, predictions)

# Print the results
# print(f"Standard Usage - Building Trees from Scratch:")
# print(f"Precision: {precision:.2f}")
# print(f"Recall: {recall:.2f}")
# print(f'Accuracy: {accuracy:.4f}')


In [69]:


# Load the dataset to be scored with another partition's pretrained trees
data_third = pd.read_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part6.csv")

# Separate features and labels.
# x_part6.csv is overwritten in place by this cell (scores_c5) and by the
# part-6 training cell (Anomaly_Score); exclude those earlier outputs so only
# the original feature columns are scored.
derived_cols = [col for col in data_third.columns
                if col == 'Anomaly_Score' or col.startswith('scores_')]
X_third = data_third.drop(columns=['Class','cent_labels'] + derived_cols).values
# y_third = data_third['Class'].values

# Initialize ensemble with the pretrained trees built on x_part5.csv.
# n_trees is replaced by len(trees) inside the constructor; pass the matching
# list here so the call is self-consistent.
# NOTE(review): `sample_size` is whatever the most recently executed training
# cell left in the kernel; anomaly_score normalises by c(sample_size), so it
# should be the sample size the part-5 trees were built with -- confirm.
ensemble_merged = IsolationTreeEnsemble(sample_size=sample_size, n_trees=len(all_built_trees5), trees=all_built_trees5)

# # Fit to the third dataset (using merged prebuilt trees)
# ensemble_merged.fit(X_third)

# Compute anomaly scores
scores_c5= ensemble_merged.anomaly_score(X_third)

data_third['scores_c5'] = scores_c5

# Overwrites the input file, now including the scores_c5 column
data_third.to_csv(r"/home/saipranav/Desktop/fedl/tt4/x_part6.csv", index=False)

# Predict anomalies
# predictions_third = ensemble_merged.predict_from_anomaly_scores(scores_third, threshold)

# # Evaluation
# accuracy_third = accuracy_score(y_third, predictions_third)
# precision_third = precision_score(y_third, predictions_third)
# recall_third = recall_score(y_third, predictions_third)

# # Print the results
# print(f"\nUsage with Merged Trees:")
# print(f"Precision: {precision_third:.2f}")
# print(f"Recall: {recall_third:.2f}")
# print(f'Accuracy: {accuracy_third:.4f}')

In [9]:


# Load the dataset to be scored with pretrained trees
data_third = pd.read_csv(r"C:\Users\PRANAV\OneDrive\Desktop\Federated Learning\x_part2.csv")

# Separate features and labels
X_third = data_third.drop(columns=['Class']).values
y_third = data_third['Class'].values

# Score cut-off for flagging an anomaly.  It was never assigned in the
# notebook (only present as the commented-out `# threshold = 0.5` in the
# training cells), so this cell raised NameError on a fresh kernel.
# NOTE(review): 0.5 is taken from those commented-out lines; confirm it is the
# value that produced the recorded precision/recall.
threshold = 0.5

# Initialize ensemble with the pretrained trees built on part 1.
# NOTE(review): `sample_size` comes from whichever training cell ran last.
ensemble_merged = IsolationTreeEnsemble(sample_size=sample_size, n_trees=len(all_built_trees1), trees=all_built_trees1)

# # Fit to the third dataset (using merged prebuilt trees)
# ensemble_merged.fit(X_third)

# Compute anomaly scores
scores_third = ensemble_merged.anomaly_score(X_third)

# Predict anomalies
predictions_third = ensemble_merged.predict_from_anomaly_scores(scores_third, threshold)

# Evaluation
accuracy_third = accuracy_score(y_third, predictions_third)
precision_third = precision_score(y_third, predictions_third)
recall_third = recall_score(y_third, predictions_third)

# Print the results
print(f"\nUsage with Merged Trees:")
print(f"Precision: {precision_third:.2f}")
print(f"Recall: {recall_third:.2f}")
print(f'Accuracy: {accuracy_third:.4f}')



Usage with Merged Trees:
Precision: 0.18
Recall: 0.48
Accuracy: 0.9957
