# Project 3a: TIMESERIES PATTERN TREE

## Dataset import

In [3]:
import pandas as pd 
import random

## Load the training split of the UCR "MedicalImages" dataset.
## NOTE(review): this is an absolute, machine-specific path; point it at your local copy of the archive before running.
dataset_path = "/Users/sebastianodarconso/Desktop/università/magistrale_progetti/bdss/UCRArchive_2018/MedicalImages/MedicalImages_TRAIN.tsv"
## The file is tab-separated and has no header row, so the columns are numbered 0..N.
## The cells below read column 0 as the class label and the remaining columns as the series values.
dataset = pd.read_csv(dataset_path, sep="\t", header=None)
## Preview the first three rows.
dataset.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,10,-0.680044,-0.15254,0.463497,1.125701,1.784473,2.387815,2.88659,3.239594,3.417875,...,-0.669457,-0.669649,-0.667227,-0.663867,-0.659006,-0.653538,-0.648224,-0.643643,-0.640175,-0.637993
1,4,2.232027,2.863343,3.229689,3.284083,3.025244,2.488196,1.742625,0.890124,0.062662,...,-0.697279,-0.702275,-0.707983,-0.714786,-0.722466,-0.730555,-0.737959,-0.743156,-0.744424,-0.7412
2,10,2.746582,2.727073,2.688213,2.630574,2.554928,2.46224,2.353654,2.230476,2.094162,...,-0.474689,-0.403397,-0.331573,-0.260273,-0.190542,-0.123397,-0.059818,-0.000729,0.053012,0.100628


## Node class definition

In [4]:
## class Node 
class Node:
    """One node of the time-series pattern tree.

    Every attribute starts at a neutral default and is filled in by the
    tree builder.
    """

    def __init__(self):
        # Data that reached this node and summary information about it.
        self.ts = None
        self.dataset_length = 0
        self.class_label = None

        # Split stored on the node and the gain it achieved.
        self.rule = None
        self.association_rules = None
        self.information_gain = 0.0

        # Children: rows that satisfy the rule / rows that do not.
        self.true_branch = None
        self.false_branch = None

## Support function

In [5]:
def calculate_support(dataset, pattern):
    """Return the fraction of rows in ``dataset`` whose series satisfy ``pattern``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One time series per row; column 0 is skipped, the remaining columns
        are passed to ``pattern_matched`` as the transaction.
    pattern : list of tuple
        Sequence of ``(value, is_greater, index, label)`` conditions.

    Returns
    -------
    float
        Matching rows divided by the total number of rows. An empty dataset
        has support ``0.0`` (previously this raised ``ZeroDivisionError``).
    """
    total_rows = len(dataset)
    if total_rows == 0:
        # No transactions at all: nothing can support the pattern.
        return 0.0

    transactions = dataset.iloc[:, 1:].values.tolist()
    matched = sum(1 for transaction in transactions if pattern_matched(pattern, transaction))
    return matched / total_rows


def pattern_matched(sequence, transaction):
    """Tell whether ``transaction`` satisfies every condition in ``sequence``.

    Each condition is ``(value, is_greater, index, _)``: the observation at
    ``index`` must be strictly greater than ``value`` when ``is_greater`` is
    truthy, strictly smaller otherwise. A condition whose index lies beyond
    the end of the transaction makes the match fail.
    """
    n_points = len(transaction)
    for threshold, wants_above, position, _label in sequence:
        if position >= n_points:
            return False
        observed = transaction[position]
        satisfied = observed > threshold if wants_above else observed < threshold
        if not satisfied:
            return False
    return True

## Association rules extraction

In [6]:
# def extract_association_rules(dataset, pts, confidence_threshold):
#     highest_confidence_rule = None
#     max_confidence = -1
#     best_split_index = -1
    
#     sequence_support = calculate_support(dataset, pts)

#     for i in range(1, len(pts)):
#         antecedent = pts[:i]
#         antecedent_support = calculate_support(dataset, antecedent)
#         confidence = sequence_support / antecedent_support if antecedent_support > 0 else 0
        
#         if confidence >= confidence_threshold and confidence > max_confidence:
#             max_confidence = confidence
#             highest_confidence_rule = (pts, confidence, i)
#             best_split_index = i
            
#     return highest_confidence_rule if highest_confidence_rule else None

## New Association rules extraction
The previous function returns the highest-confidence association rule whose confidence is greater than or equal to the threshold. This usually yields rules with a single-item consequent, which makes them less "informative".
The new function instead walks the split points from left to right and returns the last split *before* the first one whose confidence reaches the threshold (or nothing if the very first split already reaches it).

In [7]:
def extract_association_rules(dataset, pts, confidence_threshold):
    """Pick the antecedent/consequent split of ``pts`` just below the confidence threshold.

    Split points ``1 .. len(pts) - 1`` are scanned in order. For each one the
    confidence is ``support(pts) / support(pts[:split])`` (``0`` when the
    antecedent has no support). Scanning stops at the first split whose
    confidence reaches ``confidence_threshold``.

    Returns
    -------
    tuple or None
        ``(pts, confidence, split)`` for the last split scanned before the
        stop, or ``None`` when no split was accepted.
    """
    full_support = calculate_support(dataset, pts)
    chosen = None
    split = 1
    while split < len(pts):
        prefix_support = calculate_support(dataset, pts[:split])
        if prefix_support > 0:
            confidence = full_support / prefix_support
        else:
            confidence = 0

        if confidence >= confidence_threshold:
            break

        chosen = (pts, confidence, split)
        split += 1

    return chosen

## Candidates generation

In [8]:
## time series sample function
def random_ts_sample(dataset, indexes):
    """Draw one random row of ``dataset`` whose index label is not in ``indexes``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to sample from.
    indexes : iterable
        Index labels that must not be returned.

    Returns
    -------
    tuple
        ``(sampled, label + 1)``: ``sampled`` is a one-row DataFrame and
        ``label`` is its index label.

    Raises
    ------
    ValueError
        If every row is excluded. The previous rejection loop never
        terminated in that case.
    """
    # Filter first, then sample: uniform over the allowed rows and always terminates.
    eligible = dataset.loc[~dataset.index.isin(list(indexes))]
    if eligible.empty:
        raise ValueError("random_ts_sample: every row of the dataset is excluded by `indexes`")

    sampled = eligible.sample()
    return sampled, sampled.index[0] + 1

## generate candidates function 
def generate_candidates(dataset, min_support):
    """Grow a random pattern until its support in ``dataset`` drops below ``min_support``.

    Each step samples a series, an unused time index and a second series, and
    adds the condition ``(value, is_greater, index, class_label)``, where
    ``value`` is the first series' observation at ``index`` and ``is_greater``
    says whether the second series is ``>=`` that value. At least three
    conditions are generated before the support is checked.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Column 0 holds the class label, the other columns the series values.
    min_support : float
        Minimum fraction of rows that must satisfy the pattern.

    Returns
    -------
    list of tuple
        Conditions sorted by time index. The most recently generated
        condition is dropped when it is the one that pushed the support
        below ``min_support``. Nothing is dropped when generation stopped
        because every time index was already used.
    """
    ## time indexes already used by a condition
    used_indexes = list()

    ## conditions in the order they were generated
    candidates = list()

    ## the same conditions sorted by time index
    pts = list()

    ## number of time points per series (column 0 is the class label)
    n_time_points = len(dataset.columns) - 1

    ## True when the loop stops because no unused time index is left
    indexes_exhausted = False

    while len(pts) <= 2 or calculate_support(dataset, pts) >= min_support:

        if len(pts) >= n_time_points:
            indexes_exhausted = True
            break

        ## sample random time series
        ## NOTE(review): the list of used *time* indexes is passed as the rows to exclude,
        ## exactly as before; confirm that mixing the two index spaces is intended.
        random_ts, ts_index = random_ts_sample(dataset, used_indexes)
        random_ts_class = random_ts[0].values[0]
        ts_as_list = random_ts.values.tolist()[0][1:]

        ## sample a time index that has not been used yet and record it
        random_index = random.randint(0, len(ts_as_list) - 1)
        while random_index in used_indexes:
            random_index = random.randint(0, len(ts_as_list) - 1)
        used_indexes.append(random_index)

        ## sample a second time series, different from the first one
        ## NOTE(review): this retry loop cannot end if only one row is eligible.
        random_ts_prime, ts_prime_index = random_ts_sample(dataset, used_indexes)
        while ts_prime_index == ts_index:
            random_ts_prime, ts_prime_index = random_ts_sample(dataset, used_indexes)

        random_ts_prime_as_list = random_ts_prime.values.tolist()[0][1:]

        candidate = (ts_as_list[random_index],
                     random_ts_prime_as_list[random_index] >= ts_as_list[random_index],
                     random_index,
                     random_ts_class)

        candidates.append(candidate)
        pts = sorted(candidates, key=lambda x: x[2])

    if indexes_exhausted:
        ## the loop condition held for the current pattern, so every condition is kept
        return pts

    ## Drop the most recently generated condition: it is the one that made the
    ## pattern fail min_support. `pts` is sorted by time index, so its last
    ## element is generally a different condition.
    return sorted(candidates[:-1], key=lambda x: x[2])

## Information gain functions

In [9]:
import math
from collections import Counter

def calculate_entropy(labels):
    """Return the Shannon entropy (base 2) of the label distribution in ``labels``.

    An empty collection has entropy ``0.0``.
    """
    n_labels = len(labels)
    result = 0.0

    for occurrences in Counter(labels).values():
        share = occurrences / n_labels
        if share <= 0:
            continue
        result -= share * math.log2(share)

    return result

def calculate_information_gain(parent_labels, true_labels, false_labels):
    """Return the entropy reduction obtained by splitting the parent into two branches.

    The gain is the parent entropy minus the size-weighted sum of the two
    branch entropies; each weight is the branch size divided by the parent
    size.
    """
    n_parent = len(parent_labels)
    weighted = [
        len(branch) / n_parent * calculate_entropy(branch)
        for branch in (true_labels, false_labels)
    ]
    return calculate_entropy(parent_labels) - (weighted[0] + weighted[1])

## Dataset split functions 
Branches that satisfy the rules and branches that don't.

In [10]:
def matches_rule(ts, rule):
    """Tell whether the series ``ts`` satisfies every condition in ``rule``.

    Each condition is ``(value, condition, index, _)``. When ``condition`` is
    truthy the observation at ``index`` must not be below ``value``;
    otherwise it must not be above ``value``. A condition whose index lies
    beyond the end of ``ts`` makes the match fail.

    The "must not be above" branch used to return ``True`` on a violation,
    which accepted series that break the rule; it now returns ``False``.

    NOTE(review): an observation equal to ``value`` passes here, whereas
    ``pattern_matched`` (used for the support computation) uses strict
    comparisons. Confirm which convention is intended.
    """
    for value, condition, index, _ in rule:
        if index >= len(ts):
            return False
        if condition:
            if ts[index] < value:
                return False
        elif ts[index] > value:
            return False

    return True

def dataset_split(dataset, rules):
    """Partition the rows of ``dataset`` by whether their series satisfy ``rules``.

    Column 0 of each row is skipped when matching; the full row (label
    included) is stored in the output.

    Returns
    -------
    tuple of list
        ``(false_subset, true_subset)``. Note the order: the rows that do
        NOT match come first.
    """
    rows = dataset.iloc[:, :].values.tolist()
    false_subset, true_subset = [], []

    for row in rows:
        target = true_subset if matches_rule(row[1:], rules) else false_subset
        target.append(row)

    return false_subset, true_subset

## Tree generation

In [11]:
# def rpts_tree(Ts, labels, gain=-1, max_height=float('inf'), MIN_SUPPORT=None, min_samples=1, depth=0):

#     best_info_gain = gain
#     min_gain = 0.01
#     CONFIDENCE_THRESHOLD = 0.7

#     if depth >= max_height or len(Ts) < min_samples:
#         return None
    
#     ## create a new node
#     v = Node()
#     v.ts = Ts

#     ## get the most common label
#     v.class_label = Counter(labels).most_common(1)[0][0]
#     if not any(label == v.class_label for label in labels):
#         return v   

#     ## generate candidates
#     pts = generate_candidates(dataset, MIN_SUPPORT)

#     association_rules = extract_association_rules(dataset, pts, CONFIDENCE_THRESHOLD)

#     if association_rules is not None:
#         pattern, confidence, split_index = association_rules
#         ## add association rule to node
#         v.association_rules = "Antecedent:" + str(pattern[:split_index]) + "  ==> Consequent:" + str(pattern[split_index:]) + " Confidence:" + str(round(confidence, 2)) + " Split Index:" + str(split_index)
#         final_candidates = pattern[:split_index] + pattern[split_index:]
#     else:
#         final_candidates = pts

#     if not final_candidates:
#         return v  
    
#     v.rule = final_candidates
#     true_subset, false_subset = dataset_split(Ts, final_candidates)

#     if not true_subset or not false_subset:
#         return v 

#     labels_true = [ts[0] for ts in true_subset]
#     labels_false = [ts[0] for ts in false_subset]
#     ig = calculate_information_gain(labels, labels_true, labels_false)

#     if ig < min_gain:
#         return v
    
#     if ig > best_info_gain:
#         best_info_gain = ig
#     else:
#         return v
    
#     v.information_gain = calculate_information_gain(labels, labels_true, labels_false)

#     print(f"Depth: {depth}, Rule: {v.rule if v.association_rules is None else v.association_rules}, Information Gain: {v.information_gain:.4f}, Class Label: {v.class_label}")
#     v.dataset_length = len(Ts)
#     v.true_branch = rpts_tree(pd.DataFrame(true_subset), labels_true, best_info_gain, max_height, MIN_SUPPORT, min_samples, depth + 1)
#     v.false_branch = rpts_tree(pd.DataFrame(false_subset), labels_false, best_info_gain, max_height, MIN_SUPPORT, min_samples, depth + 1)

#     return v

# # labels = dataset[0].values.tolist()
# # rpts_tree(dataset, labels)

## New tree generation
The previous `rpts_tree` function generates only one pattern per node, which can produce trees with poor information gain. This version generates **n** candidate patterns per node, where **n** is supplied by the user, and keeps only the candidate with the highest information gain.

In [12]:
from tqdm import tqdm
def rpts_tree(Ts, labels, n_candidates, max_height=float('inf'), MIN_SUPPORT=None, min_samples=1, depth=0):
    """Recursively build a random-pattern decision tree over the series in ``Ts``.

    At every node ``n_candidates`` random patterns are generated, ``Ts`` is
    split by each of them, and the pattern with the highest information gain
    becomes the node's rule. Recursion stops when the depth limit is reached,
    the node is too small or pure, no candidate yields two non-empty
    branches, or the best gain is below ``0.01``.

    Parameters
    ----------
    Ts : pandas.DataFrame
        Rows that reached this node; column 0 is the class label.
    labels : sequence
        Class label of each row of ``Ts``.
    n_candidates : int
        Number of random patterns to try per node.
    max_height : int or float
        Maximum depth; a call with ``depth >= max_height`` returns ``None``.
    MIN_SUPPORT : float
        Minimum support passed to ``generate_candidates``. The default
        ``None`` is not usable there, so pass a number.
    min_samples : int
        A call with fewer rows than this returns ``None``.
    depth : int
        Depth of the current node (0 for the root).

    Returns
    -------
    Node or None
        The subtree rooted at this node.
    """
    min_gain = 0.01
    CONFIDENCE_THRESHOLD = 0.7

    if depth >= max_height or len(Ts) < min_samples:
        return None

    ## create a new node
    v = Node()
    v.ts = Ts
    v.dataset_length = len(Ts)

    ## get the most common label
    v.class_label = Counter(labels).most_common(1)[0][0]

    ## A pure node cannot gain anything from a split, so stop here.
    ## (The previous `not any(...)` test could never be true.)
    if all(label == v.class_label for label in labels):
        return v

    ## generate candidates
    ## NOTE(review): candidates and association rules are computed on the global
    ## `dataset`, not on this node's `Ts`; confirm that this is intended.
    all_candidates = [generate_candidates(dataset, MIN_SUPPORT) for _ in range(n_candidates)]

    best_candidate = None
    best_info_gain = -1
    best_true_subset = None
    best_false_subset = None
    best_labels_true = None
    best_labels_false = None

    for pts in all_candidates:

        ## dataset_split returns the NON-matching rows first
        false_subset, true_subset = dataset_split(Ts, pts)
        if not true_subset or not false_subset:
            continue

        labels_true = [ts[0] for ts in true_subset]
        labels_false = [ts[0] for ts in false_subset]

        ig = calculate_information_gain(labels, labels_true, labels_false)

        if ig > best_info_gain:
            best_info_gain = ig
            best_candidate = pts
            best_true_subset = true_subset
            best_false_subset = false_subset
            ## keep the labels with the subsets they belong to; the loop
            ## variables are overwritten by later candidates
            best_labels_true = labels_true
            best_labels_false = labels_false

    if best_candidate is None or best_info_gain < min_gain:
        return v

    ## the rule is always the best candidate (never the last one iterated)
    v.rule = best_candidate
    v.information_gain = best_info_gain

    association_rules = extract_association_rules(dataset, best_candidate, CONFIDENCE_THRESHOLD)

    if association_rules is not None:
        pattern, confidence, split_index = association_rules
        ## add association rule to node
        v.association_rules = "Antecedent:" + str(pattern[:split_index]) + "  ==> Consequent:" + str(pattern[split_index:]) + " Confidence:" + str(round(confidence, 2)) + " Split Index:" + str(split_index)

    print(f"Depth: {depth}, Rule: {v.rule if v.association_rules is None else  v.association_rules}, Information Gain: {v.information_gain:.4f}, Class Label: {v.class_label}")
    v.true_branch = rpts_tree(pd.DataFrame(best_true_subset), best_labels_true, n_candidates, max_height, MIN_SUPPORT, min_samples, depth + 1)
    v.false_branch = rpts_tree(pd.DataFrame(best_false_subset), best_labels_false, n_candidates, max_height, MIN_SUPPORT, min_samples, depth + 1)

    return v

# labels = dataset[0].values.tolist()
# rpts_tree(dataset, labels)

## Tree printing

In [13]:
def print_tree(node, depth=0):
    """Print a textual summary of the subtree rooted at ``node``.

    Nodes whose ``information_gain`` equals ``0.0`` are skipped, both as the
    starting node and as children. For every printed node the depth, class
    label, dataset length, rule length (if a rule is set) and information
    gain are shown, indented by two spaces per depth level.
    """
    if node is None:
        return

    # Zero-gain nodes are not shown.
    if getattr(node, 'information_gain', None) == 0.0:
        return

    pad = "  " * depth
    print("\nDepth: ", depth)
    print(f"{pad}Node class: {node.class_label}")
    print(f"{pad}Dataset Length: {node.dataset_length}")

    rule = getattr(node, 'rule', None)
    if rule:
        print(f"{pad}Rule Length: {len(rule)}")

    if hasattr(node, 'information_gain'):
        gain = node.information_gain
        gain_text = "None" if gain is None else f"{gain:.4f}"
        print(f"{pad}Information Gain: {gain_text}")

    # Announce and descend into each existing child that has a non-zero gain.
    for heading, attribute in (("True Branch:", "true_branch"), ("False Branch:", "false_branch")):
        child = getattr(node, attribute)
        if child and getattr(child, 'information_gain', None) != 0.0:
            print(f"{pad}{heading}")
            print_tree(child, depth + 1)


## Tree information

In [14]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

def print_ts(df, l, depth, branch):
    """Plot selected rows of ``df`` as line traces coloured by class and save the figure as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 is read as the class, the remaining columns as the series.
    l : iterable of int
        Positional row indexes of ``df`` to plot.
    depth, branch
        Only used to build the output file name
        ``timeseries_output/timeseries_node{depth}_{branch}.html``.
    """
    # Distinct classes found in column 0; a class's position here selects its colour.
    classes = df.iloc[:, 0].unique()
    class_order = list(classes)

    # One colour per class; repeat the palette when there are more classes than colours.
    palette = px.colors.qualitative.Plotly
    if len(classes) > len(palette):
        colors = palette * (len(classes) // len(palette) + 1)
    else:
        colors = palette[:len(classes)]

    fig = go.Figure()
    classes_in_legend = []  # classes that already own a legend entry

    for row_position in l:
        classe = df.iloc[row_position, 0]
        values = df.iloc[row_position, 1:]

        # Only the first trace of each class is listed in the legend.
        first_of_class = classe not in classes_in_legend
        trace_color = colors[class_order.index(classe)]

        fig.add_trace(go.Scatter(y=values, mode='lines',
                                 line=dict(color=trace_color),
                                 name=f'Class {classe}',
                                 legendgroup=f'class_{classe}',
                                 showlegend=first_of_class))

        if first_of_class:
            classes_in_legend.append(classe)

    fig.update_layout(title='Visualizzazione Time-Series',
                      xaxis_title='Tempo',
                      yaxis_title='Ampiezza',
                      legend_title='Class')

    # Persist the figure as a standalone HTML page.
    fig.write_html(f"timeseries_output/timeseries_node{depth}_{branch}.html")

## Tree generation and visualization
This might take some time due to the generation of html files (by default this option is disabled).

In [17]:
from graphviz import Digraph
def visualize_tree(node, ts_list=None, graph=None, depth=0, parent_name=None, branch_label=None):
    """Add the subtree rooted at ``node`` to a graph and collect the data of the drawn nodes.

    Only nodes whose ``information_gain`` is neither ``None`` nor zero are
    drawn; the children of a skipped node are not visited.

    Parameters
    ----------
    node : Node or None
        Root of the subtree to draw.
    ts_list : list, optional
        Accumulator for ``(node.ts, branch_label)`` pairs. A fresh list is
        created when omitted. The previous mutable default ``[]`` was shared
        between calls, so repeated calls kept growing the same list.
    graph : graphviz.Digraph, optional
        Graph to draw into. When omitted a new one is created with a
        ``'root'`` node that the first drawn node is attached to.
    depth : int
        Depth of ``node``; only used to build the node name.
    parent_name : str, optional
        Name of the parent graph node, if any.
    branch_label : str, optional
        Edge label (``'True'`` / ``'False'``) from the parent.

    Returns
    -------
    tuple
        ``(graph, ts_list)``.
    """
    if ts_list is None:
        ts_list = []

    if node is None:
        # Nothing to draw, e.g. when the tree builder returned no root.
        return (Digraph() if graph is None else graph), ts_list

    if graph is None:
        graph = Digraph()
        root_label = f'Class: {node.class_label}\nIG: {node.information_gain:.4f}' if node.information_gain is not None else f'Class: {node.class_label}\nIG: None\nDataset Length: {node.dataset_length}'
        graph.node(name='root', label=root_label)
        parent_name = 'root'

    # id(node) keeps names unique among nodes at the same depth.
    current_name = f'node_{depth}_{id(node)}'
    ig_label = f'{node.information_gain:.4f}' if node.information_gain is not None else 'None'
    label = f'Class: {node.class_label}\nIG: {ig_label}\nDataset Length: {node.dataset_length}'

    # Only add node if information gain is not zero
    if node.information_gain is not None and node.information_gain != 0:
        graph.node(name=current_name, label=label)
        if node.ts is not None:
            ts_list.append((node.ts, branch_label))
        if parent_name:
            edge_label = branch_label if branch_label else ''
            graph.edge(parent_name, current_name, label=edge_label)
        if node.true_branch:
            visualize_tree(node.true_branch, ts_list, graph, depth + 1, current_name, branch_label='True')
        if node.false_branch:
            visualize_tree(node.false_branch, ts_list, graph, depth + 1, current_name, branch_label='False')

    return graph, ts_list

def tree_depth(node):
    """Return the number of nodes on the longest root-to-leaf path (0 for an empty tree)."""
    if node is None:
        return 0
    deepest_child = max(tree_depth(child) for child in (node.true_branch, node.false_branch))
    return deepest_child + 1

def tree_size(node):
    """Return the total number of nodes in the tree rooted at ``node`` (0 for an empty tree)."""
    pending = [node]
    total = 0
    while pending:
        current = pending.pop()
        if current is None:
            continue
        total += 1
        pending.extend((current.true_branch, current.false_branch))
    return total

# Column 0 of the dataset holds the class label of every series.
labels = dataset[0]

# Minimum support used when generating candidate patterns.
MIN_SUPPORT = 0.05
# NOTE(review): interactive input blocks "Run All" and makes the run non-reproducible.
n_candidates = int(input("Enter the number of candidates: "))
# Build the tree on the full training set, print it and render it to a PNG.
tree = rpts_tree(dataset, labels, n_candidates, MIN_SUPPORT=MIN_SUPPORT, min_samples=1)
print_tree(tree)
tree_visualization, ts_list = visualize_tree(tree)
tree_visualization.render('timeseries_output/decision_tree', format='png', cleanup=True)

# Summary statistics; these count every node, including the ones the printer and the renderer skip.
depth = tree_depth(tree)
size = tree_size(tree)
print()
print(f"Tree Depth: {depth}")
print(f"Tree Size: {size}")


## UNCOMMENT TO SAVE TIME SERIES HTML FILES IT MIGHT TAKE A WHILE
# for i, (ts, branch) in enumerate(ts_list):
#     depth = i
#     indexes = [j for j in range(len(ts))]
    
#     for t in ts:
#         print_ts(ts, indexes, depth, branch)


Depth: 0, Rule: Antecedent:[(0.15829449, True, 8, 5), (0.44479883, False, 11, 10), (0.40805209, False, 24, 2), (0.57644027, False, 37, 2), (0.51936704, False, 39, 3)]  ==> Consequent:[(-0.40281828, False, 66, 10), (-0.34465538, False, 67, 10)] Confidence:0.24 Split Index:5, Information Gain: 0.2095, Class Label: 10
Depth: 1, Rule: Antecedent:[(-0.55186974, True, 0, 10), (-0.331507, False, 35, 10), (0.17604474, False, 48, 10)]  ==> Consequent:[(-0.55623409, False, 62, 10)] Confidence:0.16 Split Index:3, Information Gain: 1.6290, Class Label: 10.0
Depth: 2, Rule: Antecedent:[(0.24385655, True, 1, 9), (1.0369738, False, 12, 10), (1.7587607, False, 16, 10), (-0.9657366, True, 17, 10), (-0.59259904, True, 28, 9), (0.024065439, False, 31, 10), (-0.57488495, True, 55, 10), (-0.1209455, False, 66, 5), (-0.4737542, False, 79, 10)]  ==> Consequent:[(-0.58373997, False, 92, 2)] Confidence:0.68 Split Index:9, Information Gain: 0.4445, Class Label: 10.0
Depth: 3, Rule: Antecedent:[(1.3203494, True,

## Project 3b: SEQUENCE BOOSTING

In [None]:
import numpy as np
import random
from math import log, exp
from tqdm import tqdm
from collections import Counter


# Z_dataset: list of (label, sequence) pairs; labels: distinct labels seen; alphabet: distinct symbols seen.
Z_dataset = []
labels = []
alphabet = set()

# NOTE(review): absolute, machine-specific path; point it at your local copy before running.
with open("/Users/sebastianodarconso/Desktop/università/magistrale_progetti/bdss/exercise3/dataset_boosting/question.txt") as f:
    test_data = f.readlines()
    for line in test_data:
        # Each line is "<label> <int> <int> ..."; split the label from the sequence.
        (label, data) = line.split(" ", 1)
        data_array = [int(x) for x in data.split()]
        # Only the first character of the label is inspected: 1 maps to +1, anything else to -1.
        Z_dataset.append(((1 if int(label[0]) == 1 else -1), data_array))

# Collect the distinct labels (in order of first appearance) and the symbol alphabet.
for label, data in Z_dataset:
    if label not in labels:
        labels.append(label)
    for value in data:
        alphabet.add(value)

alphabet = sorted(list(alphabet))
# The boosting code below assumes exactly two classes.
if len(labels) != 2:
    raise ValueError("Only binary classification is supported")

count = Counter([label for label, _ in Z_dataset])
print("Count of labels: ", count)   

# NOTE(review): the next print dumps the entire dataset into the cell output.
print("Z Dataset: ", Z_dataset)
print("Len Z Dataset: ", len(Z_dataset))
print("Labels: ", labels)
print("Alphabet: ", alphabet)

# Shuffle a copy so the original order is preserved; no random seed is set, so the order differs per run.
shuffled_Z = Z_dataset.copy()
random.shuffle(shuffled_Z)

# NOTE(review): this also prints the whole dataset.
print("Shuffled Z Dataset: ", shuffled_Z)

In [None]:
def contains_subsequence_(sequence, subsequence):
    """Tell whether ``subsequence`` occurs in ``sequence`` as a contiguous run.

    Every window of ``len(subsequence)`` consecutive items of ``sequence``
    is compared with ``subsequence`` using ``==``.
    """
    width = len(subsequence)
    last_start = len(sequence) - width
    return any(
        sequence[start:start + width] == subsequence
        for start in range(last_start + 1)
    )

def compute_error(dataset, labels, weights, subsequence, potential_label):
    """Return the weighted error of the rule "contains ``subsequence``, so ``potential_label``".

    A sample counts as an error when it contains the subsequence but its
    label differs from ``potential_label``, or when it lacks the subsequence
    but its label equals ``potential_label``. The weights of those samples
    are summed.
    """
    return sum(
        weights[position]
        for position, (entry, true_label) in enumerate(zip(dataset, labels))
        if contains_subsequence_(entry[1], subsequence) != (true_label == potential_label)
    )

def find_best_sequence(Z, weights):
    """Search every contiguous subsequence in ``Z`` for the lowest weighted-error rule.

    Parameters
    ----------
    Z : list of (label, sequence)
        Training samples.
    weights : sequence of float
        Current weight of each sample.

    Returns
    -------
    tuple
        ``(best_sequence, best_label, best_error)``. The first rule reaching
        the minimum error wins. ``best_sequence`` is ``None`` when no
        subsequence exists.

    Notes
    -----
    A subsequence that was already scored is skipped. Its error cannot
    change and later ties never replace the current best, so the result is
    the same as the exhaustive scan with far fewer ``compute_error`` calls.
    """
    best_sequence = None
    best_sequence_label = None
    best_error = float('inf')
    labels = [label for label, _ in Z]

    # Built once: the candidate labels do not change during the search.
    candidate_labels = set(labels)
    # Subsequences (as tuples) that were already scored.
    evaluated = set()

    for i in tqdm(range(len(Z))):
        sequence = Z[i][1]

        for j in range(len(sequence)):
            for k in range(j + 1, len(sequence) + 1):
                subsequence = sequence[j:k]

                key = tuple(subsequence)
                if key in evaluated:
                    continue
                evaluated.add(key)

                for potential_label in candidate_labels:
                    error = compute_error(Z, labels, weights, subsequence, potential_label)
                    if error < best_error:
                        best_error = error
                        best_sequence = subsequence
                        best_sequence_label = potential_label

    return best_sequence, best_sequence_label, best_error

def update_weights(dataset, labels, weights, best_sequence, best_label, alpha):
    """Re-weight the samples after adding the rule ``(best_sequence, best_label)``.

    Misclassified samples are multiplied by ``exp(alpha)`` and correctly
    classified ones by ``exp(-alpha)``, then the weights are normalised to
    sum to one. The previous version applied the two factors the other way
    round, so the next round focused on the samples that were already
    handled correctly.

    Parameters
    ----------
    dataset : list of (label, sequence)
        Training samples.
    labels : sequence
        Label of each sample.
    weights : array-like of float
        Current weights; not modified.
    best_sequence : list
        Subsequence of the rule just selected.
    best_label
        Label the rule predicts when the subsequence is present.
    alpha : float
        Weight of the rule in the ensemble.

    Returns
    -------
    numpy.ndarray
        New, normalised weights.
    """
    # Float copy, so integer inputs can be scaled in place.
    updated_weights = np.array(weights, dtype=float)

    for i, (seq, l) in enumerate(zip(dataset, labels)):
        seq = seq[1]
        contains_subsequence = contains_subsequence_(seq, best_sequence)

        # The rule predicts best_label when the subsequence is present, the other label otherwise.
        correctly_classified = contains_subsequence == (l == best_label)
        if correctly_classified:
            updated_weights[i] *= np.exp(-alpha)
        else:
            updated_weights[i] *= np.exp(alpha)

    return updated_weights / np.sum(updated_weights)


def classify(sequence, rules, alphas):
    """Return the sign of the alpha-weighted vote of all rules for ``sequence``.

    A rule ``(pattern, label)`` votes ``+alpha`` when the pattern is present
    and its label is ``1``, or when the pattern is absent and its label is
    not ``1``; otherwise it votes ``-alpha``.
    """
    score = 0
    for alpha, (pattern, rule_label) in zip(alphas, rules):
        present = contains_subsequence_(sequence, pattern)
        votes_positive = present if rule_label == 1 else not present
        score += alpha if votes_positive else -alpha

    return np.sign(score)


def compute_classifier_error(dataset, classifier, rules, alphas):
    """Return the fraction of samples in ``dataset`` that ``classifier`` gets wrong.

    Parameters
    ----------
    dataset : list of (label, sequence)
        Samples to evaluate.
    classifier : callable
        Called as ``classifier(sequence, rules, alphas)``. The previous
        version called it a second time with the sequence only, which fails
        for any three-argument classifier such as ``classify``.
    rules, alphas
        Forwarded unchanged to ``classifier``.

    Returns
    -------
    float
        Misclassification rate; ``0.0`` for an empty dataset.
    """
    if len(dataset) == 0:
        return 0.0

    error_count = 0
    for label, sequence in dataset:
        # Predict once and reuse the value for both the check and the report.
        pred = classifier(sequence, rules, alphas)
        if pred != label:
            print(f"Classifier prediction: {pred}, true label: {label}")
            error_count += 1
    return error_count / len(dataset)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

def sequence_boosting(dataset, delta):
    """Train a boosted ensemble of subsequence rules on ``dataset``.

    Every round selects the lowest weighted-error rule, derives its weight
    ``alpha``, re-weights the samples and measures the training error of the
    ensemble built so far. Training stops when no rule beats an error of
    0.5, when the ensemble error drops below ``delta`` or reaches zero, or
    once the round counter exceeds 100.

    Parameters
    ----------
    dataset : list of (label, sequence)
        Training samples.
    delta : float
        Target training error of the ensemble.

    Returns
    -------
    tuple
        ``(final_classifier, rules, alphas, classifier_errors, iteration_info, cm, accuracy)``.
        ``final_classifier`` takes one sequence. ``iteration_info`` holds
        one dict per round. ``cm`` and ``accuracy`` are computed on the
        training data.

    Notes
    -----
    ``iteration_info[i]["predictions"]`` now holds the prediction for every
    sample; it used to hold only the misclassified ones. The error is stored
    under ``"classifier_error"``, and the old misspelled key
    ``"classigier_error"`` is kept as an alias.
    """
    N = len(dataset)
    weights = np.ones(N) / N
    alphas = []
    rules = []
    labels = [label for label, _ in dataset]
    classifier_errors = []
    iteration_info = []

    t = 1

    while True:

        print(f"Iteration: {t}")
        best_sequence, best_label, error = find_best_sequence(dataset, weights)

        # No usable rule: nothing better than random guessing is left.
        if best_sequence is None or error >= 0.5:
            break

        # The small constant avoids a division by zero when the rule is perfect.
        alpha_t = 0.5 * log((1 - error) / (error + 1e-10))

        print(f"Best sequence: {best_sequence}, best label: {best_label}, error: {error:.4f}, alpha: {alpha_t:.4f}")
        alphas.append(alpha_t)
        rules.append((best_sequence, best_label))

        weights = update_weights(dataset, labels, weights, best_sequence, best_label, alpha_t)

        # Classify every sample once with the ensemble built so far.
        predictions = [classify(sequence, rules, alphas) for _, sequence in dataset]
        error_count = sum(1 for pred, label in zip(predictions, labels) if pred != label)

        overall_classifier_error = error_count / len(dataset)
        classifier_errors.append(overall_classifier_error)
        print(f"Classifier error: {overall_classifier_error}")

        iteration_info.append({
            "iteration": t,
            "alpha": alpha_t,
            "error": error,
            "classifier_error": overall_classifier_error,
            # legacy (misspelled) key kept so existing consumers keep working
            "classigier_error": overall_classifier_error,
            "weights": weights.copy(),
            "best_sequence": best_sequence,
            "best_label": best_label,
            "predictions": predictions
        })

        if overall_classifier_error < delta:
            print("Classifier error below threshold.")
            break
        if overall_classifier_error == 0:
            print("Classifier error is zero.")
            break

        if t > 100:
            print("Reached maximum number of iterations.")
            break

        t += 1

    final_classifier = lambda seq: classify(seq, rules, alphas)

    # Training-set evaluation of the final ensemble.
    final_predictions = [classify(seq, rules, alphas) for _, seq in dataset]
    y_true = labels
    y_pred = final_predictions
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    return final_classifier, rules, alphas, classifier_errors, iteration_info, cm, accuracy

In [None]:
# Class balance of the 100-sample training subset used below.
count_shuffled_reduced = Counter([label for label, _ in shuffled_Z[:100]])
print("Count of labels: ", count_shuffled_reduced)

In [None]:
# Train on the first 100 shuffled sequences; boosting stops once the training error drops below delta.
classifier, rules, alphas, errors, iteration_info, cm, accuracy = sequence_boosting(shuffled_Z[:100], delta=0.1)

In [None]:
# Count how many of the 100 training sequences the final classifier labels correctly.
# NOTE(review): `count` re-uses the name of the label Counter created when the dataset was loaded.
count = 0
for label, sequence in shuffled_Z[:100]:
    prediction = classifier(sequence)
    if prediction == label:
        count += 1
    # print(f"Predicted label = {prediction}, true label = {label}")


# These are the samples the model was trained on, so this is a training accuracy.
print(f"Number of correct predictions: {count} out of {len(shuffled_Z[:100])}")
print(f"Percentage of correct: {(count * 100) / len(shuffled_Z[:100]):.2f}%")

## Weights interpretation
Higher weights mark the data points that the classifiers selected so far find hard to classify, so the next classifier has to pay more attention to those points.

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

def plot_weights(iteration_info):
    """Plot the sample weights of every boosting iteration and save the figure as HTML.

    ``iteration_info`` is a list of dicts; each one is read for its
    ``'iteration'`` number and its ``'weights'`` sequence. The figure is
    shown and written to ``boosting_output/weights.html``.
    """
    # One record per (iteration, data point) pair.
    records = [
        {'Data Point': idx, 'Weight': weight, 'Iteration': f"Iteration {info['iteration']}"}
        for info in iteration_info
        for idx, weight in enumerate(info['weights'])
    ]
    frame = pd.DataFrame(records)

    # One line per iteration.
    axis_labels = {'Data Point': 'Data Point', 'Weight': 'Weights', 'Iteration': 'Iterations'}
    fig = px.line(frame, x='Data Point', y='Weight', color='Iteration',
                  labels=axis_labels,
                  title='Weights Distribution Over Iterations')

    fig.show()
    fig.write_html(f"boosting_output/weights.html")

# Show how each sample's weight evolves across the recorded boosting iterations.
plot_weights(iteration_info)

## Alphas interpretation

- In the early iterations alpha values should be high as the model is learning and picking effective classifiers.
- In mid iterations alpha values might decrease or fluctuate as the model begins to deal with harder to classify instances.
- In late iterations alpha values might come to a plateau or become lower as each new classifier adds less incremental value.

In [None]:
def plot_alphas(iteration_info):
    """Plot the alpha of every boosting iteration and save the figure as HTML.

    ``iteration_info`` is a list of dicts; each one is read for its
    ``'iteration'`` number and its ``'alpha'`` value. The figure is shown
    and written to ``boosting_output/alphas.html``.
    """
    records = []
    for info in iteration_info:
        records.append({'Iteration': info['iteration'], 'Alpha': info['alpha']})
    frame = pd.DataFrame(records)

    # Line plot with a marker on every iteration.
    axis_labels = {'Iteration': 'Iteration', 'Alpha': 'Alpha'}
    fig = px.line(frame, x='Iteration', y='Alpha',
                  labels=axis_labels,
                  title='Alpha Values Over Iterations',
                  markers=True)

    fig.show()
    fig.write_html(f"boosting_output/alphas.html")


# Show the weight (alpha) assigned to the rule selected at each iteration.
plot_alphas(iteration_info)

In [None]:
def plot_classifier_errors(classifier_errors):
    """Plot the ensemble error after each iteration and save the figure as HTML.

    ``classifier_errors`` holds one error value per iteration; iterations
    are numbered from 1. The figure is shown and written to
    ``boosting_output/error.html``.
    """
    n_iterations = len(classifier_errors)
    frame = pd.DataFrame({
        'Iteration': list(range(1, n_iterations + 1)),
        'Classifier Error': classifier_errors
    })

    # Line plot with a marker on every iteration.
    axis_labels = {'Iteration': 'Iteration', 'Classifier Error': 'Classifier Error'}
    fig = px.line(frame, x='Iteration', y='Classifier Error',
                  labels=axis_labels,
                  title='Classifier Errors Over Iterations',
                  markers=True)

    fig.show()
    fig.write_html(f"boosting_output/error.html")

# Show the training error of the ensemble after each boosting iteration.
plot_classifier_errors(errors)

In [None]:
import seaborn as sns

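'''
Layout of the matrix returned by sklearn's confusion_matrix for labels sorted as [-1, 1]
(rows = true label, columns = predicted label, class 1 taken as "positive"):

[[TRUE NEGATIVE, FALSE POSITIVE],
 [FALSE NEGATIVE, TRUE POSITIVE]]

'''
## NOTE: this confusion matrix is computed on the training set, so it says little about generalization; for a meaningful evaluation, split the dataset into training and test sets.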
def plot_confusion_matrix(cm, class_names, title='Confusion Matrix', cmap='Blues'):
    """Draw ``cm`` as an annotated heatmap, save it as a PNG and show it.

    ``class_names`` labels both axes (x = predicted label, y = true label).
    The image is written to ``boosting_output/confusion_matrix.png``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    fig.savefig("boosting_output/confusion_matrix.png")
    plt.show()

# Axis labels follow the sorted label order [-1, 1]; the second name previously read '1]' (stray bracket).
plot_confusion_matrix(cm, class_names=['-1', '1'], title='Confusion Matrix', cmap='Blues')

In [None]:
# Accuracy of the final ensemble on the same 100 sequences it was trained on.
print(f"Accuracy: {accuracy}")