In [1]:
import pandas as pd
import numpy as np
import random
import math

In [2]:
# Define the Node class
class Node:
    def __init__(self):
        self.positive_child = None
        self.negative_child = None
        self.label = None
        self.rule = None

# Define helper functions
def create_node():
    return Node()


In [3]:
def determine_class_label(Tsv):
    #print("Tsv in determine_class_label:", Tsv[:5])  # Add this line to check the input
    if not Tsv:
        return None
    
    label_count = {}
    for ts, label in Tsv:
        if label in label_count:
            label_count[label] += 1
        else:
            label_count[label] = 1
    return max(label_count, key=label_count.get)


In [4]:
def set_node_label(node, label):
    node.label = label

def label_node(node, rule):
    node.rule = rule

def satisfies_rule(ts, rule):
    pts, index = rule
    if not isinstance(pts, (list, tuple)) or not isinstance(index, int):
        return False
    
    if index < 0 or index >= len(ts):
        return False
    
    subseq = ts[index: index + len(pts)]
    for i, (value, sign) in enumerate(pts):
        if (sign == '+' and subseq[i] < value) or (sign == '-' and subseq[i] > value):
            return False
    return True


In [5]:
def split_dataset(Tsv, rule):
    Tsv_positive = [(ts, label) for ts, label in Tsv if satisfies_rule(ts, rule)]
    Tsv_negative = [(ts, label) for ts, label in Tsv if not satisfies_rule(ts, rule)]
    #print("Tsv_positive:", Tsv_positive[:5])  # Add this line to check the output
    #print("Tsv_negative:", Tsv_negative[:5])  # Add this line to check the output
    return Tsv_positive, Tsv_negative


In [6]:
def check_support(pts, Tsv, min_support=0.1):
    count = sum(1 for ts, _ in Tsv if satisfies_rule(ts, (pts, 0)))
    support = count / len(Tsv)
    return support >= min_support

def calculate_confidence(pts, Tsv):
    satisfying_instances = [(ts, label) for ts, label in Tsv if satisfies_rule(ts, (pts, 0))]
    
    if not satisfying_instances:
        return 0.0
    
    correct_classifications = sum(1 for ts, label in satisfying_instances if determine_class_label([(ts, label)]) == label)
    
    confidence = correct_classifications / len(satisfying_instances)
    
    return confidence


In [7]:
def check_interesting_measures(pts, M, Tsv):
    support = check_support(pts, Tsv)
    confidence = calculate_confidence(pts, Tsv)
    
    return M['support'][0] <= support <= M['support'][1] and M['confidence'][0] <= confidence <= M['confidence'][1]


In [8]:
def generate_candidate_patterns(Tsv, M):
    if not Tsv:
        return None  # Handle case where Tsv is empty
    
    I = set()
    r = []
    
    while True:
        ts, label = random.choice(Tsv)
        i = random.choice([idx for idx in range(len(ts)) if idx not in I])
        I.add(i)
        
        ts_prime, _ = random.choice(Tsv)
        
        pattern_element = (ts[i], ts_prime[i] >= ts[i], i)
        r.append(pattern_element)
        
        sorted_I = sorted(I)
        pts = [(ts[idx], pattern_element[1]) for idx in sorted_I]
        
        if len(pts) < 2:
            continue
        
        if check_support(pts, Tsv) and check_interesting_measures(pts, M, Tsv):
            return pts


In [9]:
def calculate_loss(rpts, Tsv, L):
    loss = 0.0
    return loss

def select_best_pattern_rule(RPTSv, Tsv, L):
    best_rpts = None
    best_loss = float('inf')
    
    for rpts in RPTSv:
        loss = calculate_loss(rpts, Tsv, L)
        if loss < best_loss:
            best_loss = loss
            best_rpts = rpts
    
    return best_rpts


In [10]:
# Define the rpts_tree function
def rpts_tree(Ts, M, L, max_height, min_samples):
    def build_tree(Ts, M, L, current_height):
        v = create_node()
        
        Tsv = Ts
        #print("Tsv in build_tree:", Tsv[:5])  # Add this line to check the input
        
        class_label = determine_class_label(Tsv)
        if class_label is None:
            return v
        
        set_node_label(v, class_label)
        
        if len(Tsv) == 0:
            return v
        
        RPTSv = generate_candidate_patterns(Tsv, M)
        
        if RPTSv is None:
            return v
        
        rptsv = select_best_pattern_rule(RPTSv, Tsv, L)
        
        label_node(v, rptsv)
        
        Tsv_positive, Tsv_negative = split_dataset(Tsv, rptsv)
        
        if current_height >= max_height or len(Tsv) < min_samples:
            return v
        
        v.positive_child = build_tree(Tsv_positive, M, L, current_height + 1)
        v.negative_child = build_tree(Tsv_negative, M, L, current_height + 1)
        
        return v
    
    return build_tree(Ts, M, L, 0)


In [11]:
# Function to print the tree structure
def print_tree(node, level=0):
    indent = "  " * level
    if node is None:
        return
    print(f"{indent}Node: Label={node.label}, Rule={node.rule}")
    print_tree(node.positive_child, level + 1)
    print_tree(node.negative_child, level + 1)


In [12]:
# Load the dataset
#file_path = '/Users/majidtavakoli/Documents/Medical_bionformatics/decision support systems/Ecxersices/Final/Coffee/Coffee_TEST.txt'  # Replace with the actual file path if needed
#data = pd.read_csv(file_path)

# Assuming the 'Class' column contains the class label
#class_column = 'Class'

# Assuming time-series data are in a specific range of columns
# In this case, we'll assume the time-series data start from the 'Train' column to 'DTW (w=100)'
#time_series_columns = data.columns[3:12]  # Adjust the range according to your dataset

# Convert the dataframe to a list of tuples (time_series, label)
#Ts = [(row[time_series_columns].apply(pd.to_numeric, errors='coerce').fillna(0).values.tolist(), row[class_column]) for index, row in data.iterrows()]
#print("Converted Ts:", Ts[:5])  # Add this line to check the conversion

# Define the set of interesting measures M
#M = {
 #   'support': (0.1, 1.0),  # Example measure: Support should be between 10% and 100%
  #  'confidence': (0.5, 1.0)  # Example measure: Confidence should be between 50% and 100%
#}


#data_list = [ [float(f) for f in x[1].values[0].split()] for x in list(data.iterrows())  ]
#x = [l[1:] for l in data_list]
#y = [l[0] for l in data_list]
#x,y

In [13]:
import pandas as pd

# Step 1: Define the path to the .txt file
file_path = '/Users/majidtavakoli/Documents/Medical_bionformatics/decision support systems/Ecxersices/Final/Coffee/Coffee_TEST.txt'

# Step 2: Read the .txt file without headers (header=None)
data = pd.read_csv(file_path, delimiter='\t', header=None)

# Step 3: Print the first few rows to check the structure of the dataset
print("First few rows of the dataset:")
print(data.head())

# Step 4: Assuming the last column contains the class labels
class_column = data.columns[-1]  # Assuming the last column contains the class labels

# Step 5: Assuming time-series data are in the first N columns (replace with the actual range)
time_series_columns = data.columns[:-1]  # All columns except the last one are time-series data

# Step 6: Convert the dataframe to a list of tuples (time_series, label)
Ts = [(row[time_series_columns].apply(pd.to_numeric, errors='coerce').fillna(0).values.tolist(), row[class_column]) 
      for index, row in data.iterrows()]

# Step 7: Print the first few examples of the converted time-series data
print("Converted Ts:", Ts[:5])

# Step 8: Optionally, separate the time-series data (X) and labels (Y)
X = [t[0] for t in Ts]  # Time-series data
Y = [t[1] for t in Ts]  # Labels

# Print first 5 examples of X and Y for verification
print("First 5 X (Time-series):", X[:5])
print("First 5 Y (Labels):", Y[:5])

# Step 9: Define the set of interesting measures M (customize these ranges as needed)
M = {
    'support': (0.1, 1.0),  # Support should be between 10% and 100%
    'confidence': (0.5, 1.0)  # Confidence should be between 50% and 100%
}

# Print the set of measures M to verify
print("Interesting measures M:", M)


First few rows of the dataset:
                                                   0
0     0.0000000e+00  -5.7437159e-01  -5.2338044e-...
1     0.0000000e+00  -5.9474332e-01  -5.4580966e-...
2     0.0000000e+00  -6.3032320e-01  -5.8714820e-...
3     0.0000000e+00  -6.3373679e-01  -5.6094286e-...
4     0.0000000e+00  -5.7732118e-01  -5.5932569e-...
Converted Ts: [([], '   0.0000000e+00  -5.7437159e-01  -5.2338044e-01  -4.4876379e-01  -4.6651461e-01  -5.9004311e-01  -7.3918779e-01  -8.0091704e-01  -8.1514776e-01  -8.4054255e-01  -8.6630830e-01  -8.7463199e-01  -8.4164884e-01  -8.2459755e-01  -8.2119474e-01  -8.7320791e-01  -9.5264845e-01  -9.9157859e-01  -9.9606863e-01  -9.7311994e-01  -9.6290781e-01  -9.7581260e-01  -9.9099643e-01  -1.0034398e+00  -1.0318264e+00  -1.0288549e+00  -1.0003457e+00  -9.6893924e-01  -9.3857634e-01  -9.4098649e-01  -9.4073583e-01  -9.5839386e-01  -9.8225027e-01  -9.7491265e-01  -9.7035137e-01  -9.4754631e-01  -9.4660448e-01  -9.2715433e-01  -8.5785469e-01  -8.4

In [14]:
# Define the loss function L (for example, information gain)
def information_gain(parent, positive_child, negative_child):
    def entropy(class_counts):
        total = sum(class_counts)
        return -sum((count / total) * math.log2(count / total) for count in class_counts if count != 0)
    
    parent_entropy = entropy([count for _, count in parent.items()])
    positive_entropy = entropy([count for _, count in positive_child.items()])
    negative_entropy = entropy([count for _, count in negative_child.items()])
    
    positive_weight = sum(positive_child.values()) / sum(parent.values())
    negative_weight = sum(negative_child.values()) / sum(parent.values())
    
    return parent_entropy - (positive_weight * positive_entropy + negative_weight * negative_entropy)

L = information_gain


In [15]:
def tree():
    print("this is a tree function")

tree()    

this is a tree function


In [16]:
def print_tree(node, depth=0):
    """
    Recursively prints the tree structure.
    
    :param node: The current node in the tree.
    :param depth: The current depth of the node (used for indentation).
    """
    if node is None:
        return
    
    # Print the current node
    print("  " * depth + str(node))
    
    # Recursively print the children (assuming binary tree)
    if hasattr(node, 'positive'):
        print_tree(node.positive, depth + 1)
    if hasattr(node, 'negative'):
        print_tree(node.negative, depth + 1)

# Example usage:
# Assuming 'tree' is the root of your tree structure
print_tree(tree)


<function tree at 0x10710c860>


In [17]:
import random

def generate_candidate_patterns(Tsv, M):
    # Initialize an empty set to store chosen indices
    I = set()

    # Keep choosing patterns until valid indices are found
    while True:
        ts, label = random.choice(Tsv)

        # Get available indices not already in I
        available_indices = [idx for idx in range(len(ts)) if idx not in I]

        # Check if there are any available indices left
        if not available_indices:
            # Handle the case where no indices are left
            print("No available indices left to choose from.")
            return None  # Or handle in some other appropriate way

        # Choose an index from available indices
        i = random.choice(available_indices)
        I.add(i)

        # Generate ts_prime as before
        ts_prime, _ = random.choice(Tsv)

        # Continue your logic here...


In [18]:

# Define maximum height of the tree and minimum samples per leaf
max_height = 5
min_samples = 10

# Build the tree
tree = rpts_tree(Ts, M, L, max_height, min_samples)

# Print the tree structure
print_tree(tree)


No available indices left to choose from.
<__main__.Node object at 0x1070c6e40>


In [19]:
print(tree)

<__main__.Node object at 0x1070c6e40>
