In [16]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def entropy(labels):
    unique_labels, counts = np.unique(labels, return_counts=True)
    probabilities = counts / len(labels)
    entropy_value = -np.sum(probabilities * np.log2(probabilities))
    return entropy_value

def information_gain(y, x):
    parent_entropy = entropy(y)
    info_a = 0
    for value in set(x):
        partition_indices = x[x == value].index
        partition_entropy = entropy(y[partition_indices])
        info_a += len(partition_indices) / len(x) * partition_entropy
    gain_a = parent_entropy - info_a
    return gain_a

def decision_tree(X_train, y_train, max_depth, current_depth=0):
    if len(set(y_train)) == 1 or current_depth == max_depth or len(X_train.columns) == 0:
        class_counts = Counter(y_train)
        majority_class = class_counts.most_common(1)[0][0]
        return {"class_label": majority_class}
    
    gains = {}
    for attr in X_train.columns:
        if X_train[attr].dtype == 'object':
            gains[attr] = information_gain(y_train, X_train[attr])
        else:
            # For numerical attributes
            attr_mean = X_train[attr].mean()
            partition_indices_left = X_train[X_train[attr] <= attr_mean].index
            partition_indices_right = X_train[X_train[attr] > attr_mean].index
            partition_entropy_left = entropy(y_train[partition_indices_left])
            partition_entropy_right = entropy(y_train[partition_indices_right])
            weight_left = len(partition_indices_left) / len(X_train)
            weight_right = len(partition_indices_right) / len(X_train)
            info_a = weight_left * partition_entropy_left + weight_right * partition_entropy_right
            gains[attr] = entropy(y_train) - info_a
    
    best_attr = max(gains, key=gains.get)
    node = {"attribute": best_attr, "leaf": {}}
    
    if X_train[best_attr].dtype == 'object':
        unique_values = X_train[best_attr].unique()
        for value in unique_values:
            partition_indices = X_train[X_train[best_attr] == value].index
            node["leaf"][value] = decision_tree(X_train.loc[partition_indices], y_train.loc[partition_indices], max_depth, current_depth + 1)
    else:
        attr_mean = X_train[best_attr].mean()
        partition_indices_left = X_train[X_train[best_attr] <= attr_mean].index
        partition_indices_right = X_train[X_train[best_attr] > attr_mean].index
        node["leaf"]["<= " + str(attr_mean)] = decision_tree(X_train.loc[partition_indices_left], y_train.loc[partition_indices_left], max_depth, current_depth + 1)
        node["leaf"]["> " + str(attr_mean)] = decision_tree(X_train.loc[partition_indices_right], y_train.loc[partition_indices_right], max_depth, current_depth + 1)
    
    return node

def classify_random_forest(trees, subsampled_attributes, X_test):
    class_labels = []
    for _, test_row in X_test.iterrows():
        tree_votes = []
        for tree, sub_attributes in zip(trees, subsampled_attributes):
            test_features = pd.DataFrame(test_row[sub_attributes]).T
            predicted_label = classify(tree, test_features)
            tree_votes.append(predicted_label[0])  # Append predicted label
        class_labels.append(max(set(tree_votes), key=tree_votes.count))  # Perform majority voting
    return class_labels

def classify(tree, features):
    class_labels = []
    for _, feature in features.iterrows():
        node = tree
        while "class_label" not in node:
            split_attr = node["attribute"]
            feature_value = feature[split_attr]
            if feature_value in node["leaf"]:
                node = node["leaf"][feature_value]
            else:
                print("Feature value not found in leaf nodes. Node:", node)
                temp = max(node["leaf"].items(), key=lambda x: len(x[1]))[0]
                print("class_labels: ",temp,"node[class_labels]: ",node["class_label"])
                class_labels.append(max(node["leaf"].items(), key=lambda x: len(x[1]))[0])
                break
        else:
            if "class_label" in node:
                class_labels.append(node["class_label"])
            else:
                class_labels.append(max(node["leaf"].items(), key=lambda x: len(x[1]))[0])
    return class_labels


# def classify(tree, features):
#     class_labels = []
#     for _, feature in features.iterrows():
#         node = tree
#         while "class_label" not in node:
#             split_attr = node["attribute"]
#             feature_value = feature[split_attr]
#             if feature_value in node["leaf"]:
#                 node = node["leaf"][feature_value]
#             else:
#                 print("Feature value not found in leaf nodes. Node:", node)
#                 print("class_labels: ",class_labels,"node[class_labels]: ",node["class_labels"])
#                 # If feature value not found in leaf node, return class label of the majority class in that node
#                 class_labels.append(max(node["leaf"].items(), key=lambda x: len(x[1]))[0])
#                 break
#         else:
#             # Reached leaf node, append the class label
#             class_labels.append(node["class_label"])
#     return class_labels



def bootstrap_sampling(X, y):
    indices = np.random.choice(len(X), size=len(X), replace=True)
    return X.iloc[indices], y.iloc[indices]

def fit_random_forest(num_trees, max_depth, example_subsample_rate, attr_subsample_rate, X_train, y_train):
    trees = []
    subsampled_attributes = []

    for i in range(num_trees):
        # Bootstrap sampling to create a bootstrapped dataset
        bootstrapped_X, bootstrapped_y = bootstrap_sampling(X_train, y_train)

        # Subsample attributes
        subsampled_attr_indexes = np.random.choice(range(X_train.shape[1]), int(X_train.shape[1] * attr_subsample_rate), replace=False)
        subsampled_attributes.append(subsampled_attr_indexes.tolist())
        subsampled_X = bootstrapped_X.iloc[:, subsampled_attr_indexes]

        # Build decision tree using the bootstrapped and subsampled dataset
        tree = decision_tree(subsampled_X, bootstrapped_y, max_depth)
        trees.append(tree)

    return trees, subsampled_attributes

def confusion_matrix(y_true, y_pred):
    print("y_true: ",y_true,"y_pred: ",y_pred)
    classes = np.unique(np.concatenate([y_true, y_pred]))
    n_classes = len(classes)
    conf_matrix = np.zeros((n_classes, n_classes), dtype=int)

    for i, true_label in enumerate(classes):
        for j, pred_label in enumerate(classes):
            conf_matrix[i, j] = np.sum((y_true == true_label) & (y_pred == pred_label))

    return conf_matrix

def calculate_metrics(conf_matrix):
    TP = np.diag(conf_matrix)
    FP = np.sum(conf_matrix, axis=0) - TP
    FN = np.sum(conf_matrix, axis=1) - TP
    TN = np.sum(conf_matrix) - (TP + FP + FN)

    accuracy = np.sum(TP) / np.sum(conf_matrix)
    
    precision = np.where(TP + FP == 0, 0, TP / (TP + FP))
    recall = np.where(TP + FN == 0, 0, TP / (TP + FN))
    
    # Handle division by zero or empty slices for F1-score
    f1_score = np.zeros_like(precision)
    non_zero_denominator = (precision + recall) != 0
    f1_score[non_zero_denominator] = 2 * (precision[non_zero_denominator] * recall[non_zero_denominator]) / (precision[non_zero_denominator] + recall[non_zero_denominator])
    
    return accuracy, precision, recall, f1_score


def stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate):
    fold_size = len(X) // n_folds
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    conf_matrices = []
    all_predictions = []

    for i in range(n_folds):
        start = i * fold_size
        end = (i + 1) * fold_size

        X_train_fold = pd.concat([X[:start], X[end:]])
        y_train_fold = pd.concat([y[:start], y[end:]])

        X_validation_fold = X[start:end]
        y_validation_fold = y[start:end]
        
        trees, subsampled_attributes = fit_random_forest(num_trees, max_depth, example_subsample_rate, attr_subsample_rate, X_train_fold, y_train_fold)
        predictions = classify_random_forest(trees, subsampled_attributes, X_validation_fold)
        
        all_predictions.extend(predictions)
        
        # Convert y_validation_fold to list
        y_validation_fold_list = y_validation_fold.tolist()
        
        # Calculate metrics
        conf_matrix = confusion_matrix(y_validation_fold_list, predictions)
        acc, prec, rec, f1 = calculate_metrics(conf_matrix)
        
        accuracies.append(acc)
        precisions.append(prec)
        recalls.append(rec)
        f1_scores.append(f1)
        
        conf_matrices.append(conf_matrix)

        mean_accuracy = np.mean(accuracies)
        mean_precision = np.mean([np.mean(precision, axis=0) for precision in precisions], axis=0)
        mean_recall = np.mean([np.mean(recall, axis=0) for recall in recalls], axis=0)
        mean_f1_score = np.nanmean([np.nanmean(f1_score, axis=0) for f1_score in f1_scores], axis=0)

    return mean_accuracy, mean_precision, mean_recall, mean_f1_score, conf_matrices
        
        

if __name__ == "__main__":
    df_wine = pd.read_csv("/Users/noshitha/Downloads/hw3/datasets/hw3_wine.csv", delimiter="\t")

    # Shuffle the dataset
    df_wine_shuffle = shuffle(df_wine)

    # Split the dataset into features and target variable
    X = df_wine_shuffle.iloc[:, 1:]  # Assuming the first column is the target variable
    y = df_wine_shuffle.iloc[:, 0] 
    
    #n_trees_list = [1, 5, 10, 20, 30, 40, 50]
    n_trees_list = [1]
    n_folds = 10
    max_depth = 3
    example_subsample_rate = 0.5
    attr_subsample_rate = 0.5
    
    accuracy  = []
    precision = []
    recall    = []
    f1_score  = []
    
    for num_trees in n_trees_list:
        print("num_trees: ",num_trees)
        accuracies, precisions, recalls, f1_scores, conf_matrices = stratified_cross_validation(X, y, n_folds, num_trees, max_depth, example_subsample_rate, attr_subsample_rate)
        print("Accuracies:", accuracies)
        print("Precisions:", precisions)
        print("Recalls:", recalls)
        print("F1-scores:", f1_scores)
        print("conf_matrices: ",conf_matrices)
        accuracy.append(accuracies)
        precision.append(precisions)
        recall.append(recalls)
        f1_score.append(f1_scores)
    

num_trees:  1
Feature value not found in leaf nodes. Node: {'attribute': '13', 'leaf': {'<= 749.111801242236': {'attribute': '8', 'leaf': {'<= 0.3913567839195981': {'attribute': '1', 'leaf': {'<= 12.902106060605918': {'class_label': 2}, '> 12.902106060605918': {'class_label': 2}}}, '> 0.3913567839195981': {'attribute': '2', 'leaf': {'<= 3.176329704510097': {'class_label': 3}, '> 3.176329704510097': {'class_label': 3}}}}}, '> 749.111801242236': {'attribute': '8', 'leaf': {'<= 0.3379166666666667': {'attribute': '1', 'leaf': {'<= 13.517655310621155': {'class_label': 1}, '> 13.517655310621155': {'class_label': 1}}}, '> 0.3379166666666667': {'attribute': '2', 'leaf': {'<= 2.2331876606683827': {'class_label': 1}, '> 2.2331876606683827': {'class_label': 3}}}}}}}


KeyError: 'class_label'

In [3]:
df_wine.head()

Unnamed: 0,# class,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
107,12.72,1.75,2.28,22.5,84,1.38,1.76,0.48,1.63,3.3,0.88,2.42,488
153,13.23,3.3,2.28,18.5,98,1.8,0.83,0.61,1.87,10.52,0.56,1.51,675
102,12.34,2.45,2.46,21.0,98,2.56,2.11,0.34,1.31,2.8,0.8,3.38,438
33,13.76,1.53,2.7,19.5,132,2.95,2.74,0.5,1.35,5.4,1.25,3.0,1235
122,12.42,4.43,2.73,26.5,102,2.2,2.13,0.43,1.71,2.08,0.92,3.12,365


In [5]:
y.head()

107    2
153    3
102    2
33     1
122    2
Name: # class, dtype: int64

In [6]:
print(conf_matrices)

NameError: name 'conf_matrices' is not defined

### Accuracy

In [None]:
plt.plot(n_trees_list, accuracy, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Number of Trees - House Votes Dataset')
plt.grid(True)
plt.show()

### Precision

In [None]:
plt.plot(n_trees_list, precision, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('Precision'))
plt.title('Precision vs Number of Trees - House Votes Dataset')
plt.grid(True)
plt.show()

### Recall

In [None]:
plt.plot(n_trees_list, precision, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('Recall'))
plt.title('Recall vs Number of Trees - House Votes Dataset')
plt.grid(True)
plt.show()

### F1_score

In [None]:
plt.plot(n_trees_list, precision, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('F1_score'))
plt.title('F1_score vs Number of Trees - House Votes Dataset')
plt.grid(True)
plt.show()