# K-Nearest Neighbors Algorithm Implementation in Python

In [2]:
import pickle
import time
import numpy as np
import pandas as pd
from numpy.linalg import norm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
# Load the dataset from disk. allow_pickle=True lets NumPy unpickle object
# arrays, which can execute arbitrary code - only load a trusted data.npy.
data = np.load('data.npy', allow_pickle=True)
# Columns per row: game_id, resnet (1 x 1024), vit (1 x 512), label_name, guess time

# Plot of Distribution of Labels

In [7]:
# Count how often each label (column 3 of `data`) occurs.
data_labels = np.array(data[:, 3])
values, counts = np.unique(data_labels, return_counts=True)

# Draw one horizontal bar per distinct label; the figure is tall so that the
# label names on the y-axis do not overlap.
plt.figure(figsize=(10, 40))
plt.barh(values, counts)
plt.title('Distribution of the various labels')
plt.xlabel('Frequency')
plt.ylabel('Label')
plt.tight_layout()
plt.show()

# Initial KNN Model

In [None]:
class Initial_KNN:
    """Baseline k-nearest-neighbours classifier over precomputed embeddings.

    Distances are computed one training sample at a time in a Python loop
    (see ``predict_sample``). A prediction is the most frequent label among
    the ``k`` nearest training samples.
    """

    def __init__(self, k, encoder_type, metric, ratio=0.8):
        """Store the hyperparameters.

        Args:
            k: Number of neighbours that vote on a prediction.
            encoder_type: Embedding to use, 'vit' or 'resnet'.
            metric: 'manhattan', 'euclidean' or 'cosine'.
            ratio: Fraction of the samples placed in the training split.
        """
        self.k = k
        self.encoder_type = encoder_type
        self.metric = metric
        self.measure = 'mode'
        self.ratio = ratio

    def get_encoder_type(self):
        """Return the configured encoder type."""
        return self.encoder_type

    def set_encoder_type(self, encoder_type):
        """Set the encoder type used by later calls that read it."""
        self.encoder_type = encoder_type

    def get_metric(self):
        """Return the configured distance metric name."""
        return self.metric

    def set_metric(self, metric):
        """Set the distance metric name."""
        self.metric = metric

    def get_k(self):
        """Return the number of neighbours."""
        return self.k

    def set_k(self, k):
        """Set the number of neighbours."""
        self.k = k

    def get_ratio(self):
        """Return the training split ratio."""
        return self.ratio

    def set_ratio(self, ratio):
        """Set the training split ratio used by the next ``data_split``."""
        self.ratio = ratio

    def get_measure(self, A):
        """Return the most frequent value in ``A``.

        ``np.unique`` returns sorted values and ``np.argmax`` picks the first
        maximum, so ties resolve to the smallest value. Returns ``None`` when
        ``self.measure`` is not 'mode'.
        """
        if self.measure == 'mode':
            unique_values, counts = np.unique(np.array(A), return_counts=True)
            return unique_values[np.argmax(counts)]
        return None

    def get_distance(self, E1, E2):
        """Return the distance between two single embeddings.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric.
        """
        if self.metric == 'manhattan':
            return np.sum(np.abs(np.array(E1) - np.array(E2)))
        elif self.metric == 'euclidean':
            return np.sqrt(np.sum(np.square(np.array(E1) - np.array(E2))))
        elif self.metric == 'cosine':
            return 1 - (np.dot(E1, E2)) / (norm(E1) * norm(E2))
        else:
            raise ValueError("Invalid metric")

    def fit(self, train_embeddings, train_labels, validate_embeddings, validate_labels):
        """Store the training and validation splits on the instance."""
        self.train_embeddings = train_embeddings
        self.train_labels = train_labels
        self.validate_embeddings = validate_embeddings
        self.validate_labels = validate_labels

    def data_split(self, data):
        """Extract labels/embeddings from ``data`` and draw a shuffled split.

        Column 1 is read as the ResNet embedding, column 2 as the ViT
        embedding and column 3 as the label; each embedding cell is indexed
        with ``[0]`` to take its first row. The shuffled index order and the
        training-set size are stored for ``train``.
        """
        self.labels = np.array(data[:, 3])
        self.embeddings = np.array(data[:, 1:3])
        self.resnet = np.array([res[0] for res in np.array(data[:, 1])])
        self.vit = np.array([v[0] for v in np.array(data[:, 2])])
        num_total_samples = data.shape[0]
        indices = np.array(range(num_total_samples))
        np.random.shuffle(indices)  # random permutation of [0, ..., n-1]
        self.indices = indices
        self.num_training_samples = int(num_total_samples * self.ratio)

    def evaluate(self, embeddings, true_labels):
        """Predict ``embeddings`` and score them against ``true_labels``.

        Returns:
            Tuple ``(f1, accuracy, precision, recall)``; f1, precision and
            recall are macro-averaged.
        """
        predicted_labels = self.predict_array(embeddings)
        # zero_division=0 on every macro metric keeps them consistent for
        # labels that are never predicted.
        F1_score = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)
        accuracy = accuracy_score(true_labels, predicted_labels)
        precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
        recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
        return F1_score, accuracy, precision, recall

    def train(self, encoder_type):
        """Select the embeddings for ``encoder_type`` and store the split.

        Requires ``data_split`` to have been called first.

        Raises:
            ValueError: If ``encoder_type`` is neither 'vit' nor 'resnet'.
        """
        if encoder_type == 'vit':
            features = self.vit
        elif encoder_type == 'resnet':
            features = self.resnet
        else:
            # Previously an unknown encoder was silently ignored, leaving
            # stale (or missing) splits on the instance.
            raise ValueError("Invalid encoder type")
        train_index = self.indices[:self.num_training_samples]
        validate_index = self.indices[self.num_training_samples:]
        self.fit(
            features[train_index],
            self.labels[train_index],
            features[validate_index],
            self.labels[validate_index],
        )

    def predict_sample(self, E):
        """Return the predicted label for a single embedding ``E``."""
        distances = [self.get_distance(embedding, E) for embedding in self.train_embeddings]
        nearest_index = np.argsort(distances)[:self.k]
        return self.get_measure(self.train_labels[nearest_index])

    def predict_array(self, X):
        """Return an array with one predicted label per embedding in ``X``."""
        return np.array([self.predict_sample(embedding) for embedding in X])

    def print_answer(self, dataset=None):
        """Split, train, evaluate on the validation split and print metrics.

        Args:
            dataset: Array to split. When omitted, the module-level ``data``
                is used, which keeps the zero-argument call working.
        """
        self.data_split(data if dataset is None else dataset)
        self.train(self.encoder_type)
        F1_score, accuracy, precision, recall = self.evaluate(self.validate_embeddings, self.validate_labels)
        print_data = [['Accuracy', accuracy], ['Precision', precision], ['Recall', recall], ['f1_score', F1_score]]
        df = pd.DataFrame(print_data, columns=['Measure', 'Value'])
        print(df.to_string(index=False))
    
# 3-NN on the ViT embeddings with Manhattan distance. print_answer() reads the
# module-level `data` on its own, so no argument is passed to it.
Initial_KNN_Object = Initial_KNN(3, 'vit', 'manhattan')
Initial_KNN_Object.print_answer()

# Most Optimized KNN Model

In [None]:
class Optimized_KNN:
    """k-nearest-neighbours classifier with vectorised distance computation.

    ``get_distance`` computes the distance from one query embedding to every
    training embedding in a single NumPy call. A prediction is the most
    frequent label among the ``k`` nearest training samples.
    """

    def __init__(self, k, encoder_type, metric, ratio=0.8):
        """Store the hyperparameters.

        Args:
            k: Number of neighbours that vote on a prediction.
            encoder_type: Embedding to use, 'vit' or 'resnet'.
            metric: 'manhattan', 'euclidean' or 'cosine'.
            ratio: Fraction of the samples placed in the training split.
        """
        self.k = k
        self.encoder_type = encoder_type
        self.metric = metric
        self.measure = 'mode'
        self.ratio = ratio

    def get_encoder_type(self):
        """Return the configured encoder type."""
        return self.encoder_type

    def set_encoder_type(self, encoder_type):
        """Set the encoder type used by later calls that read it."""
        self.encoder_type = encoder_type

    def get_metric(self):
        """Return the configured distance metric name."""
        return self.metric

    def set_metric(self, metric):
        """Set the distance metric name."""
        self.metric = metric

    def get_k(self):
        """Return the number of neighbours."""
        return self.k

    def set_k(self, k):
        """Set the number of neighbours."""
        self.k = k

    def get_ratio(self):
        """Return the training split ratio."""
        return self.ratio

    def set_ratio(self, ratio):
        """Set the training split ratio used by the next ``data_split``."""
        self.ratio = ratio

    def get_measure(self, A):
        """Return the most frequent value in ``A``.

        ``np.unique`` returns sorted values and ``np.argmax`` picks the first
        maximum, so ties resolve to the smallest value. Returns ``None`` when
        ``self.measure`` is not 'mode'.
        """
        if self.measure == 'mode':
            unique_values, counts = np.unique(np.array(A), return_counts=True)
            return unique_values[np.argmax(counts)]
        return None

    def get_distance(self, E1, E2):
        """Return the distance from ``E2`` to every row of ``E1``.

        Args:
            E1: 2-D collection of embeddings, one per row (norms are taken
                along axis 1).
            E2: A single embedding.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric.
        """
        rows = np.array(E1)
        query = np.array(E2)
        if self.metric == 'manhattan':
            return norm(rows - query, axis=1, ord=1)
        elif self.metric == 'euclidean':
            return norm(rows - query, axis=1, ord=2)
        elif self.metric == 'cosine':
            return 1 - (np.dot(rows, query)) / (norm(rows, axis=1) * norm(query))
        else:
            raise ValueError("Invalid metric")

    def fit(self, train_embeddings, train_labels, validate_embeddings, validate_labels):
        """Store the training and validation splits on the instance."""
        self.train_embeddings = train_embeddings
        self.train_labels = train_labels
        self.validate_embeddings = validate_embeddings
        self.validate_labels = validate_labels

    def data_split(self, data):
        """Extract labels/embeddings from ``data`` and draw a shuffled split.

        Column 1 is read as the ResNet embedding, column 2 as the ViT
        embedding and column 3 as the label; each embedding cell is indexed
        with ``[0]`` to take its first row. The shuffled index order and the
        training-set size are stored for ``train``.
        """
        self.labels = np.array(data[:, 3])
        self.embeddings = np.array(data[:, 1:3])
        self.resnet = np.array([res[0] for res in np.array(data[:, 1])])
        self.vit = np.array([v[0] for v in np.array(data[:, 2])])
        num_total_samples = data.shape[0]
        indices = np.array(range(num_total_samples))
        np.random.shuffle(indices)  # random permutation of [0, ..., n-1]
        self.indices = indices
        self.num_training_samples = int(num_total_samples * self.ratio)

    def evaluate(self, embeddings, true_labels):
        """Predict ``embeddings`` and score them against ``true_labels``.

        Returns:
            Tuple ``(f1, accuracy, precision, recall)``; f1, precision and
            recall are macro-averaged.
        """
        predicted_labels = self.predict_array(embeddings)
        # zero_division=0 on every macro metric keeps them consistent for
        # labels that are never predicted.
        F1_score = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)
        accuracy = accuracy_score(true_labels, predicted_labels)
        precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
        recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
        return F1_score, accuracy, precision, recall

    def train(self, encoder_type):
        """Select the embeddings for ``encoder_type`` and store the split.

        Requires ``data_split`` to have been called first.

        Raises:
            ValueError: If ``encoder_type`` is neither 'vit' nor 'resnet'.
        """
        if encoder_type == 'vit':
            features = self.vit
        elif encoder_type == 'resnet':
            features = self.resnet
        else:
            # Previously an unknown encoder was silently ignored, leaving
            # stale (or missing) splits on the instance.
            raise ValueError("Invalid encoder type")
        train_index = self.indices[:self.num_training_samples]
        validate_index = self.indices[self.num_training_samples:]
        self.fit(
            features[train_index],
            self.labels[train_index],
            features[validate_index],
            self.labels[validate_index],
        )

    def predict_sample(self, E):
        """Return the predicted label for a single embedding ``E``."""
        distances = self.get_distance(self.train_embeddings, E)
        nearest_index = np.argsort(distances)[:self.k]
        return self.get_measure(self.train_labels[nearest_index])

    def predict_array(self, X):
        """Return an array with one predicted label per embedding in ``X``."""
        return np.array([self.predict_sample(embedding) for embedding in X])

    def print_answer(self, dataset=None):
        """Split, train, evaluate on the validation split and print metrics.

        Args:
            dataset: Array to split. When omitted, the module-level ``data``
                is used, which keeps the zero-argument call working.
        """
        self.data_split(data if dataset is None else dataset)
        self.train(self.encoder_type)
        F1_score, accuracy, precision, recall = self.evaluate(self.validate_embeddings, self.validate_labels)
        print_data = [['Accuracy', accuracy], ['Precision', precision], ['Recall', recall], ['f1_score', F1_score]]
        df = pd.DataFrame(print_data, columns=['Measure', 'Value'])
        print(df.to_string(index=False))
    
# 3-NN on the ViT embeddings with Manhattan distance. print_answer() reads the
# module-level `data` on its own, so no argument is passed to it.
Optimized_KNN_Object = Optimized_KNN(3, 'vit', 'manhattan')
Optimized_KNN_Object.print_answer()

# Best KNN Model

In [None]:
class Best_KNN:
    """k-nearest-neighbours classifier using vectorised distances.

    The distance from a query embedding to all training embeddings is
    computed in one NumPy call; the predicted label is the most frequent
    label among the ``k`` nearest training samples.
    """

    def __init__(self, k, encoder_type, metric, ratio=0.8):
        """Store the hyperparameters.

        Args:
            k: Number of neighbours that vote on a prediction.
            encoder_type: Embedding to use, 'vit' or 'resnet'.
            metric: 'manhattan', 'euclidean' or 'cosine'.
            ratio: Fraction of the samples placed in the training split.
        """
        self.k = k
        self.encoder_type = encoder_type
        self.metric = metric
        self.measure = 'mode'
        self.ratio = ratio

    def get_encoder_type(self):
        """Return the configured encoder type."""
        return self.encoder_type

    def set_encoder_type(self, encoder_type):
        """Set the encoder type used by later calls that read it."""
        self.encoder_type = encoder_type

    def get_metric(self):
        """Return the configured distance metric name."""
        return self.metric

    def set_metric(self, metric):
        """Set the distance metric name."""
        self.metric = metric

    def get_k(self):
        """Return the number of neighbours."""
        return self.k

    def set_k(self, k):
        """Set the number of neighbours."""
        self.k = k

    def get_ratio(self):
        """Return the training split ratio."""
        return self.ratio

    def set_ratio(self, ratio):
        """Set the training split ratio used by the next ``data_split``."""
        self.ratio = ratio

    def get_measure(self, A):
        """Return the most frequent value in ``A``.

        ``np.unique`` returns sorted values and ``np.argmax`` picks the first
        maximum, so ties resolve to the smallest value. Returns ``None`` when
        ``self.measure`` is not 'mode'.
        """
        if self.measure == 'mode':
            unique_values, counts = np.unique(np.array(A), return_counts=True)
            return unique_values[np.argmax(counts)]
        return None

    def get_distance(self, E1, E2):
        """Return the distance from ``E2`` to every row of ``E1``.

        Args:
            E1: 2-D collection of embeddings, one per row (norms are taken
                along axis 1).
            E2: A single embedding.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric.
        """
        rows = np.array(E1)
        query = np.array(E2)
        if self.metric == 'manhattan':
            return norm(rows - query, axis=1, ord=1)
        elif self.metric == 'euclidean':
            return norm(rows - query, axis=1, ord=2)
        elif self.metric == 'cosine':
            return 1 - (np.dot(rows, query)) / (norm(rows, axis=1) * norm(query))
        else:
            raise ValueError("Invalid metric")

    def fit(self, train_embeddings, train_labels, validate_embeddings, validate_labels):
        """Store the training and validation splits on the instance."""
        self.train_embeddings = train_embeddings
        self.train_labels = train_labels
        self.validate_embeddings = validate_embeddings
        self.validate_labels = validate_labels

    def data_split(self, data):
        """Extract labels/embeddings from ``data`` and draw a shuffled split.

        Column 1 is read as the ResNet embedding, column 2 as the ViT
        embedding and column 3 as the label; each embedding cell is indexed
        with ``[0]`` to take its first row. The shuffled index order and the
        training-set size are stored for ``train``.
        """
        self.labels = np.array(data[:, 3])
        self.embeddings = np.array(data[:, 1:3])
        self.resnet = np.array([res[0] for res in np.array(data[:, 1])])
        self.vit = np.array([v[0] for v in np.array(data[:, 2])])
        num_total_samples = data.shape[0]
        indices = np.array(range(num_total_samples))
        np.random.shuffle(indices)  # random permutation of [0, ..., n-1]
        self.indices = indices
        self.num_training_samples = int(num_total_samples * self.ratio)

    def evaluate(self, embeddings, true_labels):
        """Predict ``embeddings`` and score them against ``true_labels``.

        Returns:
            Tuple ``(f1, accuracy, precision, recall)``; f1, precision and
            recall are macro-averaged.
        """
        predicted_labels = self.predict_array(embeddings)
        # zero_division=0 on every macro metric keeps them consistent for
        # labels that are never predicted.
        F1_score = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)
        accuracy = accuracy_score(true_labels, predicted_labels)
        precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
        recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
        return F1_score, accuracy, precision, recall

    def train(self, encoder_type):
        """Select the embeddings for ``encoder_type`` and store the split.

        Requires ``data_split`` to have been called first.

        Raises:
            ValueError: If ``encoder_type`` is neither 'vit' nor 'resnet'.
        """
        if encoder_type == 'vit':
            features = self.vit
        elif encoder_type == 'resnet':
            features = self.resnet
        else:
            # Previously an unknown encoder was silently ignored, leaving
            # stale (or missing) splits on the instance.
            raise ValueError("Invalid encoder type")
        train_index = self.indices[:self.num_training_samples]
        validate_index = self.indices[self.num_training_samples:]
        self.fit(
            features[train_index],
            self.labels[train_index],
            features[validate_index],
            self.labels[validate_index],
        )

    def predict_sample(self, E):
        """Return the predicted label for a single embedding ``E``."""
        distances = self.get_distance(self.train_embeddings, E)
        nearest_index = np.argsort(distances)[:self.k]
        return self.get_measure(self.train_labels[nearest_index])

    def predict_array(self, X):
        """Return an array with one predicted label per embedding in ``X``."""
        return np.array([self.predict_sample(embedding) for embedding in X])

    def print_answer(self, dataset=None):
        """Split, train, evaluate on the validation split and print metrics.

        Args:
            dataset: Array to split. When omitted, the module-level ``data``
                is used, which keeps the zero-argument call working.
        """
        self.data_split(data if dataset is None else dataset)
        self.train(self.encoder_type)
        F1_score, accuracy, precision, recall = self.evaluate(self.validate_embeddings, self.validate_labels)
        print_data = [['Accuracy', accuracy], ['Precision', precision], ['Recall', recall], ['f1_score', F1_score]]
        df = pd.DataFrame(print_data, columns=['Measure', 'Value'])
        print(df.to_string(index=False))
    
# 3-NN on the ViT embeddings with Manhattan distance. print_answer() reads the
# module-level `data` on its own, so no argument is passed to it.
Best_KNN_Object = Best_KNN(3, 'vit', 'manhattan')
Best_KNN_Object.print_answer()

# Hyperparameter Tuning

In [None]:
# Grid search over k (1..40) and the distance metric for the ViT encoder on a
# single shuffled train/validation split; reports and plots validation accuracy.
Hyper_parameter_KNN = Optimized_KNN(3, 'vit', 'manhattan')
K = np.array(range(40)) + 1
metric = ['manhattan', 'euclidean']
encoder_type = ['vit']
Hyper_parameter_KNN.data_split(data)

# One (accuracy, k, metric, encoder) record per configuration.
tuples = []
for k in K:
    for metric_type in metric:
        for encoder in encoder_type:
            Hyper_parameter_KNN.set_encoder_type(encoder)
            Hyper_parameter_KNN.set_k(k)
            Hyper_parameter_KNN.set_metric(metric_type)
            Hyper_parameter_KNN.train(Hyper_parameter_KNN.get_encoder_type())
            F1_score, accuracy, precision, recall = Hyper_parameter_KNN.evaluate(
                Hyper_parameter_KNN.validate_embeddings,
                Hyper_parameter_KNN.validate_labels,
            )
            tuples.append((accuracy, k, metric_type, encoder))

# Rank the configurations from highest to lowest accuracy.
sorted_tuples = sorted(tuples, key=lambda record: record[0], reverse=True)

# Best configuration, first as a Measure/Value table ...
best_triplet = sorted_tuples[0]
print_data = [
    [' Best Accuracy', best_triplet[0]],
    ['K', best_triplet[1]],
    ['metric_type', best_triplet[2]],
    ['encoder_type', best_triplet[3]],
]
df = pd.DataFrame(print_data, columns=['Measure', 'Value'])
print(df.to_string(index=False))

# ... then again as a single-row table.
best_triplets = sorted_tuples[:1]
df = pd.DataFrame(best_triplets, columns=['Best Accuracy', 'k', 'Metric', 'Encoder-Type'])
print(df.to_string(index=False))
print('------------------------------')

# The 20 most accurate configurations.
top_triplets = sorted_tuples[:20]
df = pd.DataFrame(top_triplets, columns=['Accuracy', 'k', 'Metric', 'Encoder-Type'])
print(df.to_string(index=False))

# Accuracy as a function of k for one fixed metric/encoder pair.
given_metric = 'manhattan'
given_encoder_type = 'vit'
k_tuples = sorted(sorted_tuples, key=lambda record: record[1])
filtered_tuples = [
    record for record in k_tuples
    if record[2] == given_metric and record[3] == given_encoder_type
]
filtered_df = pd.DataFrame(filtered_tuples, columns=['Accuracy', 'k', 'Metric', 'Encoder Type'])
print('------------------------------')
plt.figure(figsize=(10, 6))
plt.plot(filtered_df['k'], filtered_df['Accuracy'], marker='o')
plt.title(f'K vs Accuracy for {given_encoder_type} with {given_metric} distance')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()

# Inference Time vs Model

In [None]:
# Compare the prediction time of the three custom KNN models and sklearn's
# KNeighborsClassifier on a validation split.
sklearn_knn = KNeighborsClassifier(n_neighbors=3, metric='manhattan')
models = [Initial_KNN_Object, Optimized_KNN_Object, Best_KNN_Object]
model_names = ['Initial KNN', 'Optimized KNN', 'Best KNN', 'sklearn KNN']
inference_times = []
for model in models:
    # Shuffling, splitting and storing the training set is setup work, so it
    # stays outside the timed region: only prediction counts as inference.
    model.data_split(data)
    model.train(model.encoder_type)
    # perf_counter is a monotonic, high-resolution clock meant for timing.
    start_time = time.perf_counter()
    model.predict_array(model.validate_embeddings)
    inference_times.append(time.perf_counter() - start_time)

# sklearn baseline on the split currently held by Initial_KNN_Object; fitting
# is likewise excluded from the timed region.
sklearn_knn.fit(Initial_KNN_Object.train_embeddings, Initial_KNN_Object.train_labels)
start_time = time.perf_counter()
sklearn_knn.predict(Initial_KNN_Object.validate_embeddings)
inference_times.append(time.perf_counter() - start_time)

plt.bar(model_names, inference_times)
plt.ylabel('Inference Time (seconds)')
plt.title('Inference Time Comparison for Different KNN Models')
plt.show()

# Training Dataset Size vs Inference Time

In [None]:
# Measure how prediction time changes with the training split ratio for the
# three custom KNN models and sklearn's KNeighborsClassifier.
ratios = [0.2, 0.4, 0.6, 0.8]
sklearn_knn = KNeighborsClassifier(n_neighbors=3, metric='manhattan')
custom_models = [Initial_KNN_Object, Optimized_KNN_Object, Best_KNN_Object]
models = custom_models + [sklearn_knn]
model_names = ['Initial KNN', 'Optimized KNN', 'Best KNN', 'sklearn KNN']
inference_times = {model: [] for model in models}

for ratio in ratios:
    for model in custom_models:
        # Re-split with the new ratio; this setup is not part of inference,
        # so it is kept outside the timed region.
        model.set_ratio(ratio)
        model.data_split(data)
        model.train(model.encoder_type)
        start_time = time.perf_counter()
        model.predict_array(model.validate_embeddings)
        inference_times[model].append(time.perf_counter() - start_time)

    # sklearn baseline on the split Initial_KNN_Object just drew for this
    # ratio; fitting is excluded from the timed region as well.
    sklearn_knn.fit(Initial_KNN_Object.train_embeddings, Initial_KNN_Object.train_labels)
    start_time = time.perf_counter()
    sklearn_knn.predict(Initial_KNN_Object.validate_embeddings)
    inference_times[sklearn_knn].append(time.perf_counter() - start_time)

for model, name in zip(models, model_names):
    plt.plot(ratios, inference_times[model], label=name)

plt.xlabel('Training Dataset Size')
plt.ylabel('Inference Time (seconds)')
plt.title('Inference Time vs Training Dataset Size')
plt.legend()
plt.show()

#  Decision Tree Implementation in Python

Data Visualization

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score , confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
# Load the advertisement dataset and print it together with basic structure.
data = pd.read_csv('advertisement.csv')
print(data.to_string(index=False))
# DataFrame.dropna() returns a new frame; the result must be assigned,
# otherwise rows with missing values stay in `data`.
data = data.dropna()
print(data.shape)
print(data.head())
print(data.info())
print(list(data.columns))

# Frequency of each distinct value in the 'labels' column.
data_labels = np.array(data['labels'])
values, counts = np.unique(data_labels, return_counts=True)

plt.figure(figsize=(10, 60))
plt.barh(values, counts)
plt.xlabel('Frequency')
plt.ylabel('Label')
plt.title('Distribution of Labels')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
plt.scatter(data['age'], data['purchase_amount'], alpha=0.5)
plt.title('Age vs Purchase Amount')
plt.xlabel('Age')
plt.ylabel('Purchase Amount')
plt.tight_layout()
plt.show()

# Correlate numeric/boolean columns only: DataFrame.corr() raises on string
# columns in pandas >= 2.0, where numeric_only defaults to False.
correlation_matrix = data.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10, 6))
plt.imshow(correlation_matrix, cmap='viridis', interpolation='nearest')
plt.colorbar()
plt.title('Correlation Matrix')
plt.xticks(range(len(correlation_matrix)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix)), correlation_matrix.columns)
plt.tight_layout()
plt.show()

# Data preprocessing

In [None]:
# Remove every row that contains a missing value, modifying `data` in place.
data.dropna(inplace=True)

# Data featurization

In [None]:
# Column groups of the advertisement dataset.
categorical_features = [
    'gender',
    'education',
    'married',
    'city',
    'occupation',
    'most bought item',
]
numerical_features = [
    'age',
    'income',
    'children',
    'purchase_amount',
]
target_columns = ['labels']

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column.
one_hot_encoding = pd.get_dummies(
    data,
    columns=categorical_features,
    drop_first=True,
)
# print(one_hot_encoding.head())

# Train val test splitting - MultiOutput Formulation

In [None]:
def Multi_Output_data_split():
    """Return an 80/20 train/test split with multi-hot encoded targets.

    Reads the module-level ``one_hot_encoding`` frame. The 'labels' column is
    split on single spaces and binarized with a ``MultiLabelBinarizer`` that
    is fitted on the training targets only.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = one_hot_encoding.drop('labels', axis=1)
    targets = one_hot_encoding['labels']
    X_train, X_test, raw_train, raw_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )
    binarizer = MultiLabelBinarizer()
    y_train = binarizer.fit_transform(raw_train.str.split(' '))
    y_test = binarizer.transform(raw_test.str.split(' '))
    return X_train, X_test, y_train, y_test


class Multi_Output_DecisionTree:
    """Thin wrapper around ``DecisionTreeClassifier`` for multi-hot targets."""

    def __init__(self, criterion='gini', max_depth=3, max_features=None):
        """Create the underlying tree with a fixed ``random_state`` of 42."""
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        self.classifier = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )

    def fit(self, X_train, y_train):
        """Fit the wrapped classifier on the training data."""
        self.classifier.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        return self.classifier.predict(X_test)

    def print_answer(self, y_test, predicted_labels):
        """Compute evaluation metrics for ``predicted_labels``.

        Returns:
            ``(accuracy, f1_micro, f1_macro, precision, recall,
            confusion_matrix)``. Precision and recall are macro-averaged; the
            confusion matrix is built from the argmax of each row of
            ``y_test`` and ``predicted_labels``.
        """
        macro = {'average': 'macro', 'zero_division': 0}
        accuracy = accuracy_score(y_test, predicted_labels)
        f1_micro = f1_score(y_test, predicted_labels, average='micro', zero_division=0)
        f1_macro = f1_score(y_test, predicted_labels, **macro)
        precision = precision_score(y_test, predicted_labels, **macro)
        recall = recall_score(y_test, predicted_labels, **macro)
        true_classes = y_test.argmax(axis=1)
        predicted_classes = predicted_labels.argmax(axis=1)
        Confusion_matrix = confusion_matrix(true_classes, predicted_classes)
        return accuracy, f1_micro, f1_macro, precision, recall, Confusion_matrix

    def change_parameters(self, criterion, max_depth, max_features):
        """Update the hyperparameters on the wrapper and on the classifier."""
        self.criterion, self.max_depth, self.max_features = criterion, max_depth, max_features
        self.classifier.set_params(
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )

# Hyperparameter Tuning - MultiOutput Formulation

In [None]:
# Grid search over split criterion, tree depth and max_features for the
# multi-output decision tree; prints the top 3 settings by F1-micro and F1-macro.
criterion = ['gini', 'entropy']
max_depths = [3, 5, 10, 20, 30]
max_features = [3, 5, 7, 9, 11]
result_columns = [
    'Criteria', 'Max-Depth', 'Max-features', 'Accuracy',
    'f1_micro_score', 'f1_macro_score', 'precision', 'recall', 'Confusion_matrix',
]

clf = Multi_Output_DecisionTree()
X_train, X_test, y_train, y_test = Multi_Output_data_split()

# One record per configuration: hyperparameters followed by the metrics.
tuples = []
for criteria in criterion:
    for depth in max_depths:
        for feature in max_features:
            clf.change_parameters(criteria, depth, feature)
            clf.fit(X_train, y_train)
            predicted_labels = clf.predict(X_test)
            accuracy, f1_micro, f1_macro, precision, recall, Confusion_matrix = clf.print_answer(
                y_test, predicted_labels
            )
            tuples.append(
                (criteria, depth, feature, accuracy, f1_micro, f1_macro, precision, recall, Confusion_matrix)
            )

df = pd.DataFrame(tuples, columns=result_columns)

# Rank by F1-micro (index 4) and by F1-macro (index 5), best first.
sorted_f1_micro_tuples = sorted(tuples, key=lambda record: record[4], reverse=True)
sorted_f1_macro_tuples = sorted(tuples, key=lambda record: record[5], reverse=True)
top_f1_micro_tuples = sorted_f1_micro_tuples[:3]
top_f1_macro_tuples = sorted_f1_macro_tuples[:3]

print('----------------------------')
print('Top 3 performing set of hyperparamters according to F1-micro Score')
df = pd.DataFrame(top_f1_micro_tuples, columns=result_columns)
print(df.to_string(index=False))

print('----------------------------')
print('Top 3 performing set of hyperparamters according to F1-macro Score')
df = pd.DataFrame(top_f1_macro_tuples, columns=result_columns)
print(df.to_string(index=False))

----------------------------
Top 3 performing set of hyperparamters according to F1-micro Score
Criteria  Max-Depth  Max-features  Accuracy  f1_micro_score  f1_macro_score  precision   recall                                                                                                                                                                                                      Confusion_matrix
 entropy         30            11     0.020        0.279412        0.278655   0.405746 0.216752 [[54, 7, 2, 4, 0, 1, 0, 0], [35, 4, 5, 1, 1, 0, 0, 0], [18, 1, 4, 2, 2, 0, 0, 0], [12, 2, 3, 3, 0, 2, 0, 0], [11, 0, 1, 5, 1, 0, 0, 0], [6, 1, 0, 0, 0, 1, 0, 0], [2, 1, 1, 1, 0, 0, 0, 0], [4, 0, 1, 1, 0, 0, 0, 0]]
    gini         30            11     0.005        0.228731        0.220438   0.402976 0.156048 [[61, 3, 2, 2, 0, 0, 0, 0], [40, 0, 1, 3, 2, 0, 0, 0], [24, 3, 0, 0, 0, 0, 0, 0], [15, 3, 1, 2, 0, 1, 0, 0], [14, 0, 2, 2, 0, 0, 0, 0], [6, 1, 1, 0, 0, 0, 0, 0], [5, 0, 0, 0, 0, 0, 0, 0], 

# Train val test splitting - Powerset Formulation

In [None]:
def Power_set_data_split():
    """Return an 80/20 train/test split with label-powerset one-hot targets.

    Reads the module-level ``one_hot_encoding`` (features) and ``data``
    (raw 'labels' strings). Every subset of the distinct labels gets one
    column, ordered by the sorted, space-joined subset name; each row has a
    single 1 in the column of its own label set.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = one_hot_encoding.drop('labels', axis=1)

    # Treat each row's labels as a set so a repeated label within a row
    # still maps onto a valid powerset entry instead of raising KeyError.
    row_label_sets = [set(label_set.split()) for label_set in data['labels']]

    # Sort the label vocabulary once, outside the 2**n loop, so every subset
    # is generated already in sorted order.
    all_labels = sorted(set().union(*row_label_sets))
    num_labels = len(all_labels)
    powerset_labels = [
        ' '.join(all_labels[i] for i in range(num_labels) if mask & (1 << i))
        for mask in range(2 ** num_labels)
    ]
    label_to_idx = {label: idx for idx, label in enumerate(sorted(powerset_labels))}

    label_vectors = np.zeros((len(data), len(powerset_labels)))
    for idx, labels in enumerate(row_label_sets):
        label_vectors[idx, label_to_idx[' '.join(sorted(labels))]] = 1
    y = label_vectors

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

class Powerset_DecisionTree:
    """Thin wrapper around ``DecisionTreeClassifier`` for powerset targets."""

    def __init__(self, criterion='gini', max_depth=3, max_features=None):
        """Create the underlying tree with a fixed ``random_state`` of 42."""
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        self.classifier = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )

    def fit(self, X_train, y_train):
        """Fit the wrapped classifier on the training data."""
        self.classifier.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        return self.classifier.predict(X_test)

    def print_answer(self, y_test, predicted_labels):
        """Compute evaluation metrics for ``predicted_labels``.

        Returns:
            ``(accuracy, f1_micro, f1_macro, precision, recall,
            confusion_matrix)``. Precision and recall are macro-averaged; the
            confusion matrix is built from the argmax of each row of
            ``y_test`` and ``predicted_labels``.
        """
        macro = {'average': 'macro', 'zero_division': 0}
        accuracy = accuracy_score(y_test, predicted_labels)
        f1_micro = f1_score(y_test, predicted_labels, average='micro', zero_division=0)
        f1_macro = f1_score(y_test, predicted_labels, **macro)
        precision = precision_score(y_test, predicted_labels, **macro)
        recall = recall_score(y_test, predicted_labels, **macro)
        true_classes = y_test.argmax(axis=1)
        predicted_classes = predicted_labels.argmax(axis=1)
        Confusion_matrix = confusion_matrix(true_classes, predicted_classes)
        return accuracy, f1_micro, f1_macro, precision, recall, Confusion_matrix

    def change_parameters(self, criterion, max_depth, max_features):
        """Update the hyperparameters on the wrapper and on the classifier."""
        self.criterion, self.max_depth, self.max_features = criterion, max_depth, max_features
        self.classifier.set_params(
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )

# Hyperparameter Tuning - Powerset Formulation

In [None]:
# Grid search over split criterion, tree depth and max_features for the
# powerset decision tree; prints the top 3 settings by F1-micro and F1-macro.
criterion = ['gini', 'entropy']
max_depths = [3, 5, 10, 20, 30]
max_features = [3, 5, 7, 9, 11]
result_columns = [
    'Criteria', 'Max-Depth', 'Max-features', 'Accuracy',
    'f1_micro_score', 'f1_macro_score', 'precision', 'recall', 'Confusion_matrix',
]

clf = Powerset_DecisionTree()
X_train, X_test, y_train, y_test = Power_set_data_split()

# One record per configuration: hyperparameters followed by the metrics.
tuples = []
for criteria in criterion:
    for depth in max_depths:
        for feature in max_features:
            clf.change_parameters(criteria, depth, feature)
            clf.fit(X_train, y_train)
            predicted_labels = clf.predict(X_test)
            accuracy, f1_micro, f1_macro, precision, recall, Confusion_matrix = clf.print_answer(
                y_test, predicted_labels
            )
            tuples.append(
                (criteria, depth, feature, accuracy, f1_micro, f1_macro, precision, recall, Confusion_matrix)
            )

df = pd.DataFrame(tuples, columns=result_columns)

# Rank by F1-micro (index 4) and by F1-macro (index 5), best first.
sorted_f1_micro_tuples = sorted(tuples, key=lambda record: record[4], reverse=True)
sorted_f1_macro_tuples = sorted(tuples, key=lambda record: record[5], reverse=True)
top_f1_micro_tuples = sorted_f1_micro_tuples[:3]
top_f1_macro_tuples = sorted_f1_macro_tuples[:3]

print('----------------------------')
print('Top 3 performing set of hyperparamters according to F1-micro Score')
df = pd.DataFrame(top_f1_micro_tuples, columns=result_columns)
print(df.to_string(index=False))

print('----------------------------')
print('Top 3 performing set of hyperparamters according to F1-macro Score')
df = pd.DataFrame(top_f1_macro_tuples, columns=result_columns)
print(df.to_string(index=False))

----------------------------
Top 3 performing set of hyperparamters according to F1-micro Score
Criteria  Max-Depth  Max-features  Accuracy  f1_micro_score  f1_macro_score  precision   recall                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         