Data Visualization

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split , KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score , confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
# Load the advertisement dataset. The result of dropna() must be assigned back:
# called on its own it returns a cleaned copy and leaves `data` untouched.
data = pd.read_csv('advertisement.csv')
data = data.dropna()

# Compact overview only: the complete frame is intentionally not dumped to the
# output, head()/info() are enough to inspect the schema.
print(data.shape)
print(data.head())
data.info()  # info() prints by itself and returns None, so it is not wrapped in print()
print(list(data.columns))

# Frequency of every distinct label string found in the 'labels' column.
data_labels = np.array(data['labels'])
values, counts = np.unique(data_labels, return_counts=True)

# Create a horizontal bar plot
plt.figure(figsize=(10, 60))
plt.barh(values, counts)
plt.xlabel('Frequency')
plt.ylabel('Label')
plt.title('Distribution of Labels')
plt.tight_layout()
plt.show()

# Scatter plot of purchase amount against customer age.
plt.figure(figsize=(10, 6))
plt.scatter(data['age'], data['purchase_amount'], alpha=0.5)
plt.title('Age vs Purchase Amount')
plt.xlabel('Age')
plt.ylabel('Purchase Amount')
plt.tight_layout()
plt.show()

# Correlations are only defined for numeric/boolean columns. Selecting them
# explicitly keeps this working on pandas >= 2.0, where DataFrame.corr() no
# longer drops string columns silently and raises instead.
correlation_matrix = data.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10, 6))
plt.imshow(correlation_matrix, cmap='viridis', interpolation='nearest')
plt.colorbar()
plt.title('Correlation Matrix')
plt.xticks(range(len(correlation_matrix)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix)), correlation_matrix.columns)
plt.tight_layout()
plt.show()

Data preprocessing

In [3]:
# Remove every row that has at least one missing value, mutating `data`
# in place so the following cells work on the cleaned frame.
data.dropna(axis=0, how='any', inplace=True)

Data featurization

In [4]:
# Column roles in the advertisement dataset.
categorical_features = [
    'gender',
    'education',
    'married',
    'city',
    'occupation',
    'most bought item',
]
numerical_features = [
    'age',
    'income',
    'children',
    'purchase_amount',
]
target_columns = ['labels']

# One-hot encode the categorical columns only; drop_first=True removes the
# first level of each encoded column. All other columns, including 'labels',
# are carried over unchanged.
one_hot_encoding = pd.get_dummies(
    data,
    columns=categorical_features,
    drop_first=True,
)

Train/test splitting - MultiOutput Formulation

In [5]:
def Multi_Output_data_split(test_size=0.2, random_state=42):
    """Split the one-hot encoded frame into train/test sets with multi-hot targets.

    The space-separated strings in the 'labels' column of the module-level
    ``one_hot_encoding`` frame are tokenised and converted into a binary
    indicator matrix with one column per distinct label.

    Parameters
    ----------
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows (every column except 'labels').
    y_train, y_test : numpy.ndarray
        Binary indicator matrices sharing the same label columns.
    """
    X = one_hot_encoding.drop('labels', axis=1)
    # split() with no separator tokenises on any whitespace run, so doubled or
    # trailing spaces cannot create an empty-string "label".
    label_sets = one_hot_encoding['labels'].str.split()

    # Fit the binarizer on the complete label vocabulary. Fitting on the train
    # split alone would make transform() ignore (with only a warning) any label
    # that happens to occur in the test split only.
    mlb = MultiLabelBinarizer()
    mlb.fit(label_sets)

    X_train, X_test, labels_train, labels_test = train_test_split(
        X, label_sets, test_size=test_size, random_state=random_state
    )
    y_train = mlb.transform(labels_train)
    y_test = mlb.transform(labels_test)
    return X_train, X_test, y_train, y_test


class Multi_Output_DecisionTree:
    """Small wrapper around a ``DecisionTreeClassifier`` for multi-output targets.

    The wrapper remembers the three tunable hyperparameters as attributes and
    forwards fitting and prediction to the wrapped classifier, which is always
    created with ``random_state=42``.
    """

    def __init__(self, criterion='gini', max_depth=3, max_features=None):
        """Store the hyperparameters and build the underlying classifier."""
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        self.classifier = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )

    def fit(self, X_train, y_train):
        """Train the wrapped classifier on the given features and targets."""
        self.classifier.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        return self.classifier.predict(X_test)

    def print_answer(self, y_test, predicted_labels):
        """Compute evaluation metrics; despite the name nothing is printed.

        Returns a tuple ``(accuracy, f1_micro, f1_macro, precision, recall,
        confusion_matrix)``. Precision and recall are macro-averaged, and all
        averaged scores use ``zero_division=0``.
        """
        macro_options = {'average': 'macro', 'zero_division': 0}
        # The confusion matrix collapses each indicator row to the index of its
        # first maximal entry (argmax), i.e. one class index per sample.
        true_index = y_test.argmax(axis=1)
        predicted_index = predicted_labels.argmax(axis=1)
        return (
            accuracy_score(y_test, predicted_labels),
            f1_score(y_test, predicted_labels, average='micro', zero_division=0),
            f1_score(y_test, predicted_labels, **macro_options),
            precision_score(y_test, predicted_labels, **macro_options),
            recall_score(y_test, predicted_labels, **macro_options),
            confusion_matrix(true_index, predicted_index),
        )

    def change_parameters(self, criterion, max_depth, max_features):
        """Replace the hyperparameters on the wrapper and on the classifier."""
        self.criterion, self.max_depth, self.max_features = criterion, max_depth, max_features
        self.classifier.set_params(
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )

Hyperparameter Tuning - MultiOutput Formulation

In [6]:
# Hyperparameter grid explored for the multi-output decision tree.
criterion = ['gini', 'entropy']
max_depths = [3, 5, 10, 20, 30]
max_features = [3, 5, 7, 9, 11]

# Column names shared by every results table built in this cell.
result_columns = [
    'Criteria', 'Max-Depth', 'Max-features',
    'Accuracy', 'f1_micro_score', 'f1_macro_score',
    'precision', 'recall', 'Confusion_matrix',
]

clf = Multi_Output_DecisionTree()
X_train, X_test, y_train, y_test = Multi_Output_data_split()

# Fit and score one tree per grid combination on the same train/test split.
tuples = []
for criteria in criterion:
    for depth in max_depths:
        for feature in max_features:
            clf.change_parameters(criteria, depth, feature)
            clf.fit(X_train, y_train)
            predicted_labels = clf.predict(X_test)
            (accuracy, f1_micro, f1_macro,
             precision, recall, Confusion_matrix) = clf.print_answer(y_test, predicted_labels)
            tuples.append((criteria, depth, feature,
                           accuracy, f1_micro, f1_macro,
                           precision, recall, Confusion_matrix))

# Full grid results, one row per combination.
df = pd.DataFrame(tuples, columns=result_columns)

# Rank the runs by each F1 flavour (highest first) and keep the best three.
micro_position = result_columns.index('f1_micro_score')
macro_position = result_columns.index('f1_macro_score')
sorted_f1_micro_tuples = sorted(tuples, key=lambda run: run[micro_position], reverse=True)
sorted_f1_macro_tuples = sorted(tuples, key=lambda run: run[macro_position], reverse=True)
top_f1_micro_tuples = sorted_f1_micro_tuples[:3]
top_f1_macro_tuples = sorted_f1_macro_tuples[:3]

for heading, best_runs in (
    ('Top 3 performing set of hyperparamters according to F1-micro Score', top_f1_micro_tuples),
    ('Top 3 performing set of hyperparamters according to F1-macro Score', top_f1_macro_tuples),
):
    print('----------------------------')
    print(heading)
    df = pd.DataFrame(best_runs, columns=result_columns)
    print(df.to_string(index=False))

Train/test splitting - Powerset Formulation

In [7]:
def Power_set_data_split(test_size=0.2, random_state=42):
    """Split the one-hot encoded frame into train/test sets with powerset targets.

    Every subset of the distinct labels found in the 'labels' column of the
    module-level ``one_hot_encoding`` frame is treated as one class. Each row's
    target is a one-hot vector over those ``2 ** n_labels`` classes, so the
    target width grows exponentially with the number of distinct labels.

    Parameters
    ----------
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows (every column except 'labels').
    y_train, y_test : numpy.ndarray
        One-hot matrices with one column per label subset.
    """
    X = one_hot_encoding.drop('labels', axis=1)

    # Labels are read from the same frame as X so features and targets are
    # guaranteed to describe the same rows. Using a set per row means a label
    # repeated inside one row cannot produce a key missing from the lookup.
    label_sets = [sorted(set(text.split())) for text in one_hot_encoding['labels']]
    ordered_labels = sorted({label for labels in label_sets for label in labels})

    # Enumerate every subset with a bitmask over the sorted label list (built
    # once, outside the loop). Joining in bit order yields each subset's labels
    # already sorted; the empty subset becomes ''.
    powerset_names = sorted(
        ' '.join(label for bit, label in enumerate(ordered_labels) if (mask >> bit) & 1)
        for mask in range(1 << len(ordered_labels))
    )
    label_to_idx = {name: idx for idx, name in enumerate(powerset_names)}

    # One-hot encode each row's label subset.
    y = np.zeros((len(label_sets), len(powerset_names)))
    for row, labels in enumerate(label_sets):
        y[row, label_to_idx[' '.join(labels)]] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

class Powerset_DecisionTree:
    """Decision-tree wrapper used for the powerset label formulation.

    Keeps ``criterion``, ``max_depth`` and ``max_features`` as attributes and
    delegates training and prediction to an internal
    ``DecisionTreeClassifier`` seeded with ``random_state=42``.
    """

    def __init__(self, criterion='gini', max_depth=3, max_features=None):
        """Record the hyperparameters and create the internal classifier."""
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        self.classifier = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
        )

    def fit(self, X_train, y_train):
        """Fit the internal classifier."""
        self.classifier.fit(X_train, y_train)

    def predict(self, X_test):
        """Predict targets for ``X_test`` with the internal classifier."""
        return self.classifier.predict(X_test)

    def print_answer(self, y_test, predicted_labels):
        """Return evaluation metrics for the predictions (nothing is printed).

        The result is the tuple ``(accuracy, f1_micro, f1_macro, precision,
        recall, confusion_matrix)``; precision and recall are macro-averaged
        and every averaged score is computed with ``zero_division=0``.
        """
        accuracy = accuracy_score(y_test, predicted_labels)

        # Averaged scores, keyed by the name used in the returned tuple.
        averaged = {
            'f1_micro': f1_score(y_test, predicted_labels, average='micro', zero_division=0),
            'f1_macro': f1_score(y_test, predicted_labels, average='macro', zero_division=0),
            'precision': precision_score(y_test, predicted_labels, average='macro', zero_division=0),
            'recall': recall_score(y_test, predicted_labels, average='macro', zero_division=0),
        }

        # Each row is reduced to a single class index via argmax before the
        # confusion matrix is built.
        Confusion_matrix = confusion_matrix(
            y_test.argmax(axis=1),
            predicted_labels.argmax(axis=1),
        )
        return (
            accuracy,
            averaged['f1_micro'],
            averaged['f1_macro'],
            averaged['precision'],
            averaged['recall'],
            Confusion_matrix,
        )

    def change_parameters(self, criterion, max_depth, max_features):
        """Update the hyperparameters on both the wrapper and the classifier."""
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        updated = {
            'criterion': criterion,
            'max_depth': max_depth,
            'max_features': max_features,
            'random_state': 42,
        }
        self.classifier.set_params(**updated)

# Hyperparameter Tuning - Powerset Formulation

In [8]:
# Hyperparameter grid explored for the powerset decision tree.
criterion = ['gini', 'entropy']
max_depths = [3, 5, 10, 20, 30]
max_features = [3, 5, 7, 9, 11]

# Column names shared by every results table built in this cell.
result_columns = [
    'Criteria', 'Max-Depth', 'Max-features',
    'Accuracy', 'f1_micro_score', 'f1_macro_score',
    'precision', 'recall', 'Confusion_matrix',
]

clf = Powerset_DecisionTree()
X_train, X_test, y_train, y_test = Power_set_data_split()

# Fit and score one tree per grid combination on the same train/test split.
tuples = []
for criteria in criterion:
    for depth in max_depths:
        for feature in max_features:
            clf.change_parameters(criteria, depth, feature)
            clf.fit(X_train, y_train)
            predicted_labels = clf.predict(X_test)
            (accuracy, f1_micro, f1_macro,
             precision, recall, Confusion_matrix) = clf.print_answer(y_test, predicted_labels)
            tuples.append((criteria, depth, feature,
                           accuracy, f1_micro, f1_macro,
                           precision, recall, Confusion_matrix))

# Full grid results, one row per combination.
df = pd.DataFrame(tuples, columns=result_columns)

# Rank the runs by each F1 flavour (highest first) and keep the best three.
micro_position = result_columns.index('f1_micro_score')
macro_position = result_columns.index('f1_macro_score')
sorted_f1_micro_tuples = sorted(tuples, key=lambda run: run[micro_position], reverse=True)
sorted_f1_macro_tuples = sorted(tuples, key=lambda run: run[macro_position], reverse=True)
top_f1_micro_tuples = sorted_f1_micro_tuples[:3]
top_f1_macro_tuples = sorted_f1_macro_tuples[:3]

for heading, best_runs in (
    ('Top 3 performing set of hyperparamters according to F1-micro Score', top_f1_micro_tuples),
    ('Top 3 performing set of hyperparamters according to F1-macro Score', top_f1_macro_tuples),
):
    print('----------------------------')
    print(heading)
    df = pd.DataFrame(best_runs, columns=result_columns)
    print(df.to_string(index=False))

# K-Fold Validation - Powerset Formulation

In [9]:
from sklearn.model_selection import KFold
# Features and label strings come from the same frame so rows stay aligned.
X = one_hot_encoding.drop('labels', axis=1)
label_strings = one_hot_encoding['labels']

# Collect the distinct labels that occur anywhere in the dataset.
all_labels = set()
for label_set in label_strings:
    all_labels.update(label_set.split())

# Enumerate every subset with a bitmask over the sorted label list, which is
# built once instead of once per subset element. Joining in bit order yields
# each subset's labels already sorted; the empty subset becomes ''.
ordered_labels = sorted(all_labels)
sorted_powerset_labels = sorted(
    ' '.join(label for bit, label in enumerate(ordered_labels) if (mask >> bit) & 1)
    for mask in range(1 << len(ordered_labels))
)
label_to_idx = {label: idx for idx, label in enumerate(sorted_powerset_labels)}

# One-hot encode each row's label subset (2 ** n_labels columns). Labels are
# de-duplicated per row so a repeated label cannot cause a KeyError.
label_vectors = np.zeros((len(label_strings), len(sorted_powerset_labels)))
for idx, label_set in enumerate(label_strings):
    sorted_labels = ' '.join(sorted(set(label_set.split())))
    label_vectors[idx, label_to_idx[sorted_labels]] = 1
y = label_vectors

# 5-fold cross-validation with a fixed hyperparameter setting.
kf = KFold(n_splits=5)
clf = Powerset_DecisionTree()
clf.change_parameters('entropy', 20, 11)
Micro = []
Macro = []
Accuracy = []
Precision = []
Recall = []
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    predicted_labels = clf.predict(X_test)
    accuracy, f1_micro, f1_macro, precision, recall, Confusion_matrix = clf.print_answer(y_test, predicted_labels)
    Micro.append(f1_micro)
    Macro.append(f1_macro)
    Accuracy.append(accuracy)
    Precision.append(precision)
    Recall.append(recall)

# Mean over folds, in the order: F1-micro, F1-macro, accuracy, precision, recall.
print(np.mean(Micro), np.mean(Macro), np.mean(Accuracy), np.mean(Precision), np.mean(Recall))

0.0031568132997104696 0.0004836309523809524 0.002 0.0005859375 0.0004557291666666666


# K-Fold Validation - MultiOutput Formulation

In [12]:
# Features are every column except 'labels'; targets are the multi-hot
# encoding of the space-separated label strings.
X = one_hot_encoding.drop('labels', axis=1)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(one_hot_encoding['labels'].str.split(' '))

# NOTE(review): max_features=1100 is far outside the 3-11 grid searched for
# this model -- confirm that this value is intended.
clf = Multi_Output_DecisionTree()
clf.change_parameters('entropy', 30, 1100)

# 5-fold cross-validation, collecting one score per fold and metric.
kf = KFold(n_splits=5)
Micro, Macro, Accuracy, Precision, Recall = [], [], [], [], []
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    predicted_labels = clf.predict(X_test)
    (accuracy, f1_micro, f1_macro,
     precision, recall, Confusion_matrix) = clf.print_answer(y_test, predicted_labels)
    fold_scores = (f1_micro, f1_macro, accuracy, precision, recall)
    for history, score in zip((Micro, Macro, Accuracy, Precision, Recall), fold_scores):
        history.append(score)

# Mean over folds, in the order: F1-micro, F1-macro, accuracy, precision, recall.
print(*(np.mean(history) for history in (Micro, Macro, Accuracy, Precision, Recall)))

0.508448223210882 0.5065104529890514 0.036000000000000004 0.520425667812449 0.497084238359497
