Data Visualization

In [28]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score , confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
data = pd.read_csv('advertisement.csv')
print(data.to_string(index=False))
data.dropna()
print(data.shape)
print(data.head())
print(data.info())
print(list(data.columns))
# print(data['labels'].value_counts())

 age gender       income   education  married  children                  city  occupation  purchase_amount most bought item                                                labels
  45   Male 61271.953359      Master    False         3           Lake Sheila      Doctor        87.697118          monitor                           electronics clothing sports
  24 Female 53229.101074 High School    False         1          Crystalburgh Businessman       115.135586         lipstick                                      furniture beauty
  45 Female 30066.046684    Bachelor     True         3         Margaretburgh    Engineer       101.694559         biscuits                      clothing electronics food sports
  19   Male 48950.246384         PhD    False         0         Williamshaven      Lawyer        97.964887            maggi                                                  food
  29 Female 44792.627094      Master    False         0              New Paul Businessman        86.847281    

Data preprocessing

In [29]:
data.dropna(inplace=True)

Data featurization

In [30]:
categorical_features = ['gender', 'education', 'married', 'city', 'occupation', 'most bought item']
numerical_features = ['age', 'income', 'children', 'purchase_amount']
target_columns = ['labels']
one_hot_encoding = pd.get_dummies(data,columns=categorical_features)
# print(one_hot_encoding.head())



Train val test splitting - MultiOutput Setting

In [31]:
X = one_hot_encoding.drop('labels',axis=1)
y = one_hot_encoding['labels']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
print(y_train)
# y_train = y_train.str.get_dummies(sep=' ')
# y_test = y_test.str.get_dummies(sep=' ')
mlb = MultiLabelBinarizer()
print(y_train.str.split(' '))
print(y_test.str.split(' '))
y_train = mlb.fit_transform(y_train.str.split(' '))
y_test = mlb.transform(y_test.str.split(' '))
print(list(mlb.classes_))
print(y_train)
# print(y_test)
criterion = 'gini' 
clf = DecisionTreeClassifier(criterion=criterion)
clf.fit(X_train,y_train)
predicted_labels = clf.predict(X_test)
print(predicted_labels)
accuracy = accuracy_score(y_test,predicted_labels)
print(f'Validation Accuracy: {accuracy:.2f}')
print(list(mlb.classes_))

29           sports electronics furniture books
535                                       books
695                                       books
557                          sports electronics
836                home electronics food beauty
                         ...                   
106                             clothing beauty
270    beauty electronics furniture food sports
860                     home food sports beauty
435                              food furniture
102                                beauty books
Name: labels, Length: 800, dtype: object
29            [sports, electronics, furniture, books]
535                                           [books]
695                                           [books]
557                             [sports, electronics]
836                 [home, electronics, food, beauty]
                            ...                      
106                                [clothing, beauty]
270    [beauty, electronics, furniture, food, sports]

Hyperparameter Tuning - MultiOutput Setting

In [32]:
criterion = ['gini','entropy']
max_depths = [3,5,10,20,30]
max_features = [3,5,7,9,11]
tuples = []
for criteria in criterion:
    for depth in max_depths:
        for feature in max_features:
            DT = DecisionTreeClassifier(criterion=criteria,max_features=feature,max_depth=depth)
            clf.fit(X_train,y_train)
            predicted_labels = clf.predict(X_test)
            accuracy = accuracy_score(y_test,predicted_labels)
            f1_micro = f1_score(y_test, predicted_labels, average='micro')
            f1_macro = f1_score(y_test, predicted_labels, average='macro')
            precision = precision_score(y_test, predicted_labels, average='macro',zero_division=0)
            recall = recall_score(y_test, predicted_labels, average='macro',zero_division=0)
            Confusion_matrix = confusion_matrix(y_test.argmax(axis=1),predicted_labels.argmax(axis=1))
            tuples.append((criteria,depth,feature,accuracy,f1_micro,f1_macro,precision,recall,Confusion_matrix))
# (accuracy,f1_micro,f1_macro,precision,recall)
df = pd.DataFrame(tuples, columns=['Criteria','Max-Depth','Max-features','Accuracy', 'f1_micro_score', 'f1_macro_score', 'precision','recall','Confusion_matrix'])
# print(df.to_string(index=False))
sorted_f1_micro_tuples = sorted(tuples, key=lambda x: x[4], reverse=True)
sorted_f1_macro_tuples = sorted(tuples, key=lambda x: x[5], reverse=True)
top_f1_micro_tuples = sorted_f1_micro_tuples[:3]
top_f1_macro_tuples = sorted_f1_macro_tuples[:3]
print('----------------------------')
print('Top 3 performing set of hyperparamters according to F1-micro Score')
df = pd.DataFrame(top_f1_micro_tuples, columns=['Criteria','Max-Depth','Max-features','Accuracy', 'f1_micro_score', 'f1_macro_score', 'precision','recall','Confusion_matrix'])
print(df.to_string(index=False))
print('----------------------------')
print('Top 3 performing set of hyperparamters according to F1-macro Score')
df = pd.DataFrame(top_f1_macro_tuples, columns=['Criteria','Max-Depth','Max-features','Accuracy', 'f1_micro_score', 'f1_macro_score', 'precision','recall','Confusion_matrix'])
print(df.to_string(index=False))

----------------------------
Top 3 performing set of hyperparamters according to F1-micro Score
Criteria  Max-Depth  Max-features  Accuracy  f1_micro_score  f1_macro_score  precision   recall                                                                                                                                                                                                      Confusion_matrix
 entropy         30             5     0.065        0.544046        0.540727   0.553501 0.529774 [[38, 13, 7, 8, 1, 1, 0, 0], [11, 21, 4, 3, 1, 4, 0, 2], [7, 1, 12, 2, 0, 2, 0, 3], [5, 1, 2, 8, 2, 3, 0, 1], [4, 2, 3, 1, 3, 1, 2, 2], [1, 4, 1, 0, 1, 1, 0, 0], [2, 1, 0, 0, 1, 0, 1, 0], [1, 1, 0, 0, 0, 1, 0, 3]]
    gini          3            11     0.055        0.536204        0.531086   0.549391 0.515380 [[35, 10, 11, 5, 3, 2, 1, 1], [10, 24, 5, 2, 1, 1, 1, 2], [8, 3, 8, 2, 0, 3, 0, 3], [5, 2, 3, 7, 2, 2, 0, 1], [3, 1, 2, 0, 5, 1, 4, 2], [1, 3, 1, 0, 1, 1, 0, 1], [1, 1, 0, 0, 1, 1, 1, 0], 