In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from tqdm.notebook import tqdm

In [3]:
from metrics import multiclass_stats

In [4]:
os.chdir('yelp_academic')

In [5]:
br = pd.read_csv('data/business_ambience.csv')

In [6]:
br.head()

Unnamed: 0,photo_id,touristy,hipster,romantic,divey,intimate,upscale
0,ZlTwL6uWx6rW_L9Df5RT8A,False,False,True,False,True,False
1,fHbSMxueQfXFRb9e-6bJuw,False,False,False,True,False,False
2,74oWvVVIjms9LjfHQOgxMQ,False,False,False,False,False,True
3,QY6c1OKsIpujF4MDHQdbag,False,True,False,False,False,False
4,0AYEzNJYFF2PeXo71cpKuw,False,True,False,False,False,False


In [7]:
feature_cols = ['touristy', 'hipster', 'romantic', 'divey', 'intimate', 'upscale']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train_files, X_test_files, y_train, y_test = train_test_split(br.photo_id, br[feature_cols], train_size = 0.9, random_state=420, stratify=br[feature_cols])

In [10]:
features = np.load('data/transfer_features/densenet_features.npz', allow_pickle = True)['arr_0'][()]

In [11]:
X_train, X_test = [], []

In [12]:
for filename in tqdm(X_train_files):
    X_train.append(features[filename])

  0%|          | 0/29635 [00:00<?, ?it/s]

In [13]:
for filename in tqdm(X_test_files):
    X_test.append(features[filename])

  0%|          | 0/3293 [00:00<?, ?it/s]

In [14]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [15]:
X_test.shape

(3293, 1024)

In [16]:
y_train = y_train.to_numpy(dtype='int')
y_test = y_test.to_numpy(dtype='int')

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, hamming_loss, f1_score, roc_auc_score

In [18]:
lr_clf = make_pipeline(StandardScaler(),
                       MultiOutputClassifier(
                           LogisticRegression(max_iter=10000,
                                              random_state=42,
                                              class_weight='balanced'))
                      ).fit(X_train, y_train)

In [19]:
y_pred = lr_clf.predict(X_test)

In [20]:
test_filenames = list(X_test_files)

In [21]:
mispredicted_list = []

In [22]:
for i in range(len(test_filenames)):
    mispredicted = {}
    if not np.array_equal(y_pred[i], y_test[i]):
        mispredicted['filename'] = test_filenames[i]
        mispredicted['actual'] = y_test[i]
        mispredicted['prediction'] = y_pred[i]
        mispredicted_list.append(mispredicted)

In [23]:
import pickle
with open('comparison/ambience_lr_densenet_transfer_mispredicted.pickle', 'wb') as a:
    pickle.dump(mispredicted_list, a)

In [39]:
report, stats = multiclass_stats(y_test, y_pred)
stats

  _warn_prf(average, modifier, msg_start, len(result))


{'hamming_score': 0.35008097985626074,
 'hamming_loss': 0.3136957181901002,
 'f1_score': 0.4186371855651641,
 'roc_auc_score': 0.6695194381389631}

In [40]:
print(report)

              precision    recall  f1-score   support

           0       0.39      0.64      0.49       730
           1       0.54      0.64      0.59      1253
           2       0.22      0.59      0.33       464
           3       0.22      0.68      0.33       304
           4       0.21      0.62      0.31       440
           5       0.35      0.70      0.47       585

   micro avg       0.33      0.64      0.44      3776
   macro avg       0.32      0.64      0.42      3776
weighted avg       0.38      0.64      0.46      3776
 samples avg       0.36      0.65      0.44      3776



In [41]:
nb_clf = make_pipeline(StandardScaler(),
                       MultiOutputClassifier(
                           BernoulliNB())
                       ).fit(X_train, y_train)

In [42]:
y_pred = nb_clf.predict(X_test)

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.29      0.56      0.39       730
           1       0.49      0.60      0.54      1253
           2       0.18      0.56      0.27       464
           3       0.15      0.72      0.25       304
           4       0.17      0.60      0.27       440
           5       0.25      0.59      0.35       585

   micro avg       0.26      0.60      0.36      3776
   macro avg       0.26      0.61      0.34      3776
weighted avg       0.31      0.60      0.39      3776
 samples avg       0.27      0.60      0.35      3776



  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
report, stats = multiclass_stats(y_test, y_pred)
stats

  _warn_prf(average, modifier, msg_start, len(result))


{'hamming_score': 0.2584826399433141,
 'hamming_loss': 0.4052029557647535,
 'f1_score': 0.34447734004171543,
 'roc_auc_score': 0.6006224498444909}

In [45]:
print(report)

              precision    recall  f1-score   support

           0       0.29      0.56      0.39       730
           1       0.49      0.60      0.54      1253
           2       0.18      0.56      0.27       464
           3       0.15      0.72      0.25       304
           4       0.17      0.60      0.27       440
           5       0.25      0.59      0.35       585

   micro avg       0.26      0.60      0.36      3776
   macro avg       0.26      0.61      0.34      3776
weighted avg       0.31      0.60      0.39      3776
 samples avg       0.27      0.60      0.35      3776



In [46]:
svc_clf = make_pipeline(StandardScaler(),
                        MultiOutputClassifier(
                            SVC(gamma='auto'))
                       ).fit(X_train, y_train)

In [47]:
y_pred = svc_clf.predict(X_test)

In [48]:
report, stats = multiclass_stats(y_test, y_pred)
stats

  _warn_prf(average, modifier, msg_start, len(result))


{'hamming_score': 0.24625468164794007,
 'hamming_loss': 0.16580625569389615,
 'f1_score': 0.20593172005479876,
 'roc_auc_score': 0.5597658722837919}

In [49]:
print(report)

              precision    recall  f1-score   support

           0       0.85      0.25      0.39       730
           1       0.67      0.46      0.54      1253
           2       0.93      0.03      0.05       464
           3       0.75      0.01      0.02       304
           4       1.00      0.01      0.02       440
           5       0.69      0.12      0.21       585

   micro avg       0.71      0.23      0.34      3776
   macro avg       0.82      0.15      0.21      3776
weighted avg       0.78      0.23      0.30      3776
 samples avg       0.26      0.25      0.25      3776

