In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from tqdm.notebook import tqdm

In [3]:
from metrics import multiclass_stats

In [4]:
# Move into the dataset root so the relative 'data/...' paths used below resolve.
# The guard keeps this cell idempotent: re-running it in a live kernel would
# otherwise look for a nested 'yelp_academic' directory and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'yelp_academic':
    os.chdir('yelp_academic')

In [5]:
# Ambience label table: the preview below shows a photo_id column followed by
# six boolean ambience flags.
ambience_csv = 'data/business_ambience.csv'
br = pd.read_csv(ambience_csv)

In [6]:
# Preview the first five rows to sanity-check the columns that were loaded.
br.head(n=5)

Unnamed: 0,photo_id,touristy,hipster,romantic,divey,intimate,upscale
0,ZlTwL6uWx6rW_L9Df5RT8A,False,False,True,False,True,False
1,fHbSMxueQfXFRb9e-6bJuw,False,False,False,True,False,False
2,74oWvVVIjms9LjfHQOgxMQ,False,False,False,False,False,True
3,QY6c1OKsIpujF4MDHQdbag,False,True,False,False,False,False
4,0AYEzNJYFF2PeXo71cpKuw,False,True,False,False,False,False


In [7]:
# Names of the ambience label columns in `br`; these form the multi-label target.
feature_cols = [
    'touristy',
    'hipster',
    'romantic',
    'divey',
    'intimate',
    'upscale',
]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# 90/10 split of photo ids and their label rows. The label frame is passed both
# as the target and as the stratification key, with a fixed seed so the split
# is reproducible.
ambience_labels = br[feature_cols]
X_train_files, X_test_files, y_train, y_test = train_test_split(
    br.photo_id,
    ambience_labels,
    train_size=0.9,
    random_state=420,
    stratify=ambience_labels,
)

In [10]:
# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects, which can
# execute code. Only load this archive if it comes from a trusted source.
# The context manager closes the underlying .npz file handle as soon as the
# stored array has been read, instead of leaving it open for the kernel's lifetime.
with np.load('data/transfer_features/densenet_features.npz', allow_pickle=True) as archive:
    # Indexing with () unwraps the stored object (presumably a 0-d object array
    # holding a mapping); the cells below look entries up by photo id.
    features = archive['arr_0'][()]

In [11]:
# Start from two separate empty lists that will hold per-photo feature vectors.
X_train = []
X_test = []

In [12]:
# Collect the feature vector of every training photo, in split order.
# Rebinding X_train in a single expression keeps this cell idempotent: re-running
# it no longer appends duplicate rows, and it no longer fails with AttributeError
# once X_train has been converted to an ndarray further down.
X_train = [features[filename] for filename in tqdm(X_train_files)]

  0%|          | 0/29635 [00:00<?, ?it/s]

In [13]:
# Collect the feature vector of every held-out photo, in split order.
# Rebinding X_test in a single expression keeps this cell idempotent: re-running
# it no longer appends duplicate rows, and it no longer fails with AttributeError
# once X_test has been converted to an ndarray further down.
X_test = [features[filename] for filename in tqdm(X_test_files)]

  0%|          | 0/3293 [00:00<?, ?it/s]

In [14]:
# Stack the collected per-photo vectors into NumPy arrays for scikit-learn.
X_train, X_test = (np.array(rows) for rows in (X_train, X_test))

In [15]:
# Sanity check: (number of test photos, feature dimension).
np.shape(X_test)

(3293, 1024)

In [16]:
# Convert the boolean label frames to integer arrays.
# np.asarray accepts both the DataFrame produced by train_test_split and the
# ndarray left behind by a previous run, so the cell can be re-executed safely
# (DataFrame.to_numpy would raise AttributeError on the second run).
y_train = np.asarray(y_train, dtype='int')
y_test = np.asarray(y_test, dtype='int')

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import multilabel_confusion_matrix, classification_report, hamming_loss, f1_score, roc_auc_score

In [23]:
# Base estimator: class-weighted logistic regression with a fixed seed and a
# generous iteration cap.
balanced_lr = LogisticRegression(
    max_iter=10000,
    random_state=42,
    class_weight='balanced',
)

# Standardise the features, then wrap the base estimator in
# MultiOutputClassifier to handle the multi-column target.
lr_pipeline = make_pipeline(
    StandardScaler(),
    MultiOutputClassifier(balanced_lr),
)

# fit() returns the fitted pipeline itself, so lr_clf is the trained model.
lr_clf = lr_pipeline.fit(X_train, y_train)

In [27]:
# Predict the label matrix for the held-out features with the fitted pipeline.
y_pred = lr_clf.predict(X=X_test)

In [28]:
# `report` was not assigned in any earlier cell, so on a fresh kernel this cell
# raised NameError. Build the per-label precision/recall/F1 summary here before
# printing it.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.39      0.64      0.49       730
           1       0.54      0.64      0.59      1253
           2       0.22      0.59      0.33       464
           3       0.22      0.68      0.33       304
           4       0.21      0.62      0.31       440
           5       0.35      0.70      0.47       585

   micro avg       0.33      0.64      0.44      3776
   macro avg       0.32      0.64      0.42      3776
weighted avg       0.38      0.64      0.46      3776
 samples avg       0.36      0.65      0.44      3776



In [29]:
# One 2x2 confusion matrix per label, computed from the true and predicted
# label matrices.
multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[[1841,  722],
        [ 263,  467]],

       [[1365,  675],
        [ 452,  801]],

       [[1876,  953],
        [ 189,  275]],

       [[2263,  726],
        [  98,  206]],

       [[1839, 1014],
        [ 169,  271]],

       [[1948,  760],
        [ 177,  408]]])