In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from tqdm.notebook import tqdm

In [3]:
# Enter the project directory so the relative 'data/...' and 'comparison/...'
# paths used in later cells resolve. Guarded so that re-running this cell is a
# no-op instead of attempting a nested chdir from inside the directory.
if os.path.basename(os.getcwd()) != 'yelp_academic':
    os.chdir('yelp_academic')

In [4]:
# Photo table: one row per photo id with its boolean `is_business` label
# (see the preview printed by the next cell).
br = pd.read_csv(filepath_or_buffer='data/business_restaurant.csv')

In [5]:
# Preview the first five rows of the photo table.
br.head(n=5)

Unnamed: 0,photo_id,is_business
0,3V7tgMx3Qw5L9ZjRLNbthA,True
1,fZo1owoYqwAHW7uZlTz1XQ,False
2,zwOCQ8w3gFuF3zi_dyIWpw,False
3,hQBfeDngFMpB9HX2CPKtag,True
4,Fjh4N5B38vJWVbuQk-v3aQ,True


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# 90/10 split of the photo table. The `is_business` column is passed both as
# the target and as the stratification key; the seed is fixed so the split is
# repeatable.
X_train_files, X_test_files, y_train, y_test = train_test_split(
    br,
    br['is_business'],
    train_size=0.9,
    random_state=420,
    stratify=br['is_business'],
)

In [19]:
# Load the precomputed VGG transfer features. The `[()]` index unwraps the
# single object stored under 'arr_0' (presumably a dict-like mapping keyed by
# photo id, since later cells index it with photo ids — confirm).
# The archive is opened in a `with` block so the underlying file handle is
# closed once the object has been read.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# this file if it comes from a trusted source.
with np.load('data/transfer_features/vgg_features.npz', allow_pickle=True) as feature_archive:
    features = feature_archive['arr_0'][()]

In [20]:
# Start with two independent, empty feature containers.
X_train = list()
X_test = list()

In [21]:
# Look up the feature vector of every training photo, in split order.
# The list is rebuilt from scratch (rather than appended to) so re-running this
# cell never duplicates rows and still works after X_train has been converted
# to an ndarray further down.
X_train = [features[filename] for filename in tqdm(X_train_files.photo_id)]

  0%|          | 0/53600 [00:00<?, ?it/s]

In [22]:
# Look up the feature vector of every held-out photo, in split order.
# The list is rebuilt from scratch (rather than appended to) so re-running this
# cell never duplicates rows and still works after X_test has been converted
# to an ndarray further down.
X_test = [features[filename] for filename in tqdm(X_test_files.photo_id)]

  0%|          | 0/5956 [00:00<?, ?it/s]

In [23]:
# Stack the per-photo feature vectors into 2-D arrays for scikit-learn.
# Note: this rebinds both names from list to ndarray.
X_train, X_test = np.array(X_train), np.array(X_test)

In [24]:
# Sanity check: (number of test photos, feature dimension).
np.shape(X_test)

(5956, 4096)

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score

In [38]:
# Logistic regression on standardized features. max_iter is raised to 10000
# and the seed is fixed for repeatability.
lr_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=10000, random_state=42),
).fit(X_train, y_train)

In [39]:
# Held-out score of the logistic-regression pipeline.
lr_clf.score(X=X_test, y=y_test)

0.6937541974479516

In [40]:
# Confusion matrix of the logistic-regression predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=lr_clf.predict(X_test))

array([[1935, 1043],
       [ 781, 2197]])

In [41]:
# Flatten the 2x2 confusion matrix and name its four cells before printing.
lr_confusion = confusion_matrix(y_test, lr_clf.predict(X_test))
tn, fp, fn, tp = lr_confusion.ravel()
print(f"tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}")

tn: 1935, fp: 1043, fn: 781, tp: 2197


In [42]:
# F1 score of the logistic-regression predictions on the test split.
f1_score(y_true=y_test, y_pred=lr_clf.predict(X_test))

0.7066580894178193

In [43]:
# Bernoulli naive Bayes on standardized features.
# NOTE(review): BernoulliNB models binary features and, per the scikit-learn
# docs, binarizes its input at a threshold (default 0.0). After StandardScaler
# that would reduce each feature to "above/below its training mean" — confirm
# this is intended rather than e.g. GaussianNB.
nb_clf = make_pipeline(
    StandardScaler(),
    BernoulliNB(),
).fit(X_train, y_train)

In [44]:
# Held-out score of the naive-Bayes pipeline.
nb_clf.score(X=X_test, y=y_test)

0.630456682337139

In [45]:
# Confusion matrix of the naive-Bayes predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=nb_clf.predict(X_test))

array([[1705, 1273],
       [ 928, 2050]])

In [46]:
# Flatten the 2x2 confusion matrix and name its four cells before printing.
nb_confusion = confusion_matrix(y_test, nb_clf.predict(X_test))
tn, fp, fn, tp = nb_confusion.ravel()
print(f"tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}")

tn: 1705, fp: 1273, fn: 928, tp: 2050


In [47]:
# F1 score of the naive-Bayes predictions on the test split.
f1_score(y_true=y_test, y_pred=nb_clf.predict(X_test))

0.6506903666084749

In [15]:
# Support-vector classifier (gamma='auto') on standardized features.
svc_clf = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
).fit(X_train, y_train)

In [None]:
# Held-out score of the SVC pipeline.
svc_clf.score(X=X_test, y=y_test)

In [16]:
# Cache the SVC test predictions; they are reused by the cells that follow.
y_pred = svc_clf.predict(X=X_test)

In [None]:
# F1 score of the cached predictions in `y_pred` on the test split.
f1_score(y_true=y_test, y_pred=y_pred)

In [17]:
# Materialize plain Python lists so photo ids and labels can be addressed by
# position. Note: `y_test` is rebound to a list from here on.
test_filenames = [*X_test_files['photo_id']]
y_test = [*y_test]

In [18]:
# Accumulator for one record per misclassified test photo.
mispredicted_list = list()

In [19]:
# Collect one record per test photo whose prediction disagrees with its label.
# Each record has the keys 'filename', 'actual' and 'prediction'.
# The list is rebuilt in a single pass (instead of appending to a list created
# in another cell) so re-running this cell never duplicates records, and no
# throwaway dict is allocated for correctly classified photos.
mispredicted_list = [
    {'filename': photo_id, 'actual': actual, 'prediction': predicted}
    for photo_id, actual, predicted in zip(test_filenames, y_test, y_pred)
    if not predicted == actual
]

In [20]:
import pickle

# Persist the misprediction records for later comparison.
# NOTE(review): the filename says "densenet" but the features loaded earlier in
# this notebook come from 'vgg_features.npz' — confirm the name is intended.
with open('comparison/br_svc_densenet_transfer_mispredicted.pickle', 'wb') as pickle_file:
    pickle.dump(mispredicted_list, pickle_file)

In [26]:
# Random forest with a fixed seed, fitted directly on the features (no
# StandardScaler step here, unlike the other models in this notebook).
rf_clf = (
    RandomForestClassifier(random_state=42)
    .fit(X_train, y_train)
)

In [27]:
# Held-out score of the random forest.
rf_clf.score(X=X_test, y=y_test)

0.6831766286098052

In [28]:
# Cache the random-forest test predictions (overwrites the earlier `y_pred`).
y_pred = rf_clf.predict(X=X_test)

In [29]:
# F1 score of the cached predictions in `y_pred` on the test split.
f1_score(y_true=y_test, y_pred=y_pred)

0.7078495123084068