In [1]:
import json
import pickle
import pandas as pd
import numpy as np

In [2]:
import os
from tqdm.notebook import tqdm

In [3]:
# Move up one directory so that 'config.json' (opened by the next cell) resolves.
# Guarded on that file so re-running this cell does not climb one more directory
# each time it is executed.
if not os.path.exists('config.json'):
    os.chdir('../')

In [4]:
# Read the JSON configuration and keep only its 'sklearn' section.
with open('config.json', mode='r') as config_file:
    config = json.load(config_file)['sklearn']

In [5]:
# The CSV's first column is an unnamed saved index (read naively it shows up as an
# "Unnamed: 0" column); load it as the index instead of as a spurious data column.
# Only `photo_id` and `is_business` are consumed further down.
br = pd.read_csv(config['br_data_path'], index_col=0)

In [6]:
# Preview the first five rows of the label table.
br.head(n=5)

Unnamed: 0,photo_id,is_business
0,3V7tgMx3Qw5L9ZjRLNbthA,True
1,fZo1owoYqwAHW7uZlTz1XQ,False
2,zwOCQ8w3gFuF3zi_dyIWpw,False
3,hQBfeDngFMpB9HX2CPKtag,True
4,Fjh4N5B38vJWVbuQk-v3aQ,True


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# 90/10 split of the label table, stratified on the target so both parts keep the
# same class proportions. X_*_files are rows of `br` (their `photo_id` column is
# used later to look up features); y_* are the matching `is_business` labels.
X_train_files, X_test_files, y_train, y_test = train_test_split(
    br,
    br['is_business'],
    train_size=0.9,
    random_state=420,
    stratify=br['is_business'],
)

In [9]:
# Category names, positioned by the integer class id carried in each detection
# tuple: the feature-building loop sizes its vectors by the length of this list
# and indexes them with that id, so the order of the entries matters.
# NOTE(review): 'N/A' entries look like placeholders for unused class ids -- confirm
# against the detector that produced the detections.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [10]:
# Load the pickled detections. Judging by the preview a few cells below, this is a
# dict mapping a 1-tuple (photo_id,) to a list of (box, class_id, confidence).
# NOTE: unpickling executes arbitrary code -- only load files from a trusted source.
with open(config['obj_feats_path'], mode='rb') as objects_file:
    objects = pickle.load(objects_file)

In [11]:
# Indexing the loaded object by 'arr_0' implies an .npz archive; np.load then returns
# an NpzFile that keeps the file open, so use it as a context manager to close it
# once the payload has been extracted.
# NOTE: allow_pickle=True unpickles arbitrary objects -- only load trusted files.
with np.load(config['trf_feats_path'], allow_pickle=True) as trf_archive:
    # [()] unwraps the single Python object stored in the 0-d array
    # (presumably a dict keyed by photo id, since it is indexed that way later).
    trf_features = trf_archive['arr_0'][()]

In [12]:
# Peek at the first two (key, detections) entries to see the data layout.
[entry for _, entry in zip(range(2), objects.items())]

[(('LYiu06twTYN5_HndA_b-Cg',),
  [([(166.24751, 158.46727), (223.51633, 339.36807)], 46, 0.9968928),
   ([(115.754234, 100.15254), (175.79247, 289.48053)], 44, 0.9934689),
   ([(5.7081475, 143.40843), (57.751545, 327.2468)], 46, 0.99128866),
   ([(84.24994, 52.882652), (170.16812, 136.35014)], 1, 0.9865284),
   ([(4.960636, 227.64551), (220.60417, 397.3526)], 67, 0.96494514),
   ([(0.8416633, 4.649881), (220.56383, 127.64469)], 79, 0.78191566),
   ([(0.0, 222.43925), (18.95716, 284.204)], 47, 0.67385864),
   ([(100.82414, 291.31363), (144.11972, 337.40964)], 61, 0.5718027)]),
 (('EyRUH511mIh3We4Ce0PFVg',),
  [([(268.95145, 183.03993), (300.0, 228.78711)], 8, 0.79613465)])]

In [13]:
# One slot per category id. Must be evaluated before the pruning cell further down,
# which deletes entries from COCO_INSTANCE_CATEGORY_NAMES in place.
vector_size = len(COCO_INSTANCE_CATEGORY_NAMES)

In [13]:
# photo id -> object-detection feature vector, filled by the loop in the next cell.
# Despite the name, the stored values are max confidences and counts, not 0/1 flags.
binary_feature_vectors = {}

In [14]:
# For every photo build a 2 * vector_size descriptor from its detections:
#   first half  -> highest confidence seen per category id
#   second half -> number of detections per category id
for photo_key, detections in tqdm(objects.items()):
    best_confidence = np.zeros(vector_size)
    occurrences = np.zeros(vector_size)
    for detection in detections:
        if not detection:
            # Skip empty/falsy detection entries.
            continue
        # Each detection is (box, category id, confidence); the box is not used.
        _, category, score = detection
        best_confidence[category] = max(best_confidence[category], score)
        occurrences[category] += 1
    # Keys of `objects` are 1-tuples; store under the bare photo id.
    binary_feature_vectors[photo_key[0]] = np.concatenate([best_confidence, occurrences])

  0%|          | 0/84222 [00:00<?, ?it/s]

In [15]:
# Stack the per-photo vectors into one matrix; rows follow the dict's insertion order.
all_vectors = np.array([*binary_feature_vectors.values()])

In [16]:
# Collect the indices of columns that hold the same value in every row: a constant
# feature carries no information for the classifiers.
trans_arr = all_vectors.T
empty_columns = [
    col_idx
    for col_idx, column in enumerate(trans_arr)
    if (column == column[0]).all()
]

In [17]:
# Show which column indices were found to be constant.
empty_columns

[0,
 12,
 26,
 29,
 30,
 45,
 66,
 68,
 69,
 71,
 83,
 91,
 103,
 117,
 120,
 121,
 136,
 157,
 159,
 160,
 162,
 174]

In [18]:
# Remove the constant columns found above.
# The original looped over the indices and called np.delete once per column, copying
# the whole matrix each time; a single call with the full index list yields the same
# matrix with one copy.
# NOTE: this cell rewrites all_vectors and COCO_INSTANCE_CATEGORY_NAMES in place and
# is therefore not idempotent -- re-run the notebook from the top instead of
# re-running this cell on its own.
dropped_columns = set(empty_columns)

# Only indices that fall inside the name list remove a name; indices in the second
# (counts) half of the feature vector have no entry of their own in the list.
COCO_INSTANCE_CATEGORY_NAMES[:] = [
    label
    for position, label in enumerate(COCO_INSTANCE_CATEGORY_NAMES)
    if position not in dropped_columns
]
all_vectors = np.delete(all_vectors, empty_columns, axis=1)

In [19]:
# Photo ids in the same order as the rows of all_vectors (both come from the same dict).
names = list(binary_feature_vectors)

In [20]:
# Final per-photo feature vector: the precomputed `trf_features` entry followed by
# the pruned object-detection vector.
features = {}
for photo_id, object_vector in zip(names, all_vectors):
    features[photo_id] = np.concatenate((trf_features[photo_id], object_vector))

In [21]:
# Containers for the per-photo feature rows of each split.
X_train = []
X_test = []

In [22]:
# Sanity check: number of photos that received a combined feature vector.
len(features)

84222

In [23]:
# Look up the feature vector of every training photo, in split order.
# Built in a single expression so that re-running this cell replaces X_train instead
# of appending duplicates to it (or failing once X_train has become an ndarray).
X_train = [features[filename] for filename in tqdm(X_train_files.photo_id)]

  0%|          | 0/53600 [00:00<?, ?it/s]

In [24]:
# Look up the feature vector of every test photo, in split order.
# Built in a single expression so that re-running this cell replaces X_test instead
# of appending duplicates to it (or failing once X_test has become an ndarray).
X_test = [features[filename] for filename in tqdm(X_test_files.photo_id)]

  0%|          | 0/5956 [00:00<?, ?it/s]

In [25]:
# Turn the lists of per-photo vectors into 2-D matrices for scikit-learn.
X_train, X_test = np.array(X_train), np.array(X_test)

In [26]:
# Sanity check: (number of test photos, number of features per photo).
X_test.shape

(5956, 1184)

In [27]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, balanced_accuracy_score

In [28]:
# Logistic regression on standardized features; the scaler is fitted inside the
# pipeline, i.e. on the training data only.
lr_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=10000, random_state=42),
).fit(X_train, y_train)

In [28]:
# Report balanced accuracy explicitly, like the SVC and random-forest cells do.
# Plain accuracy (`.score`) only equals it while the test classes are exactly
# balanced; computing it directly keeps the metric right if the split changes.
balanced_accuracy_score(y_test, lr_clf.predict(X_test))

0.699798522498321

In [29]:
# F1 of the logistic-regression model on the held-out split.
f1_score(y_true=y_test, y_pred=lr_clf.predict(X_test))

0.7130016051364366

In [31]:
# Bernoulli naive Bayes on standardized features.
nb_clf = make_pipeline(
    StandardScaler(),
    BernoulliNB(),
).fit(X_train, y_train)

In [32]:
# Report balanced accuracy explicitly, like the SVC and random-forest cells do.
# Plain accuracy (`.score`) only equals it while the test classes are exactly
# balanced; computing it directly keeps the metric right if the split changes.
balanced_accuracy_score(y_test, nb_clf.predict(X_test))

0.6334788448623238

In [33]:
# F1 of the naive-Bayes model on the held-out split.
f1_score(y_true=y_test, y_pred=nb_clf.predict(X_test))

0.6533269811021121

In [28]:
# Support-vector classifier (gamma='auto') on standardized features.
svc_clf = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
).fit(X_train, y_train)

In [41]:
# Cache the SVC predictions so both metric cells below reuse a single predict call.
# NOTE: `y_pred` is also assigned by the random-forest cell; the metric cells read
# whichever assignment ran last, so run these cells in order.
y_pred = svc_clf.predict(X_test)

In [42]:
# Balanced accuracy of the predictions currently held in y_pred.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.727165883143049

In [43]:
# F1 of the predictions currently held in y_pred.
f1_score(y_true=y_test, y_pred=y_pred)

0.7446173188747447

In [35]:
# Random forest (fixed seed) on standardized features.
rf_clf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=42),
).fit(X_train, y_train)

In [37]:
# Cache the random-forest predictions for the two metric cells below.
# NOTE: this overwrites the `y_pred` assigned by the SVC cell; the metric cells read
# whichever assignment ran last, so run these cells in order.
y_pred = rf_clf.predict(X_test)

In [38]:
# F1 of the predictions currently held in y_pred.
f1_score(y_true=y_test, y_pred=y_pred)

0.7175596159801797

In [39]:
# Balanced accuracy of the predictions currently held in y_pred.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6937541974479516