# Yelp Restaurant Photo Classification

### Задание 4
Предсказание тегов ресторанов по фотографиям
Обучить модель предсказания тегов ресторанов по набору фото. Исходные данные и валидация на Kaggle в рамках контеста Yelp Restaurant Photo Classification(https://www.kaggle.com/c/yelp-restaurant-photo-classification). Шаблон ноутбука для подготовки решения: yelp-hw.ipynb. <br>
Решение необходимо прислать в виде ссылки на ipython-ноутбука с указанием значения метрики на Leaderboard. Задание засчитывается при значение метрики на Leaderboard больше 0.7.

Здесь соревнование - классификация ресторанов по фото на 8 разных классов (хорош для ужина/завтрака/детей и тд)

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from tensorflow.contrib import keras

preprocess_input = keras.applications.vgg16.preprocess_input
VGG16 = keras.applications.VGG16

  from ._conv import register_converters as _register_converters


## Загружаем разметку

In [2]:
train_biz_df = pd.read_csv('train.csv')
train_photos_df = pd.read_csv('train_photo_to_biz_ids.csv')
train_df = train_photos_df.merge(train_biz_df)

test_photos_df = pd.read_csv('test_photo_to_biz.csv')

In [3]:
def get_image(path, img_id, img_size=(224, 224)):
    img = cv2.imread(os.path.join(path, '%s.jpg' % img_id))[:,:,::-1]
    img = preprocess_input(img.astype(np.float32))
    return cv2.resize(img, img_size)

def to_dense(labels):
    result = [0] * 9
    for i in labels:
        result[i] = 1.
    return result

def train_generator(df, img_size=(224, 224), batch_size=32):
    while True:
        df = df.sample(frac=1).reset_index(drop=True)
        for i in range(0, len(df) // batch_size * batch_size, batch_size):
            X, y = [], []
            for _, row in df[i:i + batch_size].iterrows():
                X.append(get_image('train_photos', row['photo_id'], img_size))
                y.append(to_dense(map(int, str(row['labels']).split())))
            y = np.array(y)
            yield np.array(X), [y[:, i] for i in range(9)]

In [4]:
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.python.keras.optimizers import Adam
from sklearn.cross_validation import train_test_split



In [5]:
categories = {0: 'good_for_lunch', 1: 'good_for_dinner', 2: 'takes_reservations',  3: 'outdoor_seating',
                  4: 'restaurant_is_expensive', 5: 'has_alcohol', 6: 'has_table_service', 7: 'ambience_is_classy',
                  8: 'good_for_kids'}
IMG_SIZE = 224

In [6]:
train_df.dropna(inplace=True)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234545 entries, 0 to 234841
Data columns (total 3 columns):
photo_id       234545 non-null int64
business_id    234545 non-null int64
labels         234545 non-null object
dtypes: int64(2), object(1)
memory usage: 7.2+ MB


In [102]:
# TODO: build CNN model
# TODO: train CNN model using train_generator
# нужно придумать архитектуру нейронной сети и обучить ее!!!!

# здесь предлагается распараллелить данные

# лучше сделать предсказание по тэгам, а не классам (SoftMax?)

In [103]:
vgg16_net = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
vgg16_net.trainable = False

In [104]:
for layer in vgg16_net.layers:
    layer.trainable = False

In [105]:
vgg16_net.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [146]:
model_p = Sequential()
# Добавляем в модель сеть VGG16 вместо слоя
model_p.add(vgg16_net)
model_p.add(Flatten())
model_p.add(Dense(256,activation='relu'))
# model_p.add(Activation('relu'))
model_p.add(Dropout(0.5))

model_p.add(Dense(1,activation='softmax')) 
# #val_loss improved from inf to 72.21709 - лучший результат, был сохранен в yelp_weights1.h5
# #10/11903 [..............................] - ETA: 116:49:21

# model_p.add(Dense(1,activation='sigmoid')) 
# #val_loss improved from inf to 64.72608

In [147]:
x = model_p.output

predictions = []
for i in range(9):
    predictions.append(x)
model = Model(inputs=model_p.input, outputs=predictions)

In [148]:
# model_save_name = 'yelp_weights1.h5'
# Пробуем составить еще одну модель
model_save_name = 'yelp_weights2.h5'

In [149]:
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

In [150]:
# откладываем часть данных для теста
train_df1, test_df1 = train_test_split(train_df,
                                     test_size=0.11,
                                     random_state=123)
# мониторинг процесса обучения
lr_reduce = keras.callbacks.ReduceLROnPlateau(monitor='val_acc', 
                                              factor=0.1,
                                              epsilon=1e-5, 
                                              patience=5, 
                                              verbose=1)

checkpoint = keras.callbacks.ModelCheckpoint(model_save_name,
                                             save_best_only=True,
                                             verbose=1)

In [151]:
# закружаем валидационные изображения в память
test_gen = train_generator(test_df1, batch_size = 100) #len(test_df1))
test_X, test_y = next(test_gen)
batch_size = 100
train_generator_df = train_generator(train_df1, batch_size = batch_size)

In [152]:
# Попытка дообучить ни к чему не привела - val_loss падал
# if os.path.exists(model_save_name):
#     model.load_weights(model_save_name)

In [153]:
# Количество эпох и шагов было уменьшено по причине длительности обучения
epochs=5
steps_per_epoch=10

In [154]:
model.fit_generator(train_generator_df,
                    steps_per_epoch,
                    epochs=epochs,
                    shuffle=True,
                   # use_multiprocessing=True,
                    validation_data=(test_X, test_y),
                    callbacks=[lr_reduce, checkpoint])

Epoch 1/5

--- Logging error ---
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 992, in emit
    msg = self.format(record)
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 838, in format
    return fmt.format(record)
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 575, in format
    record.message = record.getMessage()
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 338, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.s


Epoch 00001: val_loss improved from inf to 64.56666, saving model to yelp_weights2.h5
Epoch 2/5
Epoch 00002: val_loss did not improve


--- Logging error ---
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 992, in emit
    msg = self.format(record)
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 838, in format
    return fmt.format(record)
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 575, in format
    record.message = record.getMessage()
  File "/anaconda3/lib/python3.6/logging/__init__.py", line 338, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.s

Epoch 3/5
 2/10 [=====>........................] - ETA: 4:31 - loss: 66.7189 - dense_71_loss: 8.0509 - dense_71_acc: 0.2350 - dense_71_acc_1: 0.6000 - dense_71_acc_2: 0.6550 - dense_71_acc_3: 0.5100 - dense_71_acc_4: 0.4000 - dense_71_acc_5: 0.7550 - dense_71_acc_6: 0.7500 - dense_71_acc_7: 0.4150 - dense_71_acc_8: 0.4950

KeyboardInterrupt: 

Последний показатель val_loss - 64.56666 <br>
Cохранен в модель yelp_weights2.h5

## Получаем предсказания

In [98]:
from datetime import datetime

In [99]:
model.load_weights('yelp_weights1.h5')

In [100]:
class TestSequence(keras.utils.Sequence):
    """ Класс для чтения батча """
    def __init__(self, df, batch_size):
        self._df = df
        self._batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self._df) / float(self._batch_size)))

    def __getitem__(self, idx):
        result = []
        sample = self._df[idx * self._batch_size:(idx + 1) * self._batch_size]
        for _, row in sample.iterrows():
            result.append(get_image('test_photos', row['photo_id']))
        return np.array(result)

In [101]:
%%time
print('Started at : {}'.format(datetime.now()))
# читаем данные параллельно в 20 потоков и применяем модель yelp_weights1.h5
# В связи с огромным временем обучения - процесс был прерван
preds = model.predict_generator(TestSequence(test_photos_df, batch_size=100), 
#                                  use_multiprocessing=True, 
                                workers=20, verbose=1)

Started at : 2018-08-05 17:12:16.667998
   10/11903 [..............................] - ETA: 116:49:21

KeyboardInterrupt: 

### ИТОГ
Для данного соревнования дообучение рукомендуемой модели VGG16 является достаточно ресурсоемким и требует вычесления на больших мощностях, нежели есть в наличии.

In [24]:
test_photos_df['labels'] = [[float(preds[j][i]) for j in range(9)] 
                            for i in range(len(test_photos_df))]

In [29]:
test_photos_df.head()

Unnamed: 0,photo_id,business_id,labels
0,317818,003sg,"[0.15017016232, 0.720844268799, 0.786582291126..."
1,30679,003sg,"[0.358560830355, 0.313709139824, 0.35823640227..."
2,455084,003sg,"[0.306661188602, 0.248710289598, 0.37077081203..."
3,371381,003sg,"[0.0900943800807, 0.979245781898, 0.9886131286..."
4,86224,003sg,"[0.0362958088517, 0.917712509632, 0.9531230926..."


In [30]:
business_df = test_photos_df.groupby('business_id')

In [61]:
# Группируем лейблы для субмита - тоже можно подумать, как лучше. Здесь по средней 
submission = []
for k, labels in business_df['labels'].apply(list).iteritems():
    labels = np.array(labels).mean(axis=0)
    labels = np.where(labels > 0.5)[0]
    submission.append((k, ' '.join(map(str, labels))))

In [70]:
submission_df = pd.DataFrame(submission, columns=('business_id', 'labels'))
submission_df.to_csv('submission.csv', index=False)
# 0.70970