In [None]:
# Show the NVIDIA GPU status for this runtime before any model is loaded.
!nvidia-smi

In [None]:
!pip install transformers > /dev/null

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import re
from dataclasses import dataclass
import cv2
from PIL import Image
from transformers import BertTokenizer
from tqdm import tqdm
tqdm.pandas()


from keras.preprocessing.image import load_img
from keras.applications.resnet50 import preprocess_input 
from keras.applications.resnet50 import ResNet50
from keras.models import Model

In [None]:
# Alternative locations (paths under a mounted drive/MyDrive folder):
#DATADIR = "drive/MyDrive/atma10/input/"
#OUTPUTDIR = "drive/MyDrive/atma10/feature/"
DATADIR = "../input/"
OUTPUTDIR = "../feature/"

# Read the three input tables in a fixed order: palette, train, test.
palette, train_data, test_data = (
    pd.read_csv(DATADIR + csv_name)
    for csv_name in ("palette.csv", "train.csv", "test.csv")
)

# Stack train on top of test and renumber the rows 0..n-1.
all_df = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Return a new DataFrame whose numeric columns use the narrowest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are examined; every other column is passed through unchanged. The dtype is
    chosen from the column's minimum and maximum value.

    Note that float columns may be narrowed to float16, which keeps the value
    range but reduces precision.

    Args:
        df: DataFrame to compact. It is not modified.
        verbose: When True, print the resulting size in Mb and the percentage
            saved relative to ``df``.

    Returns:
        A DataFrame with the same columns (in the same order) as ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    def _narrowest_dtype(series):
        # Smallest dtype of the same kind whose range covers the column.
        c_min = series.min()
        c_max = series.max()
        if str(series.dtype)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    return candidate
            # Always fall back to int64 so that no column is ever dropped
            # (covers values at the int64 limits and empty columns, whose
            # min/max are NaN and therefore fail every comparison).
            return np.int64
        for candidate in (np.float16, np.float32):
            limits = np.finfo(candidate)
            if c_min > limits.min and c_max < limits.max:
                return candidate
        return np.float64

    dfs = []
    for col in df.columns:  # handle one column at a time
        column = df[col]
        if column.dtype in numerics:
            dfs.append(column.astype(_narrowest_dtype(column)))
        else:
            dfs.append(column)

    # pd.concat raises on an empty list, so a frame without columns is copied.
    df_out = pd.concat(dfs, axis=1) if dfs else df.copy()
    if verbose:
        end_mem = df_out.memory_usage().sum() / 1024**2
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        # Format the numbers instead of slicing their string form, which
        # misreported values such as 1234.5 ("123") or 100.0 ("10").
        print(f'Mem. usage decreased to {end_mem:.2f}Mb:  {reduction:.1f}% reduction')
    return df_out

In [None]:
def extract_features(numpy_img, model):
    """Run a single 224x224, 3-channel image through ``model``.

    Args:
        numpy_img: Array holding exactly 224 * 224 * 3 values. It is copied
            first, so the caller's array is not modified.
        model: Object exposing a Keras-style ``predict`` method.

    Returns:
        The prediction for the one-image batch with length-1 axes squeezed out.
    """
    # Work on a copy shaped as a batch of one image.
    batch = numpy_img.copy().reshape(1, 224, 224, 3)
    batch = preprocess_input(batch)
    # `use_multiprocessing` only applies to generator/Sequence input and is not
    # accepted by Keras 3's predict(), so it is omitted. verbose=0 avoids one
    # progress bar per call when this function is used inside a loop.
    features = model.predict(batch, verbose=0)  # get the feature vector
    return features.squeeze()

In [None]:
# Build ResNet50 with its default arguments, then wrap it in a Model that maps
# the network inputs to the output of its last layer.
# NOTE(review): presumably the 1000-way head, matching the "embedding1000"
# output file name - confirm.
base_model = ResNet50()
model = Model(
    inputs=base_model.inputs,
    outputs=base_model.layers[-1].output,
)

In [None]:
# Distinct object ids that appear in the palette table.
exist_palette_list = palette['object_id'].unique().tolist()

# Keep only the id column of the train/test rows whose object has a palette,
# renumbering the rows from zero.
has_palette = all_df['object_id'].isin(exist_palette_list)
_all_df = all_df.loc[has_palette, ["object_id"]].reset_index(drop=True)

In [None]:
class Config:
    """Settings for the images synthesised from each object's palette."""
    img_width = 224 #512
    img_height = 224 #512
    seed = 42  # fixes the colour sampling so the saved features are reproducible

img_width = Config.img_width
img_height = Config.img_height
total = img_width * img_height  # number of pixels to sample per image

# Dedicated, seeded generator: re-running the notebook yields the same images.
rng = np.random.default_rng(Config.seed)

# For every object, draw each pixel's colour independently from the object's
# palette with probability proportional to `ratio`, then embed the image.
output_list = []
for _id, _df in tqdm(palette.groupby("object_id")):
    n_colors = len(_df)
    prob = _df["ratio"].values.astype(np.float64)
    # choice() raises when the probabilities do not sum to 1, so renormalise
    # to absorb rounding in the stored ratios.
    prob = prob / prob.sum()
    idx = rng.choice(a=n_colors, size=total, p=prob)
    # Index the small colour table with NumPy rather than DataFrame.iloc.
    rgb = _df[["color_r", "color_g", "color_b"]].values[idx]
    sampling_img = rgb.reshape(img_height, img_width, 3)
    feat = extract_features(sampling_img, model)

    output = [_id] + feat.tolist()
    output_list.append(output)

    # Debug aids: save or display the sampled image.
    #pil_img = Image.fromarray(sampling_img)
    #pil_img.save(OUTPUTDIR + f'{_id}.png')
    #plt.imshow(sampling_img)
    #plt.show()

In [None]:
# One row per object: the id first, then the feature values.
feature_frame = pd.DataFrame(output_list)
feature_frame = feature_frame.rename(columns={0:"object_id"})

# Shrink the numeric columns, then persist the table as a pickle.
df_out = reduce_mem_usage(feature_frame)
df_out.to_pickle(OUTPUTDIR + "ResNet50_palette_embedding1000.pkl")