In [1]:
import tensorflow as tf
import pandas as pd
# from keras.preprocessing.image import ImageDataGenerator
# from keras.models import Sequential, Model
# from keras.layers import BatchNormalization, Conv2D, Activation, Dense, GlobalAveragePooling2D, MaxPooling2D, ZeroPadding2D, Add, Input, Flatten

# import time
import os
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
# import matplotlib.pyplot as plt
# import matplotlib.style as style
# import seaborn as sns
# import cv2
# from utils import *

# BATCH_SIZE = 256  # Big enough to measure an F1-score
# # Adapt preprocessing and prefetching dynamically
# AUTOTUNE = tf.data.experimental.AUTOTUNE
# SHUFFLE_BUFFER_SIZE = 8192


In [4]:
dir_path = './database_test'

file_list = os.listdir(dir_path)
ID = []
Labels = []

for file_name in file_list:
    file_name = file_name[0: len(file_name) - 4]
    label_list = file_name.split('+')
    if (label_list[0] == ''):
        continue
    ID.append(label_list[0])
    del label_list[0]

    valid_label_list = []
    for label in label_list:
        label = label.lower()
        labels = label.split(', ')
        valid_label_list.extend(labels)

    while True:
        try:
            valid_label_list.remove('')
        except ValueError:
            break

    Labels.append(valid_label_list)

artworks = pd.DataFrame({"ID": ID, "Labels": Labels, "File": file_list})
artworks.dropna()

# Get label frequencies in descending order
label_freq = artworks['Labels'].apply(
    lambda s: [x for x in s]).explode().value_counts().sort_values(ascending=False)


In [5]:
# Create a list of rare labels
# rare = list(label_freq[label_freq < 50].index)
rare = list(label_freq[label_freq < 25].index)
print("Number of used labels:", len(label_freq) - len(rare))
print("Number of ignored labels:", len(rare))
label_number = len(label_freq) - len(rare)

artworks['Labels'] = artworks['Labels'].apply(
    lambda s: [x for x in s if x not in rare])
artworks['Labels'] = artworks['Labels'].apply(
    lambda s: s if len(s) != 0 else np.nan)

artworks.dropna(inplace=True)
artworks.head(10)


Number of used labels: 58
Number of ignored labels: 97


Unnamed: 0,ID,Labels,File
0,10,"[allegorical painting, mannerism (late renaiss...",10+allegorical painting+Mannerism (Late Renais...
1,100,"[still life, naïve art (primitivism), oil]","100+still life+Naïve Art (Primitivism)+oil, co..."
2,1000,"[religious painting, baroque, tenebrism, oil, ...","1000+religious painting+Baroque, Tenebrism+oil..."
3,1001,"[religious painting, baroque, tenebrism, oil, ...","1001+religious painting+Baroque, Tenebrism+oil..."
4,1002,"[religious painting, baroque, tenebrism, oil, ...","1002+religious painting+Baroque, Tenebrism+oil..."
5,1003,"[religious painting, baroque, tenebrism, oil, ...","1003+religious painting+Baroque, Tenebrism+oil..."
6,1004,"[religious painting, baroque, tenebrism, oil, ...","1004+religious painting+Baroque, Tenebrism+oil..."
7,1005,"[mythological painting, baroque, tenebrism, oi...","1005+mythological painting+Baroque, Tenebrism+..."
8,1006,"[still life, baroque, tenebrism, oil, canvas]","1006+still life+Baroque, Tenebrism+oil, canvas..."
9,1007,"[religious painting, baroque, tenebrism, oil, ...","1007+religious painting+Baroque, Tenebrism+oil..."


In [6]:
from sklearn.model_selection import train_test_split
# 파일 경로 및 label 집합으로 train data와 valid data parsing
X_train, X_val, y_train, y_val = train_test_split(
    artworks['File'], artworks['Labels'], test_size=0.1, random_state=23)

X_train = [os.path.join('./database_test', str(f)) for f in X_train]
X_val = [os.path.join('./database_test', str(f)) for f in X_val]
X_train[:3]


['./database_test\\2105+sculpture+Baroque+.jpg',
 './database_test\\1231+portrait+Pop Art+oil, canvas.jpg',
 './database_test\\1972+portrait+Neoclassicism+pastel.jpg']

In [7]:
# MultiLabelBinarizer로 다중 label 이진화
mlb = MultiLabelBinarizer()
mlb.fit(artworks['Labels'])

# Loop over all labels and show them
N_LABELS = len(mlb.classes_)

y_train_bin = mlb.transform(y_train)
mlb.fit(y_val)
y_val_bin = mlb.transform(y_val)

for _ 는 변수없이 반복문 수행

In [8]:
from itertools import product

node_train_df = pd.DataFrame(columns=['class'])
for (i, label) in enumerate(mlb.classes_):
  node_train_df.loc[i] = label

node_train_df.reset_index()

class_combinations = list(product(mlb.classes_, repeat=2))
edge_train_df = pd.DataFrame(columns=['class1', 'class2', 'correlation'])

# edge_df 초기화
for (i, class_combination) in enumerate(class_combinations):
  edge_train_df.loc[i] = [class_combination[0], class_combination[1], 0]

edge_train_df.reset_index()

# 각 edge_df 값 주입.
for label in Labels:
  label_combinations = list(product(label, repeat=2))
  for label_combination in label_combinations:
    condition = (edge_train_df['class1'] == label_combination[0]) & (
        edge_train_df['class2'] == label_combination[1])

    if (edge_train_df[condition].index.tolist()):
      index = edge_train_df[condition].index.tolist()[0]
      edge_train_df.loc[index, 'correlation'] += 1
     
merge_train_df = node_train_df.reset_index().set_index(
    'class').rename(columns={'index': 'class1_idx'})
edge_train_df = pd.merge(
    edge_train_df, merge_train_df['class1_idx'], how='left', left_on='class1', right_index=True)
merge_train_df = merge_train_df.rename(columns={'class1_idx': 'class2_idx'})
edge_train_df = pd.merge(
    edge_train_df, merge_train_df['class2_idx'], how='left', left_on='class2', right_index=True)

print(edge_train_df)


        class1                class2  correlation  class1_idx  class2_idx
0     abstract              abstract          205           0           0
1     abstract          abstract art          135           0           1
2     abstract  allegorical painting            0           0           2
3     abstract  art nouveau (modern)            0           0           3
4     abstract               baroque            0           0           4
...        ...                   ...          ...         ...         ...
3131      wood            surrealism            0          55          51
3132      wood               tempera           22          55          52
3133      wood             tenebrism            1          55          53
3134      wood            watercolor            0          55          54
3135      wood                  wood           76          55          55

[3136 rows x 5 columns]


In [7]:
import tensorflow_gnn as tfgnn

def create_graph_tensor(node_df, edge_df):
    graph_tensor = tfgnn.GraphTensor.from_pieces(
        node_sets={
            "Label": tfgnn.NodeSet.from_fields(
                sizes=[len(node_df)],
                features={
                    'index': np.array(node_df.index, dtype='int32').reshape(len(node_df), 1)
                    # 'feature': tf.constant(y_train_bin)
                }
            )
        },
        edge_sets={
            "Correlation": tfgnn.EdgeSet.from_fields(
                sizes=[len(edge_df)],
                features={
                    'connection-strength': np.array(edge_df['correlation'], dtype='float').reshape(len(edge_df), 1)
                },
                adjacency=tfgnn.Adjacency.from_indices(
                    source=("Label", np.array(
                        edge_df['class1_idx'], dtype='int32')),
                    target=("Label", np.array(
                        edge_df['class2_idx'], dtype='int32'))
                )
            )
        }
    )
    return graph_tensor

train_tensor = create_graph_tensor(node_train_df, edge_train_df)
dataset = tf.data.Dataset.from_tensors(train_tensor)
print(dataset)


<TensorDataset element_spec=GraphTensorSpec({'context': ContextSpec({'features': {}, 'sizes': TensorSpec(shape=(1,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, None), 'node_sets': {'Label': NodeSetSpec({'features': {'index': TensorSpec(shape=(42, 1), dtype=tf.int32, name=None)}, 'sizes': TensorSpec(shape=(1,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, None)}, 'edge_sets': {'Correlation': EdgeSetSpec({'features': {'connection-strength': TensorSpec(shape=(1764, 1), dtype=tf.float64, name=None)}, 'sizes': TensorSpec(shape=(1,), dtype=tf.int32, name=None), 'adjacency': AdjacencySpec({'#index.0': TensorSpec(shape=(1764,), dtype=tf.int32, name=None), '#index.1': TensorSpec(shape=(1764,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, {'#index.0': 'Label', '#index.1': 'Label'})}, TensorShape([]), tf.int32, None)}}, TensorShape([]), tf.int32, None)>


In [30]:
def node_batch_merge(graph):
    graph = graph.merge_batch_to_components()
    node_features = graph.node_sets['Label'].get_features_dict()
    edge_features = graph.edge_sets['Correlation'].get_features_dict()
    
    label = node_features['index']
    new_graph = graph.replace_features(
        node_sets={'Label': node_features},
        edge_sets={'Correlation': edge_features})
    return new_graph, label

def edge_batch_merge(graph):
    graph = graph.merge_batch_to_components()
    node_features = graph.node_sets['Label'].get_features_dict()
    edge_features = graph.edge_sets['Correlation'].get_features_dict()
    
    new_graph = graph.replace_features(
        node_sets={'Label': node_features},
        edge_sets={'Correlation': edge_features})
    return new_graph

def create_dataset(graph,function):
    dataset = tf.data.Dataset.from_tensors(graph)
    dataset = dataset.batch(32)
    return dataset.map(function)

train_node_dataset = create_dataset(train_tensor, node_batch_merge)
train_edge_dataset = create_dataset(train_tensor, edge_batch_merge)

print(train_node_dataset)
print(train_edge_dataset)


<MapDataset element_spec=(GraphTensorSpec({'context': ContextSpec({'features': {}, 'sizes': TensorSpec(shape=(None,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, None), 'node_sets': {'Label': NodeSetSpec({'features': {'index': TensorSpec(shape=(None, 1), dtype=tf.int32, name=None)}, 'sizes': TensorSpec(shape=(None,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, None)}, 'edge_sets': {'Correlation': EdgeSetSpec({'features': {'connection-strength': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None)}, 'sizes': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'adjacency': AdjacencySpec({'#index.0': TensorSpec(shape=(None,), dtype=tf.int32, name=None), '#index.1': TensorSpec(shape=(None,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, {'#index.0': 'Label', '#index.1': 'Label'})}, TensorShape([]), tf.int32, None)}}, TensorShape([]), tf.int32, None), TensorSpec(shape=(None, 1), dtype=tf.int32, name=None))>
<MapDataset element_spec=GraphTensorSpec({'c