In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from transformers import TFElectraModel
from transformers import ElectraTokenizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
strategy = tf.distribute.get_strategy()

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test_cat1_result.csv')

In [4]:
train_1 = train[train['cat1']=='자연']
train_2 = train[train['cat1']=='레포츠']
train_3 = train[train['cat1']=='음식']
train_4 = train[train['cat1']=='인문(문화/예술/역사)']
train_5 = train[train['cat1']=='숙박']
train_6 = train[train['cat1']=='쇼핑']

In [5]:
test.head()

Unnamed: 0,id,img_path,overview,cat1_result
0,TEST_00000,./image/test/TEST_00000.jpg,신선한 재료로 정성을 다해 만들었다. 늘 변함없는 맛과 서비스로 모실것을 약속한다.,음식
1,TEST_00001,./image/test/TEST_00001.jpg,"청청한 해역 등량만과 율포해수욕장이 한눈에 내려다 보이는 위치에 있으며, 막 잡은 ...",음식
2,TEST_00002,./image/test/TEST_00002.jpg,장터설렁탕은 남녀노소 누구나 즐길 수 있는 전통 건강식으로 좋은 재료와 전통 조리방...,음식
3,TEST_00003,./image/test/TEST_00003.jpg,다양한 형태의 청소년수련활동을 제공함으로써 청소년들이 민주사회의 주역이 될 수 있도...,레포츠
4,TEST_00004,./image/test/TEST_00004.jpg,팔공산은 경산시의 북쪽에 위치한 해발 1192.3 m의 높은 산으로 신라시대에는 중...,자연


In [6]:
test_1 = test[test['cat1_result']=='자연']
test_2 = test[test['cat1_result']=='레포츠']
test_3 = test[test['cat1_result']=='음식']
test_4 = test[test['cat1_result']=='인문(문화/예술/역사)']
test_5 = test[test['cat1_result']=='숙박']
test_6 = test[test['cat1_result']=='쇼핑']

In [7]:
electra_tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [6]:
def cat3_1():
    with strategy.scope():
        encoder = TFElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator", from_pt=True)

        input_layer = Input(shape=(650,), dtype=tf.int32, name="input_layer")
        sequence_output = encoder(input_layer)[0]

        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(20, activation='softmax')(cls_token)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [7]:
model = cat3_1()
model.load_weights('checkpoints/koelectra_cat3_detail_1/')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['electra.embeddings.position_ids', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'Literal' and 'str'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'Literal' and 'str'
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2c6b1801550>

In [13]:
test_data = test_1['overview'].values.tolist()
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data)
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data, max_length=650, pad_to_max_length='left')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [14]:
x_test = test_encoded_electra['input_ids']
test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(strategy.num_replicas_in_sync))
len(x_test)

807

In [18]:
pred = model.predict(test_dataset, verbose=1)
pred_arg = pred.argmax(axis=1)



In [20]:
encoder = LabelEncoder()

encoder.fit(train_1['cat3'])
result_1 = encoder.inverse_transform(pred_arg)

In [22]:
test_1['result'] = result_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_1['result'] = result_1


In [24]:
test_1.head()

Unnamed: 0,id,img_path,overview,cat1_result,result
4,TEST_00004,./image/test/TEST_00004.jpg,팔공산은 경산시의 북쪽에 위치한 해발 1192.3 m의 높은 산으로 신라시대에는 중...,자연,국립공원
12,TEST_00012,./image/test/TEST_00012.jpg,거창 서변은 높은 산들로 둘러싸인 분지형으로 예로부터 농업이 발달하고 인물이 많이 ...,자연,해수욕장
17,TEST_00017,./image/test/TEST_00017.jpg,강원도 양양군 현남면 동산리에 위치한 해수욕장이다. 7번 국도를 따라 북분리해수욕장...,자연,해수욕장
18,TEST_00018,./image/test/TEST_00018.jpg,"시무지기 폭포는 규봉암 아래 해발 고도 700m에 위치하고 있는 천연폭포이며, 전체...",자연,폭포
23,TEST_00023,./image/test/TEST_00023.jpg,"요트가 있는 풍경, 보령방조제 충남 보령시 오천면 충청수영로 소성삼거리에서 천북면을...",자연,자연생태관광지


In [39]:
test_1.to_csv('data/tset_cat3_1.csv', index=False, encoding='UTF8')

In [32]:
def cat3_2():
    with strategy.scope():
        encoder = TFElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator", from_pt=True)

        input_layer = Input(shape=(650,), dtype=tf.int32, name="input_layer")
        sequence_output = encoder(input_layer)[0]

        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(31, activation='softmax')(cls_token)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = cat3_2()
model.load_weights('checkpoints/koelectra_cat3_detail_2/')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['electra.embeddings.position_ids', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2c8244a7550>

In [33]:
test_data = test_2['overview'].values.tolist()
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data)
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data, max_length=650, pad_to_max_length='left')



In [34]:
x_test = test_encoded_electra['input_ids']
test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(strategy.num_replicas_in_sync))
len(x_test)

1034

In [35]:
pred = model.predict(test_dataset, verbose=1)
pred_arg = pred.argmax(axis=1)



In [36]:
encoder = LabelEncoder()

encoder.fit(train_2['cat3'])
result_2 = encoder.inverse_transform(pred_arg)

In [37]:
test_2['result'] = result_2
test_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_2['result'] = result_2


Unnamed: 0,id,img_path,overview,cat1_result,result
3,TEST_00003,./image/test/TEST_00003.jpg,다양한 형태의 청소년수련활동을 제공함으로써 청소년들이 민주사회의 주역이 될 수 있도...,레포츠,수련시설
19,TEST_00019,./image/test/TEST_00019.jpg,"도봉구에 위치한 창동문화체육센터는 50,075㎡의 대규모 시설을 갖추고 있으며, 다...",레포츠,수련시설
22,TEST_00022,./image/test/TEST_00022.jpg,가족소풍은 담양군 용면 용연리에 위치한 곳으로 가든과 펜션이 함께 조성된 캠핑장이다...,레포츠,"야영장,오토캠핑장"
32,TEST_00032,./image/test/TEST_00032.jpg,보은글램핑은 충북 보은군 속리산면에 자리 잡고 있다. 보은군청을 기점으로 10㎞가량...,레포츠,"야영장,오토캠핑장"
38,TEST_00038,./image/test/TEST_00038.jpg,충청남도 서천군 종천면에 위치한 희리산 해송자연휴양림 일반야영장은 사계절 해송으로 ...,레포츠,"야영장,오토캠핑장"


In [40]:
test_2.to_csv('data/tset_cat3_2.csv', index=False, encoding='UTF8')

In [9]:
def cat3_3():
    with strategy.scope():
        encoder = TFElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator", from_pt=True)

        input_layer = Input(shape=(650,), dtype=tf.int32, name="input_layer")
        sequence_output = encoder(input_layer)[0]

        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(8, activation='softmax')(cls_token)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = cat3_3()
model.load_weights('checkpoints/koelectra_cat3_detail_3/')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'electra.embeddings.position_ids']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2959019bb80>

In [10]:
test_data = test_3['overview'].values.tolist()
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data)
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data, max_length=650, pad_to_max_length='left')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
x_test = test_encoded_electra['input_ids']
test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(strategy.num_replicas_in_sync))
len(x_test)

2107

In [12]:
pred = model.predict(test_dataset, verbose=1)
pred_arg = pred.argmax(axis=1)



In [13]:
encoder = LabelEncoder()

encoder.fit(train_3['cat3'])
result_3 = encoder.inverse_transform(pred_arg)

In [14]:
test_3['result'] = result_3
test_3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_3['result'] = result_3


Unnamed: 0,id,img_path,overview,cat1_result,result
0,TEST_00000,./image/test/TEST_00000.jpg,신선한 재료로 정성을 다해 만들었다. 늘 변함없는 맛과 서비스로 모실것을 약속한다.,음식,한식
1,TEST_00001,./image/test/TEST_00001.jpg,"청청한 해역 등량만과 율포해수욕장이 한눈에 내려다 보이는 위치에 있으며, 막 잡은 ...",음식,일식
2,TEST_00002,./image/test/TEST_00002.jpg,장터설렁탕은 남녀노소 누구나 즐길 수 있는 전통 건강식으로 좋은 재료와 전통 조리방...,음식,한식
5,TEST_00005,./image/test/TEST_00005.jpg,30여 년의 세월이 느껴지는 실내 분위기가 냉면 맛을 더욱 살린다.,음식,한식
6,TEST_00006,./image/test/TEST_00006.jpg,코리달리스는 경기도 가평에 위치하고 있는 카페이다. 청명하고 맑은 호수 전경이 아름...,음식,바/까페


In [15]:
test_3.to_csv('data/tset_cat3_3.csv', index=False, encoding='UTF8')

In [16]:
def cat3_4():
    with strategy.scope():
        encoder = TFElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator", from_pt=True)

        input_layer = Input(shape=(650,), dtype=tf.int32, name="input_layer")
        sequence_output = encoder(input_layer)[0]

        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(53, activation='softmax')(cls_token)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = cat3_4()
model.load_weights('checkpoints/koelectra_cat3_detail_4/')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'electra.embeddings.position_ids']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2958f344850>

In [17]:
test_data = test_4['overview'].values.tolist()
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data)
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data, max_length=650, pad_to_max_length='left')

Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors


In [18]:
x_test = test_encoded_electra['input_ids']
test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(strategy.num_replicas_in_sync))
len(x_test)

2417

In [19]:
pred = model.predict(test_dataset, verbose=1)
pred_arg = pred.argmax(axis=1)



In [20]:
encoder = LabelEncoder()

encoder.fit(train_4['cat3'])
result_4 = encoder.inverse_transform(pred_arg)

test_4['result'] = result_4
test_4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_4['result'] = result_4


Unnamed: 0,id,img_path,overview,cat1_result,result
8,TEST_00008,./image/test/TEST_00008.jpg,정유재란(1597年) 당시 육전에서 패퇴한 왜군선봉장 宇喜多秀家(우끼다히데이)와 藤...,인문(문화/예술/역사),성
10,TEST_00010,./image/test/TEST_00010.jpg,토지를 한 눈에 본다. 토지문화관 토지문화관은 토지문화재단에서 학술·문화행사 및 연...,인문(문화/예술/역사),도서관
13,TEST_00013,./image/test/TEST_00013.jpg,화림선원은 재단법인 선학원 소속의 사찰이다. 경기도 안산시 상록구 일동 수리산(修理...,인문(문화/예술/역사),사찰
21,TEST_00021,./image/test/TEST_00021.jpg,"보령 해수욕장 관광특구는 보령을 대표하는 대천해수욕장과 무창포해수욕장, 죽도 관광지...",인문(문화/예술/역사),관광단지
25,TEST_00025,./image/test/TEST_00025.jpg,방랑시인 김삿갓이 생을 마쳤던 전남 화순지역의 대표적인 정자 물염정은 화순적벽에서 ...,인문(문화/예술/역사),유명건물


In [21]:
test_4.to_csv('data/tset_cat3_4.csv', index=False, encoding='UTF8')

In [22]:
def cat3_5():
    with strategy.scope():
        encoder = TFElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator", from_pt=True)

        input_layer = Input(shape=(650,), dtype=tf.int32, name="input_layer")
        sequence_output = encoder(input_layer)[0]

        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(9, activation='softmax')(cls_token)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = cat3_5()
model.load_weights('checkpoints/koelectra_cat3_detail_5/')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'electra.embeddings.position_ids']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x295a91b3d30>

In [23]:
test_data = test_5['overview'].values.tolist()
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data)
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data, max_length=650, pad_to_max_length='left')



In [24]:
x_test = test_encoded_electra['input_ids']
test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(strategy.num_replicas_in_sync))
len(x_test)

615

In [25]:
pred = model.predict(test_dataset, verbose=1)
pred_arg = pred.argmax(axis=1)



In [26]:
encoder = LabelEncoder()

encoder.fit(train_5['cat3'])
result_5 = encoder.inverse_transform(pred_arg)

test_5['result'] = result_5
test_5.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_5['result'] = result_5


Unnamed: 0,id,img_path,overview,cat1_result,result
20,TEST_00020,./image/test/TEST_00020.jpg,"라마다호텔&스위트평창은 고루포기산의 능경봉을 등지고 36,000㎡의 대지 주변으로 ...",숙박,한옥스테이
24,TEST_00024,./image/test/TEST_00024.jpg,#본 업소는 외국인관광 도시민박업으로 외국인만 이용이 가능하며 내국인은 이용할 수 ...,숙박,게스트하우스
42,TEST_00042,./image/test/TEST_00042.jpg,#본 업소는 외국인관광 도시민박업으로 외국인만 이용이 가능하며 내국인은 이용할 수 ...,숙박,게스트하우스
50,TEST_00050,./image/test/TEST_00050.jpg,충남에 위치한 칠갑산 샬레 호텔은 칠갑산의 자연 풍광이 한눈에 들어온다. 정문에서...,숙박,모텔
54,TEST_00054,./image/test/TEST_00054.jpg,"민통선 가까이 있는 휴식공간으로 강화군 송해면 숭뢰리 일대 5,000㎡부지에 객실 ...",숙박,펜션


In [27]:
test_5.to_csv('data/tset_cat3_5.csv', index=False, encoding='UTF8')

In [8]:
def cat3_6():
    with strategy.scope():
        encoder = TFElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator", from_pt=True)

        input_layer = Input(shape=(650,), dtype=tf.int32, name="input_layer")
        sequence_output = encoder(input_layer)[0]

        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(7, activation='softmax')(cls_token)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = cat3_6()
model.load_weights('checkpoints/koelectra_cat3_detail_6/')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'electra.embeddings.position_ids', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing TFElectraModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFElectraModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraModel for predictions without further train

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x29eb28dc040>

In [9]:
test_data = test_6['overview'].values.tolist()
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data)
test_encoded_electra = electra_tokenizer.batch_encode_plus(test_data, max_length=650, pad_to_max_length='left')

Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
x_test = test_encoded_electra['input_ids']
test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(strategy.num_replicas_in_sync))
len(x_test)

300

In [11]:
pred = model.predict(test_dataset, verbose=1)
pred_arg = pred.argmax(axis=1)



In [13]:
encoder = LabelEncoder()

encoder.fit(train_6['cat3'])
result_6 = encoder.inverse_transform(pred_arg)

test_6['result'] = result_6
test_6.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_6['result'] = result_6


Unnamed: 0,id,img_path,overview,cat1_result,result
9,TEST_00009,./image/test/TEST_00009.jpg,약 50여개의 점포가 있는 골목형 시장이다. 시장 내에 개성 있는 인테리어의 카레전...,쇼핑,상설시장
16,TEST_00016,./image/test/TEST_00016.jpg,부산 서구의 전통시장인 충무동 새벽시장은 부산 남항과 인접해 있다. 과거에는 새벽 ...,쇼핑,상설시장
56,TEST_00056,./image/test/TEST_00056.jpg,서울 영등포구 대림동에 위치한 대림중앙시장은 한국속 작은 중국이라 불릴 정도로 조선...,쇼핑,상설시장
70,TEST_00070,./image/test/TEST_00070.jpg,중앙유통단지는 공장에서 직접납품을 받아 판매하므로 중간단계의 운송비 및 물류비 절감...,쇼핑,전문상가
136,TEST_00136,./image/test/TEST_00136.jpg,전면적의 82%가 산인 진안은 오늘날 전국에서 인삼이 가장 많이 생산되는 고장이다....,쇼핑,특산물판매점


In [14]:
test_6.to_csv('data/tset_cat3_6.csv', index=False, encoding='UTF8')

In [16]:
test_1 = pd.read_csv('data/tset_cat3_1.csv')
test_2 = pd.read_csv('data/tset_cat3_2.csv')
test_3 = pd.read_csv('data/tset_cat3_3.csv')
test_4 = pd.read_csv('data/tset_cat3_4.csv')
test_5 = pd.read_csv('data/tset_cat3_5.csv')
test_6 = pd.read_csv('data/tset_cat3_6.csv')

In [20]:
result_df = pd.concat([test_1,test_2,test_3,test_4,test_5,test_6]).sort_values('id')

In [22]:
result_df.head()

Unnamed: 0,id,img_path,overview,cat1_result,result
0,TEST_00000,./image/test/TEST_00000.jpg,신선한 재료로 정성을 다해 만들었다. 늘 변함없는 맛과 서비스로 모실것을 약속한다.,음식,한식
1,TEST_00001,./image/test/TEST_00001.jpg,"청청한 해역 등량만과 율포해수욕장이 한눈에 내려다 보이는 위치에 있으며, 막 잡은 ...",음식,일식
2,TEST_00002,./image/test/TEST_00002.jpg,장터설렁탕은 남녀노소 누구나 즐길 수 있는 전통 건강식으로 좋은 재료와 전통 조리방...,음식,한식
0,TEST_00003,./image/test/TEST_00003.jpg,다양한 형태의 청소년수련활동을 제공함으로써 청소년들이 민주사회의 주역이 될 수 있도...,레포츠,수련시설
0,TEST_00004,./image/test/TEST_00004.jpg,팔공산은 경산시의 북쪽에 위치한 해발 1192.3 m의 높은 산으로 신라시대에는 중...,자연,국립공원


In [24]:
sub = pd.DataFrame({
    'id': result_df['id'],
    'cat3': result_df['result']
})

In [26]:
sub

Unnamed: 0,id,cat3
0,TEST_00000,한식
1,TEST_00001,일식
2,TEST_00002,한식
0,TEST_00003,수련시설
0,TEST_00004,국립공원
...,...,...
2106,TEST_07275,한식
2415,TEST_07276,공연장
1033,TEST_07277,"야영장,오토캠핑장"
614,TEST_07278,모텔


In [27]:
sub.to_csv('submission/third_sub.csv', index=False, encoding='UTF8')