In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, TFBertModel

In [2]:
df = pd.read_csv('/kaggle/input/amazon-ml/train.csv')

In [3]:
df.dropna(subset=['DESCRIPTION', 'BULLET_POINTS'], inplace=True)

In [4]:
df = df.sample(n = 1000, random_state = 7)

In [5]:
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
%%time

train_inputs = tokenizer(list(train_df['TITLE'] + train_df['BULLET_POINTS'] + train_df['DESCRIPTION']), 
                          padding=True, truncation=True, max_length=512, 
                          return_tensors='tf')

val_inputs = tokenizer(list(val_df['TITLE'] + val_df['BULLET_POINTS'] + val_df['DESCRIPTION']), 
                        padding=True, truncation=True, max_length=512, 
                        return_tensors='tf')


CPU times: user 8.34 s, sys: 676 ms, total: 9.01 s
Wall time: 11.4 s


In [8]:
train_outputs = np.array(train_df['PRODUCT_LENGTH'])
val_outputs = np.array(val_df['PRODUCT_LENGTH'])

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# train_outputs_scaled = scaler.fit_transform(train_outputs.reshape(-1, 1)).flatten()
# val_outputs_scaled = scaler.transform(val_outputs.reshape(-1, 1)).flatten()


In [9]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
train_outputs_scaled = scaler.fit_transform(train_outputs.reshape(-1, 1)).flatten()
val_outputs_scaled = scaler.transform(val_outputs.reshape(-1, 1)).flatten()

In [10]:
val_outputs_scaled

array([-5.04032258e-02, -6.51209677e-01, -3.95161290e-01,  5.99324598e+00,
        6.67439518e-01,  2.11693548e+00,  5.54435484e-01, -1.51209677e-01,
        1.00806452e-01, -2.52016129e-01, -1.51209677e-01, -8.56854839e-01,
        1.65322581e+00, -4.03225807e-01,  1.15020161e+00, -2.27449596e-01,
       -5.23185484e-01, -1.39112903e-01,  1.00292340e-01, -4.03225807e-01,
        1.11895161e+01, -8.68850807e-01,  6.29032260e-01,  9.49092744e-01,
       -2.52016129e-01, -4.28447580e-01, -1.51209677e-01,  6.55241936e-01,
        1.51209677e-01,  4.36995969e-01,  6.04838710e-01,  4.98991936e-01,
       -1.00806452e-01, -5.04032258e-01, -6.55241936e-01, -6.04838710e-01,
       -4.33568548e-01,  6.29032260e-01, -1.00806452e-01,  2.44959679e-01,
       -7.05645162e-01,  0.00000000e+00,  5.01008066e-01, -2.03124999e-01,
       -3.95161290e-01, -2.67137096e-01, -1.51209677e-01,  5.04032258e-01,
        4.36995969e-01, -5.54435484e-01,  1.10887097e+00, -5.04032258e-01,
        4.08568549e+00, -

In [11]:
train_inputs = (train_inputs['input_ids'].numpy(), train_inputs['token_type_ids'].numpy(), train_inputs['attention_mask'].numpy())
val_inputs = (val_inputs['input_ids'].numpy(), val_inputs['token_type_ids'].numpy(), val_inputs['attention_mask'].numpy())
    
train_outputs = np.array(train_df['PRODUCT_LENGTH'])
val_outputs = np.array(val_df['PRODUCT_LENGTH'])

In [31]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

input_ids = tf.keras.layers.Input(shape=(512,), dtype='int32')
token_type_ids = tf.keras.layers.Input(shape=(512,), dtype='int32')
attention_mask = tf.keras.layers.Input(shape=(512,), dtype='int32')

# embedding = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[1]
# output = tf.keras.layers.Dense(1, activation='linear')(embedding)

embedding = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[1]
dense1 = tf.keras.layers.Dense(256, activation='linear')(embedding)
dense2 = tf.keras.layers.Dense(128, activation='linear')(dense1)
output = tf.keras.layers.Dense(1, activation='linear')(dense2)


# embedding = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[1]
# dense1 = tf.keras.layers.Dense(64, activation='relu')(embedding)
# dense2 = tf.keras.layers.Dense(64, activation='relu')(dense1)
# output = tf.keras.layers.Dense(1, activation='linear')(dense2)


model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=output)



Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [32]:
# optimizer = tf.keras.optimizers.Adam(lr=0.00001)
optimizer = tf.keras.optimizers.Adam(lr=0.001)
model.compile(loss='mse', optimizer=optimizer)

In [33]:
model.fit(train_inputs, train_outputs_scaled, validation_data=(val_inputs, val_outputs_scaled), 
          epochs=3, batch_size=8)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7b2beb0b26d0>

In [34]:
val_pred_scaled = model.predict(val_inputs)
val_pred = scaler.inverse_transform(val_pred_scaled.reshape(-1, 1)).flatten()
val_rmse = np.sqrt(np.mean(np.square(val_pred - val_outputs)))
print('Validation RMSE:', val_rmse)

Validation RMSE: 3907.9151295427105


In [39]:
train_pred_scaled = model.predict(train_inputs)
train_pred = scaler.inverse_transform(train_pred_scaled.reshape(-1, 1)).flatten()



In [40]:
submission_df = train_df[['PRODUCT_ID']].copy()
submission_df['PRODUCT_LENGTH'] = train_pred
submission_df.to_csv('submission_train.csv', index=False)

print(submission_df.shape) # (734736, 2)


(800, 2)


In [35]:
test_df = pd.read_csv('/kaggle/input/amazon-ml/test.csv')

test_df.dropna(subset=['DESCRIPTION', 'BULLET_POINTS'], inplace=True)

test_df = test_df.sample(n=1000, random_state = 42)

In [36]:
%%time 

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

test_inputs = tokenizer(list(test_df['TITLE'] + test_df['BULLET_POINTS'] + test_df['DESCRIPTION']), 
                          padding=True, truncation=True, max_length=512, 
                          return_tensors='tf')

test_inputs = (test_inputs['input_ids'].numpy(), test_inputs['token_type_ids'].numpy(), test_inputs['attention_mask'].numpy())


CPU times: user 6.86 s, sys: 8.54 ms, total: 6.87 s
Wall time: 7.04 s


In [37]:
test_pred_scaled = model.predict(test_inputs)
test_pred = scaler.inverse_transform(test_pred_scaled.reshape(-1, 1)).flatten()



In [38]:
submission_df = test_df[['PRODUCT_ID']].copy()
submission_df['PRODUCT_LENGTH'] = test_pred
submission_df.to_csv('submission_test.csv', index=False)

print(submission_df.shape) # (734736, 2)


(1000, 2)
