In [19]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd

In [20]:
# Training-side inputs: the base training table and the extra 'kf' essay table
# that a later cell samples from to augment it.
train_df = pd.read_csv("../train_v2_drcat_02.csv")
kf_df = pd.read_csv('../kf_df.csv')

# Inference-side inputs: the essays to score and the submission template whose
# 'generated' column is filled in by the final cell.
test_df = pd.read_csv('../test_essays.csv')
submission_df = pd.read_csv('../sample_submission.csv')

In [21]:
# Bring kf_df in line with the training table: rename its prompt column and add
# the constant columns that the column selection `kf_df[train_df.columns]` in a
# later cell relies on. Every kf row is labelled 1 and tagged with source 'kf'.
kf_df = (
    kf_df
    .rename(columns={'prompt_title': 'prompt_name'})
    .assign(label=1, source='kf', RDizzl3_seven=False)
)

In [22]:
# Augment the training set with up to 30,000 rows drawn from kf_df, restricted
# to train_df's columns so the two frames line up when concatenated.
# The request is capped at len(kf_df): DataFrame.sample draws without
# replacement by default and raises ValueError when asked for more rows than
# the frame holds. With 30,000 or more rows available the draw is unchanged.
kf_sample = kf_df[train_df.columns].sample(
    n=min(30000, len(kf_df)),
    random_state=42,  # fixed seed so the same rows are drawn on every run
)
train_df = pd.concat([train_df, kf_sample])

In [23]:
# Keep the first occurrence of each distinct essay text, then renumber the rows
# 0..n-1 so the index is contiguous again after the concat and the drop.
train_df = (
    train_df
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [24]:
# TF-Hub layer that turns raw strings into the encoder's input dict
# (input_word_ids / input_mask / input_type_ids, as shown by model.summary()).
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
)

# TF-Hub BERT encoder (uncased English, L-12 / H-768 / A-12 per the handle);
# downstream cells read its 'pooled_output' entry.
bert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
)

In [25]:
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding for each input sentence.

    Args:
        sentences: Batch of raw strings accepted by ``bert_preprocess``
            (e.g. a list of str).

    Returns:
        The ``'pooled_output'`` entry of the encoder's output dict: one
        embedding vector per sentence.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

# Smoke test: embed two sample sentences and display the resulting tensor.
get_sentence_embeding(
    sentences=[
        "500$ discount. hurry up",
        "Bhavin, are you up for a volleybal game tomorrow?",
    ]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351707, -0.5132727 , -0.8884574 , ..., -0.7474886 ,
        -0.7531475 ,  0.91964495],
       [-0.8720836 , -0.5054399 , -0.9444669 , ..., -0.85847527,
        -0.71745366,  0.8808299 ]], dtype=float32)>

In [26]:
# --- BERT front end -------------------------------------------------------
# A scalar string per example goes through preprocessing and then the encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# --- Classification head --------------------------------------------------
# Dropout on the pooled sentence embedding, then a single sigmoid unit that
# emits one probability per example.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = dropout_layer(outputs['pooled_output'])
l = output_layer(l)

# --- Assemble the end-to-end model (raw text in, probability out) ----------
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [27]:
# Print the layer-by-layer table for `model` (layer type, output shape,
# parameter count, and which layer each one is connected to).
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [28]:
# Metrics reported during training and evaluation of the binary classifier.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Binary cross-entropy pairs with the single sigmoid output unit of `model`.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [ ]:
# Train on the raw essay text against the 0/1 label column for 10 epochs.
# The captured progress bar for this cell shows an ETA of about two hours
# partway through the first epoch.
model.fit(
    x=train_df['text'],
    y=train_df['label'],
    epochs=10,
)

Epoch 1/10
  66/2340 [..............................] - ETA: 2:07:31 - loss: 0.5935 - accuracy: 0.6742 - precision: 0.6942 - recall: 0.8676

In [ ]:
# Score the test essays. The model has a single scalar-string input and was
# trained on train_df['text'], so only the 'text' column is passed here;
# handing it the whole DataFrame would feed it every column of test_df.
predictions = model.predict(test_df['text'])

# The Dense(1) sigmoid head yields an array of shape (n, 1); flatten it to a
# 1-D vector so it is stored as an ordinary numeric column.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as test_essays.csv, since the assignment is positional — confirm.
submission_df['generated'] = predictions.ravel()
submission_df.to_csv('../submission.csv', index=False)