In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

In [None]:
# Read the competition CSVs, keeping only the columns used further down.
# Training split: the raw text and its `target` label.
train_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv',
    dtype={'target': np.int64, 'text': str},
    usecols=['text', 'target'],
)

# Test split: the raw text and the row id (kept as a string).
test_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv',
    dtype={'id': str, 'text': str},
    usecols=['text', 'id'],
)

In [None]:
def print_1_0(d):
    """Print how many items of `d` equal 1 and how many do not.

    Every item that is not equal to 1 is reported under the ``0`` label,
    whatever its actual value.

    Args:
        d: Iterable of labels; it is traversed exactly once.

    Returns:
        None. The two counts are written to stdout.
    """
    is_one = [label == 1 for label in d]
    ones = sum(1 for flag in is_one if flag)
    others = len(is_one) - ones
    print('1 = ', ones)
    print('0 = ', others)

In [None]:
# Class balance of the training labels: count of 1s versus everything else.
print_1_0(train_data['target'])

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Convenience aliases for the model inputs and labels.
train_text, train_target = train_data['text'], train_data['target']

# The test frame is loaded with only `text` and `id`, so there is no target to alias.
test_text = test_data['text']

In [None]:
# Longest training text measured in whitespace-separated words
# (a reference point for choosing the padded sequence length).
train_data['text'].apply(lambda x:len(str(x).split())).max()

In [None]:
# Longest test text measured in whitespace-separated words.
test_data['text'].apply(lambda x:len(str(x).split())).max()

In [None]:
# Settings for turning raw text into fixed-length integer sequences.
max_len = 40            # length every sequence is padded to
padding_type = 'post'   # side on which padding is added
trunc_type = 'post'     # side on which over-long sequences would be cut
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

In [None]:
# Fit a Keras tokenizer on the training text only; `word_index` maps each
# word seen there to its integer id.
token = tf.keras.preprocessing.text.Tokenizer()
token.fit_on_texts(train_text.tolist())

word_index = token.word_index
# Vocabulary size (displayed as the cell output).
len(word_index)
    


In [None]:
# Map every text to a list of integer word ids using the vocabulary fitted
# on the training text.
xtrain_seq = token.texts_to_sequences(train_text)
xtest_seq = token.texts_to_sequences(test_text)

# Zero-pad (and, if needed, truncate) every sequence to exactly `max_len` ids.
# `truncating` is passed explicitly: without it pad_sequences cuts from the
# front ('pre'), which contradicts the configured `trunc_type`.
preproc_train = tf.keras.preprocessing.sequence.pad_sequences(
    xtrain_seq, maxlen=max_len, padding=padding_type, truncating=trunc_type
)
preproc_test = tf.keras.preprocessing.sequence.pad_sequences(
    xtest_seq, maxlen=max_len, padding=padding_type, truncating=trunc_type
)

In [None]:
# Load the pre-trained GloVe vectors into a {token: vector} dictionary.
embeddings_index = {}
glove_dim = 300  # number of coefficients at the end of every line

# `with` guarantees the (multi-GB) file is closed even if parsing fails.
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt', 'r', encoding='utf-8') as glove_file:
    for line in tqdm(glove_file):
        values = line.rstrip('\n').split(' ')
        if len(values) <= glove_dim:
            # Malformed/short line: no room for a token plus a full vector.
            continue
        # Some tokens in this file contain spaces, so take the vector from the
        # right-hand side and treat everything before it as the token.
        word = ' '.join(values[:-glove_dim])
        # float32 halves memory; Keras stores embedding weights as float32 anyway.
        coefs = np.asarray(values[-glove_dim:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


In [None]:
# Build the embedding lookup table for the dataset vocabulary: row `i` holds
# the pre-trained vector of the word whose tokenizer id is `i`. Words without
# a pre-trained vector (and the extra row 0) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for vocab_word, row in tqdm(word_index.items()):
    vector = embeddings_index.get(vocab_word)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
def build_model():
    """Build and compile the stacked bidirectional-LSTM classifier.

    Layers, in order: a frozen embedding initialised from ``embedding_matrix``;
    a bidirectional 300-unit LSTM returning the full sequence (input and
    recurrent dropout 0.4); a bidirectional 64-unit LSTM (recurrent dropout
    0.4); dropout 0.5; one sigmoid output unit.

    Reads the module-level ``word_index``, ``embedding_matrix`` and
    ``max_len``, and prints the model summary as a side effect.

    Returns:
        The compiled ``tf.keras.Sequential`` model (binary cross-entropy,
        Adam with learning rate 1e-4, accuracy metric).
    """
    layers = tf.keras.layers
    vocab_rows = len(word_index) + 1  # same row count as embedding_matrix

    embedding = layers.Embedding(
        vocab_rows,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    )
    sequence_lstm = layers.LSTM(
        300,
        dropout=0.4,
        recurrent_dropout=0.4,
        input_shape=[1, 300],
        return_sequences=True,
    )
    summary_lstm = layers.LSTM(64, recurrent_dropout=0.4)

    model = tf.keras.Sequential([
        embedding,
        layers.Bidirectional(sequence_lstm),
        layers.Bidirectional(summary_lstm),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    print(model.summary())
    return model

In [None]:
#Submission Two
def build_model():
    """Build and compile the bidirectional-LSTM classifier without a final dropout.

    Layers, in order: a frozen embedding initialised from ``embedding_matrix``;
    a bidirectional 256-unit LSTM returning the full sequence (input and
    recurrent dropout 0.4); a bidirectional 64-unit LSTM (recurrent dropout
    0.4); one sigmoid output unit. No dropout layer precedes the output.

    Reads the module-level ``word_index``, ``embedding_matrix`` and
    ``max_len``, and prints the model summary as a side effect.

    Returns:
        The compiled ``tf.keras.Sequential`` model (binary cross-entropy,
        Adam with learning rate 1e-4, accuracy metric).
    """
    layers = tf.keras.layers
    vocab_rows = len(word_index) + 1  # same row count as embedding_matrix

    embedding = layers.Embedding(
        vocab_rows,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    )
    sequence_lstm = layers.LSTM(
        256,
        dropout=0.4,
        recurrent_dropout=0.4,
        input_shape=[1, 300],
        return_sequences=True,
    )
    summary_lstm = layers.LSTM(64, recurrent_dropout=0.4)

    model = tf.keras.Sequential([
        embedding,
        layers.Bidirectional(sequence_lstm),
        layers.Bidirectional(summary_lstm),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    print(model.summary())
    return model

In [None]:
#Submission 3
def build_model():
    """Build and compile the bidirectional-LSTM classifier without LSTM dropout.

    Layers, in order: a frozen embedding initialised from ``embedding_matrix``;
    a bidirectional 300-unit LSTM returning the full sequence; a bidirectional
    64-unit LSTM; dropout 0.5; one sigmoid output unit. Neither LSTM uses
    input or recurrent dropout.

    Reads the module-level ``word_index``, ``embedding_matrix`` and
    ``max_len``, and prints the model summary as a side effect.

    Returns:
        The compiled ``tf.keras.Sequential`` model (binary cross-entropy,
        Adam with learning rate 1e-4, accuracy metric).
    """
    layers = tf.keras.layers
    vocab_rows = len(word_index) + 1  # same row count as embedding_matrix

    embedding = layers.Embedding(
        vocab_rows,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    )
    sequence_lstm = layers.LSTM(300, input_shape=[1, 300], return_sequences=True)
    summary_lstm = layers.LSTM(64)

    model = tf.keras.Sequential([
        embedding,
        layers.Bidirectional(sequence_lstm),
        layers.Bidirectional(summary_lstm),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    print(model.summary())
    return model

In [None]:
#Model 4
def build_model():
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    Layers, in order: a frozen embedding initialised from ``embedding_matrix``;
    a bidirectional 300-unit LSTM returning the full sequence (input and
    recurrent dropout 0.4); a bidirectional 64-unit LSTM (recurrent dropout
    0.4); dropout 0.5; one sigmoid output unit.

    Reads the module-level ``word_index``, ``embedding_matrix`` and
    ``max_len``, and prints the model summary as a side effect.

    Returns:
        The compiled ``tf.keras.Sequential`` model (binary cross-entropy,
        Adam with learning rate 1e-4, accuracy metric).
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
    model.add(tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, recurrent_dropout=0.4)))
    model.add(tf.keras.layers.Dropout(0.5))
    # The output must be a sigmoid: softmax normalises across units, so with a
    # single unit it always yields exactly 1.0 and the model cannot learn.
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy'],
    )
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [None]:
# Instantiate the model using whichever build_model definition was executed last.
model = build_model()

In [None]:
# Train on the full training set (no validation split); `H` keeps the
# returned training history.
H = model.fit(
    x=preproc_train,
    y=train_target,
    epochs=20,
    batch_size=32,
    verbose=1,
)

In [None]:
# Model outputs for the padded test sequences (one sigmoid value per row).
pred = model.predict(preproc_test)

In [None]:
# Flatten the per-row outputs into one vector, then round each value to a
# hard integer label for the submission.
flat_scores = np.concatenate(pred)
predictions = flat_scores.round().astype(int)



In [None]:


# Assemble the submission: one `target` column indexed by the test ids, with
# the index labelled `id` so it becomes the first CSV column.
submission = pd.DataFrame(
    data={'target': predictions},
    index=test_data['id'],
).rename_axis('id')
submission.to_csv('submission.csv')
submission.head()


In [None]:
# Display the submission frame for a visual check.
submission

# BERT BASED APPROACH

In [None]:
# Install the Hugging Face `transformers` package into the notebook environment.
# The version is not pinned, so what gets installed depends on when this runs.
!pip install transformers

In [None]:
# Loading Dependencies
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
#from kaggle_datasets import KaggleDatasets
import transformers

from tokenizers import BertWordPieceTokenizer

In [None]:
import pandas as pd
import math
import numpy as np
#from seqeval.metrics import f1_score
#from seqeval.metrics import classification_report,accuracy_score,f1_score
#import torch.nn.functional as F

In [None]:
#import torch
import os
from tqdm import tqdm,trange
#from torch.optim import Adam
#from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

In [None]:
# Choose the tf.distribute strategy: a TPU strategy when a TPU can be
# resolved, otherwise TensorFlow's default strategy.
try:
    # With no arguments the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU available in this session.
    tpu = None

if not tpu:
    # Default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Re-read the competition CSVs so this section starts from freshly loaded frames.
# Training split: the raw text and its `target` label.
train_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv',
    dtype={'target': np.int64, 'text': str},
    usecols=['text', 'target'],
)

# Test split: the raw text and the row id (kept as a string).
test_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv',
    dtype={'id': str, 'text': str},
    usecols=['text', 'id'],
)

In [None]:
# Preview the first ten training rows.
train_data.head(10)

In [None]:
# Preview the first ten test rows.
test_data.head(10)

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode texts into fixed-length sequences of token ids.

    The tokenizer is configured in place to truncate and pad every encoding
    to ``maxlen``, then the texts are encoded in chunks of ``chunk_size``.

    Args:
        texts: Sliceable collection of strings (a pandas Series, NumPy array
            or plain list).
        tokenizer: A ``tokenizers`` tokenizer exposing ``enable_truncation``,
            ``enable_padding`` and ``encode_batch``. Its truncation and padding
            settings are modified.
        chunk_size: Number of texts passed to each ``encode_batch`` call.
        maxlen: Length every encoded sequence is truncated/padded to.

    Returns:
        ``np.ndarray`` built from the list of per-text id lists.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        # tokenizers >= 0.8 names the padding length argument `length`.
        tokenizer.enable_padding(length=maxlen)
    except TypeError:
        # Older tokenizers releases only accept `max_length`.
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        # list() works for Series/arrays and for plain lists alike.
        text_chunk = list(texts[start:start + chunk_size])
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
# Training configuration for the BERT section.
MAX_LEN = 45    # token length every encoded text is truncated/padded to
EPOCHS = 3      # base epoch count
# Global batch size: 16 examples per replica of the active strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

# Let tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

print(BATCH_SIZE)

In [None]:
# Load the pretrained cased BERT tokenizer from the transformers library.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
# Save its files into the working directory; the next line reads 'vocab.txt'
# from there, so this call is presumably what creates it.
tokenizer.save_pretrained('.')
# Rebuild the tokenizer with the `tokenizers` library from that vocabulary
# file; lowercase=False keeps the text cased.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Display the tokenizer object as the cell output.
fast_tokenizer


In [None]:
# Encode both splits to MAX_LEN-long id sequences with the fast tokenizer.
# (No validation split is encoded in this notebook.)
x_train = fast_encode(train_data['text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(test_data['text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)



In [None]:
# Training labels as a NumPy array, row-aligned with x_train.
y_train = train_data['target'].values

In [None]:
# Training pipeline: repeats indefinitely, shuffles with a 2048-element
# buffer, batches and prefetches.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Disabled validation pipeline (there is no x_valid / y_valid in this notebook):
# valid_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices((x_valid, y_valid))
#     .batch(BATCH_SIZE)
#     .cache()
#     .prefetch(AUTO)
# )

# Inference pipeline: test inputs only, batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of a transformer encoder.

    Args:
        transformer: Callable Keras-compatible encoder (e.g. ``TFBertModel``)
            whose first output holds the per-token hidden states.
        max_len: Length of the input token-id sequences.

    Returns:
        A compiled ``Model`` mapping ``(batch, max_len)`` int32 token ids to a
        single sigmoid output, trained with binary cross-entropy and Adam
        (learning rate 1e-5), tracking accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    # Element 0 of the transformer output is indexed as (batch, position, hidden).
    sequence_output = transformer(input_word_ids)[0]
    # Use only the first position's hidden state (the [CLS] token for BERT
    # tokenizers) as the representation of the whole sequence.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
%%time
# Create the pretrained encoder and the classifier inside the strategy scope
# so their variables are placed according to the active strategy.
with strategy.scope():
    transformer_layer = transformers.TFBertModel.from_pretrained('bert-base-cased')
    model = build_model(transformer_layer, max_len=MAX_LEN)

model.summary()


In [None]:
# The training dataset repeats forever, so Keras needs an explicit number of
# steps per epoch: one pass over the training rows in whole batches.
n_steps = x_train.shape[0] // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    epochs=EPOCHS * 3,
    steps_per_epoch=n_steps,
    verbose=1,
)

In [None]:
# Model outputs for every batch of the test dataset (one sigmoid value per row).
predictions = model.predict(test_dataset, verbose=1)


In [None]:
# Flatten the per-row outputs and round them to hard integer labels.
# Note: this rebinds `predictions`, so the cell is meant to run once per predict call.
predictions = np.concatenate(predictions)
predictions = predictions.round().astype(int)

# Assemble the submission indexed by test id and write it to a CSV file.
submission = pd.DataFrame(
    data={'target': predictions},
    index=test_data['id'],
).rename_axis('id')
submission.to_csv('submission.csv')
submission.head()


In [None]:
# Inspect the first 50 predicted labels.
submission.head(50)

In [None]:
# Write the final submission file. The frame is called `submission` (there is
# no `sub`), and its index holds the test ids, so the index must be written.
submission.to_csv('submission.csv')
