In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Download the BERT `tokenization.py` helper from the TensorFlow Models repository
# so that the `import tokenization` in the imports cell can find it.
# NOTE(review): the URL tracks the `master` branch, so the file may change or move;
# consider pinning it to a specific commit or tag.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
# Importing Necessary Libraries
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import tensorflow_hub as hub
import tokenization
from tensorflow.keras.layers import Dense, Input, Bidirectional, SpatialDropout1D, Embedding, add, concatenate
from tensorflow.keras.layers import GRU, GlobalAveragePooling1D, LSTM, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.models import Model
import tensorflow as tf
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS 
from sklearn.feature_extraction.text import TfidfVectorizer

Special thanks to **Martin Görner** for providing the code to set up TPUs on Kaggle. The next cell uses that code.

In [None]:
# Choose the tf.distribute strategy that matches the hardware we are running on.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable is set
    # (always the case on Kaggle); a ValueError is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default TensorFlow strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Load the labelled training tweets, the unlabelled test tweets and the
# sample submission file, in that order.
df, test, submission = map(pd.read_csv, (
    '../input/nlp-getting-started/train.csv',
    '../input/nlp-getting-started/test.csv',
    '../input/nlp-getting-started/sample_submission.csv',
))

In [None]:
# Display the training DataFrame as this cell's output.
df

In [None]:
# Per-column flag: True when the column holds at least one missing value.
df.isna().any()

# Data Visualisation

In [None]:
# Seeing the target Value
# The notebook treats `target` as a two-class label (0 / 1), so a labelled bar
# chart of class counts is shown instead of a 10-bin histogram without any
# title or axis labels.
ax = df['target'].value_counts().sort_index().plot.bar(rot=0)
ax.set(title='Target distribution', xlabel='target', ylabel='Number of tweets')
plt.show()

In [None]:
# Word cloud of the tweets labelled as disasters (target == 1).
stopwords = set(STOPWORDS)

# Lower-case every whitespace-separated token and join everything in one pass.
# The result is the same string the previous nested loops produced (a leading
# space, then each token followed by a space), but without re-copying the
# ever-growing string on every `+`.
comment_words = ' ' + ''.join(
    token.lower() + ' '
    for tweet in df.loc[df.target == 1].text
    for token in tweet.split()
)

wordcloud = WordCloud(width=1000, height=1000,
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

plt.figure(figsize=(15, 15))
plt.imshow(wordcloud)
plt.title('Most used word in Disaster Tweet (Target = 1)')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the tweets labelled as non-disasters (target == 0).
stopwords = set(STOPWORDS)

# Lower-case every whitespace-separated token and join everything in one pass.
# The result is the same string the previous nested loops produced (a leading
# space, then each token followed by a space), but without re-copying the
# ever-growing string on every `+`.
comment_words = ' ' + ''.join(
    token.lower() + ' '
    for tweet in df.loc[df.target == 0].text
    for token in tweet.split()
)

wordcloud = WordCloud(width=1000, height=1000,
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

plt.figure(figsize=(15, 15))
plt.imshow(wordcloud)
plt.title('Most used word in Non Disaster Tweet (Target = 0)')
plt.axis("off")
plt.show()

# Splitting the dataset

In [None]:
# Hold out 20% of the labelled tweets for validation.
# Columns are selected by name rather than by position, and the split is seeded
# so that re-running the notebook reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=0
)

# TF-IDF

In [None]:
def tfidf(data):
    """Fit a new TfidfVectorizer on `data` and vectorize it.

    Args:
        data: iterable of raw text documents passed to `fit_transform`.

    Returns:
        A `(matrix, vectorizer)` tuple: the TF-IDF matrix of `data` and the
        fitted vectorizer, so the caller can `transform` other text with the
        same vocabulary.
    """
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(data), vectorizer

# Fit the vocabulary on the training split only ...
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
# ... then reuse the fitted vectorizer for the validation split and for the
# text column (position 3) of the competition test file.
X_test_tfidf, submission_tfidf = (
    tfidf_vectorizer.transform(texts) for texts in (X_test, test.iloc[:, 3])
)

In [None]:
# Training the XGBoost Model
# Only arguments that carry an actual value are passed. The earlier call also
# passed `missing=None`, `nthread=None`, `seed=None` and `silent=None`, which
# selected nothing but the defaults; `silent`, `nthread` and `seed` are
# deprecated aliases (of `verbosity`, `n_jobs` and `random_state`) that later
# xgboost releases no longer list as estimator parameters.
model = XGBClassifier(
    base_score=0.5, booster='gbtree',
    colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
    gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
    min_child_weight=1, n_estimators=100, n_jobs=-1,
    objective='binary:logistic', random_state=0,
    reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
    subsample=1, verbosity=1,
)
model.fit(X_train_tfidf, y_train)

In [None]:
# Evaluating the model in test data
# The data scored here is the held-out split produced by train_test_split
# (X_test / y_test), not the competition test file.
# NOTE(review): presumably `score` returns mean accuracy through the scikit-learn
# classifier API — confirm for the installed xgboost version.
model.score(X_test_tfidf, y_test)

In [None]:
# Making the predictions
# `submission_tfidf` holds the TF-IDF features of the competition test tweets.
predictions = model.predict(submission_tfidf)

In [None]:
# Write the XGBoost predictions into the sample-submission frame (in place) and
# display it. Note: the BERT section at the end of the notebook overwrites this
# same column before saving its own submission file.
submission['target'] = predictions
submission

In [None]:
# Save the TF-IDF + XGBoost submission; index=False keeps the row index out of the CSV.
submission.to_csv('tfidf_submission.csv', index=False)

# BERT

I am also very new to BERT, and [**this**](https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub) kernel helped me a lot in implementing BERT for text classification. Special thanks to **xhlulu** for providing this nice kernel.

In [None]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
def bert_encoder(texts, tokenizer, max_len=512):
    """Turn raw texts into the three fixed-length integer inputs fed to BERT.

    Each text is tokenized, truncated to leave room for the two special
    tokens, wrapped in "[CLS]" ... "[SEP]" and right-padded with zeros up to
    `max_len`.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        max_len: length of every encoded row, special tokens included.

    Returns:
        A tuple `(token_ids, masks, segment_ids)` of numpy arrays with one row
        per text: the token ids, a mask that is 1 on real tokens and 0 on
        padding, and all-zero segment ids.
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # Keep at most max_len - 2 word pieces so [CLS] and [SEP] still fit.
        word_pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        real_len = len(sequence)
        padding = [0] * (max_len - real_len)

        token_ids = tokenizer.convert_tokens_to_ids(sequence)
        token_ids.extend(padding)

        token_rows.append(token_ids)
        mask_rows.append([1] * real_len + padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
# Model architecture: BERT sequence output -> 2 x BiGRU -> pooled residual head
def build_model(bert_layer, max_len=512):
    """Build and compile the BERT + bidirectional-GRU binary classifier.

    The whole graph is created inside `strategy.scope()`. GRU and Dense widths
    are read from the module-level `LSTM_UNITS` and `DENSE_HIDDEN_UNITS`, which
    must be defined before this function is called.

    Args:
        bert_layer: callable layer taking `[word_ids, mask, segment_ids]` and
            returning two outputs; the second one is used as the per-token
            sequence output.
        max_len: length of each of the three int32 input sequences.

    Returns:
        A Keras `Model` with a single sigmoid output, compiled with the 'adam'
        optimizer, binary cross-entropy loss and the accuracy metric.
    """
    with strategy.scope():
        bert_inputs = [
            Input(shape=(max_len,), dtype=tf.int32, name=input_name)
            for input_name in ("input_word_ids", "input_mask", "segment_ids")
        ]

        _, sequence_output = bert_layer(bert_inputs)

        features = SpatialDropout1D(0.3)(sequence_output)
        for _gru_idx in range(2):
            features = Bidirectional(GRU(LSTM_UNITS, return_sequences=True))(features)

        # Summarise the token sequence with both max- and average-pooling.
        max_pooled = GlobalMaxPooling1D()(features)
        avg_pooled = GlobalAveragePooling1D()(features)
        hidden = concatenate([max_pooled, avg_pooled])

        # Two residual Dense blocks.
        # NOTE(review): the skip connection adds tensors element-wise, so the
        # pooled width presumably has to equal DENSE_HIDDEN_UNITS — confirm
        # whenever either constant is changed.
        for _dense_idx in range(2):
            projected = Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
            hidden = add([hidden, projected])

        regularised = tf.keras.layers.Dropout(0.2)(hidden)
        result = Dense(1, activation='sigmoid')(regularised)

        model = Model(inputs=bert_inputs, outputs=result)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Hyper-parameters; build_model() reads LSTM_UNITS and DENSE_HIDDEN_UNITS as globals.
BATCH_SIZE = 16  # re-declared with the same value in the training cell, where it is used
LSTM_UNITS = 64  # units of each GRU wrapped in Bidirectional inside build_model()
EPOCHS = 10  # NOTE(review): reassigned to 1 in the training cell before it is ever read
DENSE_HIDDEN_UNITS = 256  # width of the two residual Dense layers in build_model()
# Build a first model instance with 160-token inputs; the training cell later
# calls build_model() again rather than reusing this instance.
model = build_model(bert_layer, max_len=160)

In [None]:
# Print the layer-by-layer summary of the model built in the previous cell.
model.summary()

In [None]:
# Draw the model graph and write it to model.png.
tf.keras.utils.plot_model(model, to_file='model.png')

In [None]:
# Build the tokenizer from the assets bundled with the hub module, then encode
# the text column (position 3) of the training and the test frames.
resolved_bert = bert_layer.resolved_object
vocab_file = resolved_bert.vocab_file.asset_path.numpy()
do_lower_case = resolved_bert.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

train_input, test_input = (
    bert_encoder(frame.iloc[:, 3], tokenizer, max_len=160)
    for frame in (df, test)
)

In [None]:
# Training the model
import gc

# Ensemble size and hyper-parameters (build_model() reads LSTM_UNITS and
# DENSE_HIDDEN_UNITS as globals).
NUM_MODELS = 1
BATCH_SIZE = 16
LSTM_UNITS = 64
EPOCHS = 1
DENSE_HIDDEN_UNITS = 256

# One test-set prediction vector per (model, outer round), plus its ensemble weight.
checkpoint_predictions, checkpoint_val_pred, weights = [], [], []

for model_idx in range(NUM_MODELS):
    model = build_model(bert_layer, max_len=160)
    for global_epoch in range(EPOCHS):
        # The schedule ignores Keras' own epoch counter: the learning rate is
        # constant within one fit() call and shrinks by 0.6 per outer round.
        lr_schedule = LearningRateScheduler(lambda epoch: 2e-6 * (0.6 ** global_epoch))
        history = model.fit(
            train_input,
            df.iloc[:, 4],
            batch_size=BATCH_SIZE,
            epochs=7,
            verbose=1,
            callbacks=[lr_schedule],
        )
        test_probabilities = model.predict(test_input, batch_size=32)
        checkpoint_predictions.append(test_probabilities.flatten())
        # Later rounds weigh exponentially more in the final average.
        weights.append(2 ** global_epoch)
    # Drop the model and collect garbage before the next ensemble member.
    del model
    gc.collect()

In [None]:
# Training accuracy per epoch of the most recent model.fit() call.
# Axis labels are added so the figure stands on its own, and plt.show() keeps
# the stray Text repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.set(title="Accuracy", xlabel="Epoch", ylabel="Training accuracy")
plt.show()

In [None]:
# Training loss per epoch of the most recent model.fit() call.
# Axis labels are added so the figure stands on its own, and plt.show() keeps
# the stray Text repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.set(title="Loss", xlabel="Epoch", ylabel="Training loss")
plt.show()

In [None]:
# Weighted average of the collected prediction vectors, rounded to 0/1 labels
# and written out as the BERT submission file.
stacked_predictions = np.asarray(checkpoint_predictions)
test_pred = np.average(stacked_predictions, axis=0, weights=weights)
submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission.csv', index=False)