In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import string
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

<h1>Data Preprocessing</h1>

In [None]:
# Read the train and test CSVs from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Preview each split: first rows, then the row count (train first, then test).
for split in (train, test):
    display(split.head())
    print(len(split))

In [None]:
# Count how often each value of the "keyword" column occurs in the training set.
keywords = train["keyword"].value_counts()
plt.grid()
# Data vectors are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
sns.barplot(x=keywords.index, y=keywords.values)
plt.title("Keywords")
print(keywords)

In [None]:
# Count how many training rows fall into each value of the "target" column.
x = train["target"].value_counts()
plt.grid()
# Data vectors are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
sns.barplot(x=x.index, y=x.values)
plt.title("Real or Not")
print(x)

The dataset has an almost balanced target variable.

In [None]:
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.util import ngrams

In [None]:
def create_corpus(target, data=None):
    """Collect every whitespace-separated token from the tweets of one class.

    Args:
        target: Label value to select (rows where ``data["target"] == target``).
        data: DataFrame with ``"text"`` and ``"target"`` columns. When omitted,
            the notebook-level ``train`` frame is used.

    Returns:
        A flat list of tokens in row order, duplicates included.
    """
    if data is None:
        data = train
    # Rows with missing text are skipped; they have no tokens to contribute.
    texts = data.loc[data["target"] == target, "text"].dropna()
    # No per-row print here: the original dumped every tweet's token list to
    # stdout, which was leftover debugging output.
    return [token for tokens in texts.str.split() for token in tokens]

In [None]:
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK's stopwords corpus; read by LemmaSentence.
stop_words=stopwords.words('english')

In [None]:
# Show the full stop-word list for inspection.
print(stop_words)

In [None]:
import random
from random import shuffle
# Seeds only Python's `random` module; NumPy and TensorFlow are not seeded by this call.
random.seed(1)

# import these modules 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
#cleaning up text
import re
def Preprocess_text(line):
    """Normalise a raw tweet to lowercase, letters-only, lemmatised text.

    Steps, in order:
      1. Drop URLs and HTML tags while their punctuation is still intact.
      2. Delete apostrophes (so "don't" becomes "dont") and lowercase.
      3. Replace every run of characters outside a-z with one space (this
         covers hyphens, tabs, newlines, digits and punctuation) and trim.
      4. Pass the result through ``LemmaSentence``.

    Args:
        line: Raw text of one tweet. An empty string is accepted.

    Returns:
        The cleaned string as returned by ``LemmaSentence``.
    """
    # URLs and tags must go first: once punctuation is blanked out below, a
    # link such as "http://t.co/abc" survives as the junk tokens
    # "http t co abc" and no later URL filter can recognise it.
    line = re.sub(r'https?://\S+|www\.\S+', ' ', line)
    line = re.sub(r'<.*?>', ' ', line)

    line = line.replace("’", "").replace("'", "")
    line = line.lower()

    # One regex pass instead of rebuilding the string character by character.
    # strip() also makes empty input safe (the original indexed clean_line[0]).
    clean_line = re.sub(r'[^a-z]+', ' ', line).strip()

    # Removing stop words and convert words to base forms
    return LemmaSentence(clean_line)

def LemmaSentence(sentence):
    """Drop English stop words from a sentence and lemmatise what remains.

    Args:
        sentence: Text to process; it is tokenised with ``word_tokenize``.

    Returns:
        The surviving tokens, each passed through
        ``WordNetLemmatizer.lemmatize``, joined by single spaces in their
        original order.
    """
    # The notebook-level ``stop_words`` is a list; a set gives O(1) lookups.
    stop_set = set(stop_words)
    # One lemmatizer per call rather than one per token.
    lemmatizer = WordNetLemmatizer()
    # The original built a stop-word-filtered list but then looped over the
    # unfiltered tokens, so nothing was ever removed. Filter directly instead
    # of via set(), which would also lose word order and repeated words.
    kept = [word for word in word_tokenize(sentence) if word not in stop_set]
    return " ".join(lemmatizer.lemmatize(word) for word in kept)

Checking an example text

In [None]:
# Raw text of the row labelled 0.
train.loc[0, 'text']

Checking the same text after preprocessing

In [None]:
# The same row after running it through Preprocess_text.
Preprocess_text(train.loc[0, 'text'])

In [None]:
# Replace the training text column with its preprocessed version.
train['text'] = train['text'].apply(Preprocess_text)

In [None]:
# Replace the test text column with its preprocessed version.
test['text'] = test['text'].apply(Preprocess_text)

In [None]:
import re
def remove_url(text):
    """Return ``text`` with ``http(s)://...`` links and ``www.``-prefixed tokens deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) deleted."""
    return re.sub(r'<.*?>', r'', text)

# Character class of emoji and pictographic symbols. The original class ended
# with the single range U+24C2..U+1F251, which also swallows CJK ideographs,
# kana, Hangul and everything else in between; it is replaced here by the
# narrow blocks that actually hold symbols.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U0000FE0F"             # variation selector left behind by emoji sequences
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Letters of any script are left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` without the characters matched by ``_EMOJI_PATTERN``.
    """
    return _EMOJI_PATTERN.sub(r'', text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop it.
    drop = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop)


# Apply the same cleaning steps, in the same order, to both splits. The
# original ran punctuation removal before HTML removal on `test` only; with
# "<" and ">" already stripped, remove_html could no longer match a tag there,
# so train and test were cleaned differently.
for frame in (train, test):
    for cleaner in (remove_url, remove_html, remove_emoji, remove_punctuation):
        frame["text"] = frame["text"].apply(cleaner)

In [None]:
# Model input is the cleaned text column; labels are the target column as a NumPy array.
x = train["text"]
y = np.asarray(train["target"].tolist())

In [None]:
def bert_encode(text, tokenizer, max_len=None):
  """Tokenise texts into the three id tensors a BERT encoder consumes.

  Each text becomes ``[CLS] <word pieces> [SEP]``.

  Args:
    text: Iterable of strings (e.g. a pandas Series).
    tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
    max_len: Optional fixed sequence length. When given, every row is
      truncated (keeping ``[CLS]`` and ``[SEP]``) and zero-padded to exactly
      ``max_len`` ids; pass the length the downstream model expects if it has
      a fixed one. When ``None`` (default) rows are padded to the longest row
      in ``text``.

  Returns:
    Dict with the tensors ``input_word_ids``, ``input_mask`` (1 for real
    tokens, 0 for padding) and ``input_type_ids`` (all zeros), all of the
    same padded shape.

  Raises:
    ValueError: If ``max_len`` is given and smaller than 2.
  """
  if max_len is not None and max_len < 2:
    raise ValueError("max_len must leave room for [CLS] and [SEP]")

  cls_id, sep_id = tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])
  # Word pieces allowed per row once [CLS] and [SEP] are accounted for.
  budget = None if max_len is None else max_len - 2

  # Encode with the tokenizer that was passed in. The original delegated to
  # encode_sentence, which reads the notebook-level `tokenizer`, so this
  # argument was ignored for everything except the [CLS] id.
  rows = []
  for s in np.asarray(text):
    tokens = list(tokenizer.tokenize(s))
    if budget is not None:
      tokens = tokens[:budget]
    rows.append([cls_id] + list(tokenizer.convert_tokens_to_ids(tokens)) + [sep_id])

  # ragged_rank=1 keeps this a RaggedTensor even when `rows` is empty.
  input_word_ids = tf.ragged.constant(rows, dtype=tf.int32, ragged_rank=1)
  # Ragged ones/zeros turn into 1/0 after to_tensor() pads with zeros, so the
  # mask marks exactly the real tokens.
  input_mask = tf.ones_like(input_word_ids)
  input_type_ids = tf.zeros_like(input_word_ids)

  dense_shape = None if max_len is None else [None, max_len]
  inputs = {
      'input_word_ids': input_word_ids.to_tensor(shape=dense_shape),
      'input_mask': input_mask.to_tensor(shape=dense_shape),
      'input_type_ids': input_type_ids.to_tensor(shape=dense_shape)}

  return inputs

In [None]:
def encode_sentence(s):
    """Tokenise ``s`` with the notebook-level ``tokenizer``, append ``[SEP]`` and return the token ids."""
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

In [None]:
# Placeholder value, not a real credential; it is set only to suppress a warning.
os.environ["WANDB_API_KEY"] = "0" ## to silence warning

In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf
# Checkpoint name shared by the tokenizer here and the encoder loaded in build_model.
# NOTE(review): Preprocess_text lowercases the text, but this is a cased
# checkpoint — confirm that combination is intended.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Encode the training text (`x`) into the dict of input tensors for the model.
train_input = bert_encode(x, tokenizer)

In [None]:
# Nominal sequence length. It is not used to pin the input layers by default:
# bert_encode pads each batch to its own longest sequence, which generally
# differs from this value. Pass seq_len=max_len to build_model only when the
# encoded inputs are padded/truncated to exactly this length.
max_len = 30

def build_model(seq_len=None, num_classes=2):
    """Build and compile a BERT-based text classifier.

    The encoder's first output at position 0 (where bert_encode places
    ``[CLS]``) feeds a softmax layer.

    Args:
        seq_len: Length of the token axis of the three inputs. ``None``
            (default) accepts any length, so the differently padded train and
            test tensors both fit. The original fixed this at ``max_len``,
            which does not match what bert_encode produces.
        num_classes: Width of the softmax output. Defaults to 2 because the
            labels are the 0/1 ``target`` column; the original used 3, which
            allowed a class that cannot occur to be predicted.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are named
        ``input_word_ids``, ``input_mask`` and ``input_type_ids``.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    # Input names match the keys of the dict returned by bert_encode.
    token_inputs = [
        tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name=name)
        for name in ("input_word_ids", "input_mask", "input_type_ids")
    ]

    embedding = bert_encoder(token_inputs)[0]
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=token_inputs, outputs=output)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
import tensorflow as tf
# Try to connect to a TPU and build a TPU strategy; if the resolver raises
# ValueError, fall back to whatever tf.distribute.get_strategy() returns.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): the `experimental` TPUStrategy alias looks deprecated in
    # newer TensorFlow releases — confirm against the installed version.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() 

In [None]:
# Display which distribution strategy was selected.
strategy

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Build and train under the selected distribution strategy.
with strategy.scope():
    model = build_model()
    model.summary()
    # restore_best_weights: without it the model keeps the weights of the last
    # epoch run, which can be several epochs past the best validation accuracy,
    # and those weights would then be used for the test predictions.
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        mode='max',
        patience=5,
        min_delta=0.01,
        restore_best_weights=True,
    )
    # The last 20% of the training rows are held out for validation.
    model.fit(
        train_input,
        y,
        epochs=10,
        verbose=1,
        batch_size=128,
        validation_split=0.2,
        callbacks=[early_stopping],
    )

In [None]:
# Encode the cleaned test text with the same tokenizer used for training.
test_input=bert_encode(test["text"], tokenizer)

In [None]:
# Index of the highest-scoring class for every test row.
predictions = list(np.argmax(model.predict(test_input), axis=1))

In [None]:
# Two-column frame: the test ids alongside the predicted class for each row.
submission = pd.DataFrame({'id': test['id'], 'target': predictions})

In [None]:
# Preview the first rows of the submission frame before writing it out.
submission.head()

In [None]:
# Write the submission to the working directory, with a header row and without the index column.
submission.to_csv('submission.csv', header=True, index=False) 