In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2, l1_l2

from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test CSVs from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Replace every missing value in both frames with the literal string "NA".
train_df, test_df = train_df.fillna("NA"), test_df.fillna("NA")

In [None]:
# Preview the first three training rows.
train_df.head(n=3)

In [None]:
# Pull the tweet text of both splits, and the training labels, out of the
# frames as plain Python lists.
train_texts = [*train_df['text']]
test_texts = [*test_df['text']]

targets = [*train_df['target']]

In [None]:
# Report the size of each split, then how many training tweets carry
# target 1 and target 0.
print(f"Number of tweets in train set : {len(train_df)}")
print(f"Number of tweets in test set : {len(test_df)}")
print()
print(f"Number of train tweets which have a target of 1: {len(train_df[train_df['target'] == 1])}")
print(f"Number of train tweets which have a target of 0: {len(train_df[train_df['target'] == 0])}")

#### There is no major class imbalance in the train set: tweets with a target of 0 make up 57% of it, and tweets with a target of 1 make up 43%.

In [None]:
# Print the raw text of the first five training tweets, one per line.
for tweet in train_df['text'].head(5):
    print(tweet)

#### The texts, however, are not clean: they contain special characters, quotes, commas, etc. We need to clean the text before converting it to a sequence of integers. The following preprocessing functions do that.

## Preprocessing functions

In [None]:
import re
import string
from nltk.corpus import stopwords


def strip_and_lowercase(lines):
    """Normalise whitespace and case of every string in ``lines``, in place.

    Each entry is trimmed, runs of whitespace are collapsed to a single
    space, and the result is lower-cased.

    Args:
        lines: Mutable sequence of strings; its entries are overwritten.

    Returns:
        The same ``lines`` object, after modification.
    """
    for idx, text in enumerate(lines):
        # split() with no argument discards leading/trailing whitespace and
        # splits on any whitespace run, so one split/join trims and collapses.
        lines[idx] = ' '.join(text.split()).lower()
    return lines


def expand_words(lines):
    """Expand a fixed set of English contractions in every string, in place.

    Each line is split on whitespace. A token that exactly matches a key of
    the expansion table is replaced by its expansion. A token that merely
    ends in ``'s`` (e.g. ``it's``) has that suffix replaced by `` is``; the
    ``'s`` table entry could never match before because tokens were only
    compared as whole words. Tokens are re-joined with single spaces.

    Args:
        lines: Mutable sequence of strings; its entries are overwritten.

    Returns:
        The same ``lines`` object, after modification.
    """
    expansion_dict = {"ain't": "are not", "'s": " is", "aren't": "are not", "don't": "do not",
                      "didn't": "did not", "won't": "will not",
                      "can't": "cannot"}
    suffix = "'s"

    def _expand(word):
        # Whole-token matches take priority over the suffix rule.
        if word in expansion_dict:
            return expansion_dict[word]
        if word.endswith(suffix):
            return word[:-len(suffix)] + expansion_dict[suffix]
        return word

    for i, line in enumerate(lines):
        lines[i] = ' '.join(_expand(word) for word in line.split())
    return lines


def remove_punctuations(lines):
    """Delete every ``string.punctuation`` character from each string, in place.

    Args:
        lines: Mutable sequence of strings; its entries are overwritten.

    Returns:
        The same ``lines`` object, after modification.
    """
    # One deletion table serves every line.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    for idx, text in enumerate(lines):
        lines[idx] = text.translate(drop_punctuation)
    return lines


def remove_stopwords(lines):
    """Drop English stopwords from each string, in place.

    Every line is split on whitespace, tokens found in NLTK's English
    stopword list are discarded, and the rest are re-joined with single
    spaces.

    Args:
        lines: Mutable sequence of strings; its entries are overwritten.

    Returns:
        The same ``lines`` object, after modification.
    """
    english_stopwords = set(stopwords.words('english'))

    for idx, text in enumerate(lines):
        kept_tokens = [token for token in text.split() if token not in english_stopwords]
        lines[idx] = " ".join(kept_tokens)
    return lines


def remove_special_chars(lines):
    """Delete the characters ``- + . ^ : ,`` from each string, in place.

    Args:
        lines: Mutable sequence of strings; its entries are overwritten.

    Returns:
        The same ``lines`` object, after modification.
    """
    special_chars = re.compile('[-+.^:,]')
    for idx, text in enumerate(lines):
        lines[idx] = special_chars.sub('', text)
    return lines


def remove_emojis(lines):
    """Strip emoji (and other high code points) from each string, in place.

    NOTE(review): the character class is much wider than emoji. The ranges
    ``U+24C2-U+1F251`` and ``U+10000-U+10FFFF`` overlap, so together they
    match every character from U+24C2 upward; most of the other ranges are
    therefore redundant, and non-emoji text in that span is removed too.

    Args:
        lines: Mutable sequence of strings; its entries are overwritten.

    Returns:
        The same ``lines`` object, after modification.
    """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # originally labelled "chinese char" -- TODO confirm
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"  # repeat of the previous range
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # originally labelled "dingbats" -- TODO confirm
                               u"\u3030"
                               "]+", re.UNICODE)

    for idx, text in enumerate(lines):
        lines[idx] = emoji_pattern.sub('', text)
    return lines

In [None]:
def preprocess(texts):
    """Run the full cleaning pipeline over a list of tweets.

    The steps are applied in this order: whitespace/case normalisation,
    contraction expansion, punctuation removal, stopword removal,
    special-character removal, emoji removal. Each step rewrites the
    entries of the list it is given and returns that list.

    Args:
        texts: Mutable sequence of strings.

    Returns:
        The cleaned sequence returned by the final step.
    """
    pipeline = (
        strip_and_lowercase,
        expand_words,
        remove_punctuations,
        remove_stopwords,
        remove_special_chars,
        remove_emojis,
    )
    for step in pipeline:
        texts = step(texts)
    return texts

In [None]:
# Clean train and test tweets
train_texts, test_texts = preprocess(train_texts), preprocess(test_texts)

In [None]:
# Vocabulary cap handed to the Keras Tokenizer.
TOP_K = 2000
# Padded sequence length, measured in whitespace-separated tokens (the unit
# pad_sequences counts) rather than characters. Measuring characters made
# every sequence several times longer than its longest tweet in tokens.
# The leading [1] keeps the length positive even if every text is empty.
MAX_SEQUENCE_LENGTH = max([1] + [len(text.split()) for text in train_texts + test_texts])
batch_size = 256

### Converting texts to sequence of integers using keras tokenizer

In [None]:
# Fit the tokenizer on the cleaned training tweets, with TOP_K as its word limit.
tokenizer = Tokenizer(num_words=TOP_K)
tokenizer.fit_on_texts(train_texts)

# Integer-encode the training tweets and pad them to a common length.
train_sequences = tokenizer.texts_to_sequences(train_texts)
X = sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
y = np.array(targets)

In [None]:
# Hold out 10% of the encoded training data as a validation set (fixed seed).

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

In [None]:
# Function to create a tf data object

def input_fn(X: np.ndarray, y: np.ndarray, batch_size: int,  mode: str = 'eval'):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        X: Feature array; sliced along its first axis.
        y: Label array aligned with ``X``.
        batch_size: Number of examples per batch.
        mode: ``'train'`` shuffles the examples and repeats the dataset
            indefinitely; any other value yields a single ordered pass.

    Returns:
        The batched, prefetching dataset.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))

    if mode == 'train':
        # The shuffle buffer must hold the whole dataset for a uniform
        # shuffle; a buffer of one batch only mixes neighbouring examples.
        dataset = dataset.shuffle(buffer_size=max(len(X), 1))
        dataset = dataset.repeat()

    dataset = dataset.batch(batch_size=batch_size)
    # prefetch counts batches here, so let tf.data size the buffer instead
    # of queueing `batch_size` whole batches.
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

In [None]:
# 'train' mode shuffles and repeats; 'eval' mode gives one ordered pass.
train_dataset = input_fn(X=X_train, y=y_train, batch_size=batch_size, mode='train')
val_dataset = input_fn(X=X_val, y=y_val, batch_size=batch_size, mode='eval')

In [None]:
# Embedding input size: one more than the tokenizer's word-index size,
# capped at TOP_K (the num_words limit the tokenizer was created with).
# The +1 presumably reserves index 0 for padding -- TODO confirm.
num_features = min(len(tokenizer.word_index) + 1, TOP_K)

In [None]:
# Save only the weights, and only when validation accuracy improves.
# The deprecated `period` argument was dropped: `save_freq='epoch'` already
# asks for a check at the end of every epoch.
cp_callback = ModelCheckpoint(filepath='disaster_tweet_classification.hdf5',
                              monitor='val_accuracy',
                              save_freq='epoch', verbose=1,
                              save_best_only=True, save_weights_only=True)

# Stop training once validation accuracy has gone 5 epochs without improving.
early_stopping = EarlyStopping(monitor='val_accuracy',
                               verbose=1, patience=5)

### We will use a hybrid model which has both convolution and LSTM layers

In [None]:
def build_hybrid_model():
    """Build and compile the Conv1D + LSTM binary classifier.

    Layers, in order: an embedding over ``num_features`` token ids, a
    same-padded 1-D convolution, max pooling, an LSTM, and a single sigmoid
    output unit. The input length (``MAX_SEQUENCE_LENGTH``) and vocabulary
    size (``num_features``) are read from module-level variables.

    Returns:
        The model, compiled with Adam, binary cross-entropy loss and the
        accuracy metric.
    """
    model = Sequential()
    model.add(layers.InputLayer(input_shape=(MAX_SEQUENCE_LENGTH,), name="input"))
    model.add(layers.Embedding(input_dim=num_features,
                               output_dim=150,
                               input_length=MAX_SEQUENCE_LENGTH))
    model.add(layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
    model.add(layers.MaxPooling1D(pool_size=2))
    model.add(layers.LSTM(64, recurrent_dropout=0.5, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
    model.add(layers.Dense(1, activation='sigmoid'))

    # Pass the configured optimizer object itself. Previously it was created
    # but compile() received the string 'adam', so changing the learning
    # rate here had no effect.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=["accuracy"])
    return model

In [None]:
model = build_hybrid_model()
# The training dataset repeats indefinitely, so fit() needs an explicit step
# count: 1.5 passes' worth of full batches per epoch. Keras expects an
# integer here; the previous expression produced a float.
steps_per_epoch = int((y_train.size // batch_size) * 1.5)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    callbacks=[cp_callback, early_stopping])

In [None]:
# Training vs. validation loss per epoch.
for metric_key, curve_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
for metric_key, curve_label in (('accuracy', 'train_accuracy'), ('val_accuracy', 'val_accuracy')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Rebuild the architecture and load the weights file written by the
# checkpoint callback (it saves only when val_accuracy improves).
model = build_hybrid_model()
model.load_weights('disaster_tweet_classification.hdf5')

In [None]:
# Re-read the test CSV. This replaces the earlier NA-filled frame; the
# cleaned tweets themselves already live in `test_texts`.
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Encode the cleaned test tweets with the tokenizer fitted on the training
# set, then pad them to the same length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(test_texts)
lines = sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
def predict(review: np.ndarray):
    """Classify a single encoded tweet with the module-level ``model``.

    Args:
        review: One padded integer sequence; it is wrapped in a batch of
            size one before being passed to ``model.predict``.

    Returns:
        ``1`` if the predicted score is strictly greater than 0.5, else ``0``.
    """
    score = model.predict(np.array([review]))[0][0]
    return 1 if score > 0.5 else 0

In [None]:
# Submission skeleton: test ids now, predicted targets filled in below.
result_dict = {'id': list(test_df['id']), 'target': []}

In [None]:
# Score the whole padded test matrix in batches instead of issuing one
# model.predict call per tweet. Assigning (rather than appending) keeps the
# cell idempotent, so re-running it cannot grow the target list past the
# number of ids.
probabilities = (
    model.predict(lines, batch_size=batch_size) if len(lines) else np.empty((0, 1))
)
# Same decision rule as predict(): strictly above 0.5 means class 1.
result_dict['target'] = [int(score > 0.5) for score in probabilities[:, 0]]

In [None]:
# Two-column frame: id and predicted target.
result_df = pd.DataFrame(result_dict)

In [None]:
# Preview the first five predictions.
result_df.head(n=5)

In [None]:
# Write the submission file to the working directory, without the index column.
result_df.to_csv('submission.csv', index=False)