In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [32]:
import re
import string
import warnings
warnings.filterwarnings("ignore")

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

import seaborn as sns
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from keras.models import Model
import gensim.downloader as api
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, Sequential

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split, learning_curve
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [3]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Report the (rows, columns) shape of each frame.
print(f"Train data shape: {train_df.shape}\nTest data shape: {test_df.shape}")

In [4]:
# more information about columns
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_df.info()

In [5]:
# Column summary for the test frame. DataFrame.info() prints its own report
# and returns None, so no print() wrapper is used (it would also print "None").
test_df.info()

In [6]:
# Count of missing entries in each column of the training frame.
train_df.isnull().sum()

In [7]:
# Count of missing entries in each column of the test frame.
test_df.isnull().sum()

In [8]:
# Stack the training rows on top of the test rows in a single frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both frames keep their own row labels, so the low labels occur twice and any
# label-based lookup or alignment on full_df becomes ambiguous.
full_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)
full_df.head(10)  # the first rows shown come from the training frame

In [9]:
# counting the number of positive and negative tweets
sns.set(style="whitegrid", color_codes=True)
# The column is passed as x= explicitly. Given positionally, "target" lands in
# whichever parameter comes first in the installed seaborn's signature; when
# that parameter is `data`, the call collides with the data= keyword.
sns.catplot(x="target", data=full_df, kind="count", height=8)
plt.title("Distribution of Target Counts", size=20, weight="bold")
plt.xlabel("Target Label", size=14, weight="bold")
plt.ylabel("Counts", size=14, weight="bold")
plt.show()

In [10]:
# Percentage of missing entries for every column except the label column.
feature_columns = [col for col in full_df.columns if col != "target"]
missing_values = {
    col + "_missing_percent": round(100 * full_df[col].isnull().sum() / len(full_df), 2)
    for col in feature_columns
}
# One row of percentages, melted to long form: one (columns, percentage) row per column.
missing_values_df = (
    pd.DataFrame(missing_values, index=[0])
    .melt(var_name="columns", value_name="percentage")
)

# Bar chart of the percentages, each bar labelled with its own value.
plt.figure(figsize=(10, 8))
sns.set(style="whitegrid", color_codes=True)
seaborn_plot = sns.barplot(x="columns", y="percentage", data=missing_values_df)
for p in seaborn_plot.patches:
    bar_height = p.get_height()
    bar_center = p.get_x() + p.get_width() / 2.
    # Text is placed 9 points above the top-centre of the bar.
    seaborn_plot.annotate(
        format(bar_height, '.2f'),
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
plt.title("Percentage of Missing Values in Columns", size=20, weight="bold")
plt.xlabel("Columns", size=14, weight="bold")
plt.ylabel("Percentage", size=14, weight="bold")
plt.show()

In [11]:
# Histograms of tweet length for the train and test frames.
# `.str.len()` returns the number of characters in each string, not the number
# of words, so the titles and x-axis label describe characters.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
for ax, frame, colour, split_name in zip(axes, (train_df, test_df), ('orange', 'green'), ('train', 'test')):
    ax.hist(frame['text'].str.len(), color=colour, alpha=0.6)
    ax.set_xlabel('Characters')
    ax.set_ylabel('Number of tweets')
    ax.set_title(f'Characters per tweet ({split_name})')
plt.show()

In [12]:
# Join every non-missing value of the keyword column into one space-separated string.
all_keywords = " ".join(full_df["keyword"].dropna())

# Build the word cloud image from that string.
cloud_options = dict(
    width=800,
    height=500,
    max_font_size=112,
    background_color='white',
    random_state=24,
)
word_cloud = WordCloud(**cloud_options).generate(all_keywords)

# Show the image without axes.
plt.figure(figsize=(10, 8))
plt.title("All keywords in the data", size=20, weight="bold")
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [13]:
# Non-missing keywords of the rows whose target equals 1, joined with spaces.
disaster_keywords = full_df.loc[full_df["target"] == 1, "keyword"].dropna()
all_disaster_keywords = " ".join(disaster_keywords)

# Build the word cloud image from that string.
cloud_options = dict(
    width=800,
    height=500,
    max_font_size=112,
    background_color='white',
    random_state=24,
)
word_cloud = WordCloud(**cloud_options).generate(all_disaster_keywords)

# Show the image without axes.
plt.figure(figsize=(10, 8))
plt.title("All keywords associated to Disasters", size=20, weight="bold")
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [14]:
# Non-missing keywords of the rows whose target equals 0, joined with spaces.
non_disaster_keywords = full_df.loc[full_df["target"] == 0, "keyword"].dropna()
all_non_disaster_keywords = " ".join(non_disaster_keywords)

# Build the word cloud image from that string.
cloud_options = dict(
    width=800,
    height=500,
    max_font_size=112,
    background_color='white',
    random_state=24,
)
word_cloud = WordCloud(**cloud_options).generate(all_non_disaster_keywords)

# Show the image without axes.
plt.figure(figsize=(10, 8))
plt.title("All keywords associated to Non-Disasters", size=20, weight="bold")
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [15]:
# Text Cleaning 
def remove_URL(text):
    """Replace every ``http://`` / ``https://`` URL in ``text`` with `` httpsmark ``.

    Despite the name, the URL is not deleted: each match (the scheme plus all
    following non-whitespace characters) is swapped for the placeholder word
    ``httpsmark`` padded with one space on each side.
    """
    return re.sub(r'https?://\S+', r' httpsmark ', text)

def remove_hashtag(text):
    """Replace each ``#`` character in ``text`` with the word `` hashtag ``.

    Only the ``#`` symbol itself is substituted (with a space on each side);
    the tag text that follows it is left in place.
    """
    return re.sub(r'#', r' hashtag ', text)

def remove_exclamation(text):
    """Replace each ``!`` in ``text`` with the word `` exclamation ``.

    Every exclamation mark is substituted individually, so ``"!!"`` yields the
    placeholder twice.
    """
    return re.sub(r'!', r' exclamation ', text)

def remove_question(text):
    """Replace each ``?`` in ``text`` with the word `` question ``.

    The question mark is escaped in the pattern: a bare ``?`` is a regex
    quantifier with nothing to repeat, so compiling it raises ``re.error``
    and the function could never run.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every ``?`` substituted by ``" question "``.
    """
    question = re.compile(r'\?')
    return question.sub(r' question ', text)

def remove_punc(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # Translation table with no replacements; the third argument lists the
    # characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

def remove_emoji(string):
    """Replace each run of emoji/symbol characters in ``string`` with `` emoji ``.

    All the code-point ranges below are combined into a single regex character
    class followed by ``+``, so consecutive matching characters collapse into
    one placeholder.

    NOTE(review): several ranges are much wider than emoji alone. For example
    U+24C2..U+1F251 spans the CJK ideograph blocks and U+10000..U+10FFFF covers
    every supplementary-plane character, so non-emoji text in those ranges is
    replaced as well.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing through miscellaneous symbols and arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # (listed twice in the original pattern)
        u"\U000024C2-\U0001F251",  # very wide range, see NOTE above
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",  # all supplementary planes
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",                 # zero-width joiner
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",                 # variation selector-16
        u"\u3030",
    )
    emoji_pattern = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' emoji ', string)


# Apply the same cleaning steps, in the same order, to the text column of both
# frames: lower-case first, then each substitution/removal helper in turn.
cleaning_steps = (remove_URL, remove_hashtag, remove_exclamation, remove_punc, remove_emoji)
for frame in (train_df, test_df):
    cleaned_text = frame['text'].str.lower()
    for step in cleaning_steps:
        cleaned_text = cleaned_text.apply(step)
    # The cleaned column overwrites the original text in place.
    frame['text'] = cleaned_text



In [16]:
# Histograms of text length for the train and test frames at this point in the
# notebook. `.str.len()` returns the number of characters in each string, not
# the number of words, so the titles and x-axis label describe characters.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
for ax, frame, colour, split_name in zip(axes, (train_df, test_df), ('orange', 'green'), ('train', 'test')):
    ax.hist(frame['text'].str.len(), color=colour, alpha=0.6)
    ax.set_xlabel('Characters')
    ax.set_ylabel('Number of tweets')
    ax.set_title(f'Number of characters in text ({split_name})')
plt.show()

In [17]:
# Shorter aliases for the two frames; these are the same objects, not copies.
train, test = train_df, test_df

In [18]:
# Display the first ten rows of the training frame.
train.head(10)

In [19]:
# Labels as a NumPy array so they can be indexed with fold index arrays.
y = train["target"].values

# Five shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Both vectorizers are fitted on the train and test text together, then used
# to transform the training text only.
combined_text = train["text"].values.tolist() + test["text"].values.tolist()

# Bag-of-words counts.
count_vect = CountVectorizer().fit(combined_text)
train_count_vect = count_vect.transform(train["text"])

# TF-IDF weights.
tfidf_vect = TfidfVectorizer().fit(combined_text)
train_tfidf_vect = tfidf_vect.transform(train["text"])

In [39]:
# Classifiers to compare, keyed by the name used in the results table.
# SGDClassifier and RandomForestClassifier draw random numbers while fitting,
# so they get the same fixed seed (24) used elsewhere in this notebook; left
# unseeded, the fold scores changed on every run.
models = {
    "svm": SVC(),
    "logistic_regression": LogisticRegression(),
    "naive_bayes": MultinomialNB(),
    "SGD": SGDClassifier(random_state=24),
    "random_forest": RandomForestClassifier(random_state=24)
}

# Feature matrices of the training text, keyed by vectorizer name.
vectors = {
    "count_vect": train_count_vect,
    "tfidf_vect": train_tfidf_vect
}

In [40]:
def kfold(clf:str, vect_type:str, y, kfold):
    """Cross-validate one classifier on one feature matrix and return per-fold F1.

    Args:
        clf: Key into the module-level ``models`` dict.
        vect_type: Key into the module-level ``vectors`` dict.
        y: Label array, indexed with the fold index arrays.
        kfold: Splitter object exposing ``split(X, y)``.

    Returns:
        dict with ``"model_name"`` (``clf + "_" + vect_type``) and one
        ``"fold_<k>"`` entry per fold (1-based) holding that fold's F1 score.
    """
    # Imported here so the function does not depend on an edit to the import cell.
    from sklearn.base import clone

    results = {"model_name": clf + "_" + vect_type}
    X = vectors[vect_type]

    # perform kfold cv
    for fold, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
        # Fit an unfitted copy for each fold so the shared estimator stored in
        # `models` is never mutated and no state carries over between folds.
        model = clone(models[clf])

        # train on seen data, predict on unseen
        model.fit(X[train_idx], y[train_idx])
        y_preds = model.predict(X[valid_idx])

        results["fold_{}".format(fold+1)] = f1_score(y[valid_idx], y_preds)
    return results

In [41]:
# One result dict per (classifier, vectorizer) pair, classifiers in the outer loop.
all_models = [
    kfold(clf_name, vect_name, y, skf)
    for clf_name in models
    for vect_name in vectors
]

In [42]:
# Turn the list of per-model result dicts into a table: one row per
# classifier/vectorizer pair, then display it.
models_df = pd.DataFrame(all_models)
models_df

In [21]:
def get_word2vec_enc(corpus: "Iterable[Tuple[str, int]]", vocab_size:int, embedding_size:int, gensim_pretrained_emb:str) -> np.ndarray:
    """Build an embedding matrix from pretrained gensim word vectors.

    Args:
        corpus: Iterable of ``(word, index)`` pairs; ``index`` is the row the
            word's vector is written to.
        vocab_size: Number of rows in the returned matrix.
        embedding_size: Number of columns; must match the dimensionality of
            the pretrained vectors.
        gensim_pretrained_emb: Model name passed to ``gensim.downloader.load``.

    Returns:
        ``np.ndarray`` of shape ``(vocab_size, embedding_size)``. Rows for
        words missing from the pretrained vocabulary stay all-zero.
    """
    word_vecs = api.load(gensim_pretrained_emb)
    embedding_weights = np.zeros((vocab_size, embedding_size))
    for word, i in corpus:
        # Skip indices that do not fit the matrix (e.g. a vocabulary capped
        # below the tokenizer's full word index) instead of raising IndexError.
        if 0 <= i < vocab_size and word in word_vecs:
            embedding_weights[i] = word_vecs[word]
    return embedding_weights

In [22]:
# lstm configurations
n_epochs = 8
embedding_size = 300
max_length = 202
pretrained_embedding_file = "word2vec-google-news-300"

# Fit a Keras tokenizer on the training text and convert each text to a list
# of word indices; "<unk>" is the out-of-vocabulary token.
train_texts = train["text"].values
tokenizer = Tokenizer(oov_token="<unk>")
tokenizer.fit_on_texts(train_texts)
train_tokenized_list = tokenizer.texts_to_sequences(train_texts)

# One more row than there are indexed words.
vocab_size = 1 + len(tokenizer.word_index)

# Bring every sequence to exactly max_length entries.
X_padded = pad_sequences(train_tokenized_list, maxlen=max_length)

# Pretrained vectors arranged by tokenizer index.
embedding_matrix = get_word2vec_enc(
    tokenizer.word_index.items(),
    vocab_size,
    embedding_size,
    pretrained_embedding_file,
)

# Embedding layer initialised with those vectors and excluded from training.
embedding_layer = Embedding(
    vocab_size,
    embedding_size,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)

In [29]:
def my_LSTM(embedding_layer):
    """Build and compile a bidirectional-LSTM binary classifier.

    The given embedding layer feeds a bidirectional LSTM (64 units per
    direction), a 50-unit ReLU layer, dropout of 0.1 and a single sigmoid
    output. The model is compiled with binary cross-entropy, the Adam
    optimizer and accuracy as the metric.
    """
    print('Creating model...')
    recurrent_block = Bidirectional(LSTM(units=64, dropout=0.1, recurrent_dropout=0.1))
    model = Sequential([
        embedding_layer,
        recurrent_block,
        Dense(50, activation="relu"),
        Dropout(0.1),
        Dense(1, activation="sigmoid"),
    ])

    print('Compiling...')
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
    return model

In [30]:
# stratified kfold with LSTM
model_dict = {}
model_dict["model_name"] = "lstm_word_2_vec"

for fold, (train_idx, val_idx) in enumerate(skf.split(X=X_padded, y=y)):
    print(f"\nCurrently Training: {model_dict['model_name']}... Fold: {fold+1}")
    X_train, X_val = X_padded[train_idx], X_padded[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # train a freshly built model on this fold
    clf = my_LSTM(embedding_layer)
    clf.fit(X_train,
            y_train,
            epochs=1,
            verbose=1)

    # The network ends in a single sigmoid unit, so predict() returns one
    # probability per row. argmax over that one column is always 0 (every
    # tweet predicted negative, F1 of 0); threshold the probability instead.
    y_probs = clf.predict(X_val)
    y_preds = (y_probs.ravel() > 0.5).astype(int)

    model_dict["fold_{}".format(fold+1)] = f1_score(y_val, y_preds)

# adding results to models df
new_model = pd.DataFrame(model_dict, columns=models_df.columns, index=[0])
# Drop any row left by a previous run of this cell so re-running replaces the
# LSTM row instead of appending a duplicate.
previous_rows = models_df[models_df["model_name"] != model_dict["model_name"]]
models_df = pd.concat([previous_rows, new_model], ignore_index=True)

In [34]:
# Uncompiled copy of the LSTM architecture, built only to draw its diagram.
model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(units=64, dropout=0.1, recurrent_dropout=0.1)),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
])

# Write the layer diagram (with shapes and layer names) to model_plot.png.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [46]:
def my_LSTM(embedding_layer):
    """Build and compile the regularised bidirectional-LSTM binary classifier.

    This definition replaces the earlier ``my_LSTM`` in the notebook. Compared
    with it, a dropout of 0.4 follows the embedding layer and the 50-unit ReLU
    layer carries an L2 kernel penalty of 0.001. The model is compiled with
    binary cross-entropy, the Adam optimizer and accuracy as the metric.
    """
    print('Creating model...')
    recurrent_block = Bidirectional(LSTM(units=64, dropout=0.1, recurrent_dropout=0.1))
    hidden_layer = Dense(50, kernel_regularizer=regularizers.l2(0.001), activation="relu")
    model = Sequential([
        embedding_layer,
        Dropout(0.4),
        recurrent_block,
        hidden_layer,
        Dropout(0.1),
        Dense(1, activation="sigmoid"),
    ])

    print('Compiling...')
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
    return model

# training a single LSTM model on a 70/30 hold-out split
X_train, X_valid, y_train, y_valid = train_test_split(X_padded, y, test_size=.3, shuffle=True, random_state=24)

# initializing model
clf = my_LSTM(embedding_layer)
# summary() prints the table itself and returns None, so it is not wrapped in print().
clf.summary()

# training model
clf.fit(X_train,
        y_train,
        epochs=n_epochs,
        verbose=1)

# make predictions
# Predict on X_valid, the hold-out set created above. The previous code used
# X_val, a variable left over from the k-fold cell whose rows do not line up
# with y_valid.
# The model has one sigmoid output, so probabilities are thresholded at 0.5;
# argmax over a single column always returns 0.
y_preds = (clf.predict(X_valid).ravel() > 0.5).astype(int)

y_train_preds = (clf.predict(X_train).ravel() > 0.5).astype(int)

In [39]:
# BERT 
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

In [40]:
from transformers import BertTokenizer
# Load the tokenizer from the same checkpoint as the model used later in this
# notebook ('bert-base-uncased') so the two cannot drift apart; it was
# previously loaded from 'bert-large-uncased'.
# NOTE: this rebinds `tokenizer`, which earlier held the Keras Tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bert_encode(data, maximum_length) :
    """Encode each text in ``data`` with the module-level BERT ``tokenizer``.

    Args:
        data: Iterable of texts.
        maximum_length: Value passed as ``max_length``; padding up to that
            length is requested via ``pad_to_max_length=True``.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays, one row per text.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=maximum_length,
            pad_to_max_length=True,
            return_attention_mask=True,
        )
        for text in data
    ]
    input_ids = [encoding['input_ids'] for encoding in encodings]
    attention_masks = [encoding['attention_mask'] for encoding in encodings]
    return np.array(input_ids),np.array(attention_masks)

In [41]:
# Text and label columns of the training frame.
texts, target = train['text'], train['target']

# Token ids and attention masks, with a maximum length of 60.
train_input_ids, train_attention_masks = bert_encode(data=texts, maximum_length=60)

In [42]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

def create_model(bert_model, max_length=60, learning_rate=1e-5):
    """Wrap a BERT encoder in a small binary-classification head and compile it.

    Args:
        bert_model: Callable Keras-compatible BERT model that accepts
            ``[input_ids, attention_masks]``.
        max_length: Sequence length of both inputs. Defaults to 60, the value
            that was previously hard-coded; it must equal the length the
            inputs were encoded with.
        learning_rate: Adam learning rate. Defaults to 1e-5, as before.

    Returns:
        Compiled ``tf.keras`` model taking ``[input_ids, attention_masks]``
        and producing one sigmoid probability per example.
    """
    input_ids = tf.keras.Input(shape=(max_length,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_length,),dtype='int32')

    output = bert_model([input_ids,attention_masks])
    # Element 1 of the encoder output is used as the sentence representation
    # (presumably BERT's pooled output; confirm for the installed
    # transformers version).
    output = output[1]
    output = tf.keras.layers.Dense(32,activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(1,activation='sigmoid')(output)

    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras optimizers.
    model.compile(Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [43]:
from transformers import TFBertModel
# Load the pretrained 'bert-base-uncased' weights as a TensorFlow model.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [44]:
# Build the compiled classifier around the loaded BERT model and print its
# layer summary. This rebinds `model`, used earlier for the LSTM diagram.
model = create_model(bert_model)
model.summary()

In [45]:
# Draw the classifier's layer diagram with tensor shapes, at 48 dpi.
plot_model(model, show_shapes=True, dpi=48)

In [46]:
# Fine-tune the BERT classifier on the encoded training tweets for 3 epochs
# with a batch size of 10. validation_split=0.2 holds back 20% of the samples
# for validation (per the Keras docs this is the last 20%, taken before any
# shuffling). The returned History object is kept for plotting.
history = model.fit(
    [train_input_ids, train_attention_masks],
    target,
    validation_split=0.2, 
    epochs=3,
    batch_size=10
)

In [48]:
def plot_learning_curves(history, arr):
    """Plot pairs of training-history curves side by side, one panel per pair.

    Args:
        history: Object with a ``history`` dict mapping metric names to
            per-epoch value lists (as returned by Keras ``fit``).
        arr: Sequence of pairs ``[first_key, second_key]``; both keys must be
            present in ``history.history``. One panel is drawn per pair (the
            panel count used to be fixed at two).
    """
    # squeeze=False keeps a 2-D axes array even when there is a single panel.
    fig, ax = plt.subplots(1, len(arr), figsize=(20, 5), squeeze=False)
    for panel, pair in zip(ax[0], arr):
        first_key, second_key = pair[0], pair[1]
        panel.plot(history.history[first_key])
        panel.plot(history.history[second_key])
        panel.legend([first_key, second_key],fontsize=18)
        # One value per epoch on the x-axis and the metric itself on the
        # y-axis; these replace the placeholder labels 'A ' and 'B'.
        panel.set_xlabel('Epoch',fontsize=16)
        panel.set_ylabel(first_key,fontsize=16)
        panel.set_title(first_key + ' X ' + second_key,fontsize=16)

In [49]:
# Left panel: training vs. validation loss; right panel: training vs. validation accuracy.
plot_learning_curves(history, [['loss', 'val_loss'],['accuracy', 'val_accuracy']])