In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import os
import matplotlib.pyplot as plt
from matplotlib import pylab
import emoji
import re
import contractions
from bs4 import BeautifulSoup
import unicodedata
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
import tensorflow_hub as hub
import tokenization

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Fetch every NLTK resource this notebook relies on so it runs on a fresh environment:
#   - "wordnet"   : used by WordNetLemmatizer in the lemmatization step
#   - "omw-1.4"   : needed next to WordNet by some NLTK releases (harmless otherwise)
#   - "stopwords" : used by stopwords.words("english")
for nltk_resource in ("wordnet", "omw-1.4", "stopwords"):
    nltk.download(nltk_resource)

In [None]:
# IPython magics: these lines only work inside a notebook/IPython kernel.
# Turn on IPython's "greedy" tab-completion mode.
%config IPCompleter.greedy=True
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the nb_black extension (presumably auto-formats cells with black — requires nb_black to be installed).
%load_ext nb_black

In [None]:
# Dataset locations, resolved relative to the parent of the current working directory.
curr_dir = ".."
train_file, test_file = (
    os.path.join(curr_dir, "dataset", csv_name)
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Plot defaults shared by every figure in the notebook: wide canvas, extra-large text.
params = {
    "legend.fontsize": "x-large",
    "figure.figsize": (16, 5),
    "axes.labelsize": "x-large",
    "axes.titlesize": "x-large",
    "xtick.labelsize": "x-large",
    "ytick.labelsize": "x-large",
}
# Apply the seaborn theme FIRST: set_theme() also installs seaborn's plotting
# context, which rewrites the font-size rcParams. Updating rcParams afterwards
# makes sure the sizes above are the ones that actually take effect.
sns.set_theme(style="darkgrid")
pylab.rcParams.update(params)

In [None]:
# Load the training CSV located at `train_file` and preview its first rows.
df = pd.read_csv(train_file)
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
def plotBarGraph(data, labelx, labely, title, switch_axis=False):
    """Draw and show a seaborn bar plot built from a Series-like ``data``.

    By default the values of ``data`` run along the x-axis and its index labels
    along the y-axis; ``switch_axis=True`` swaps the two.

    Args:
        data: Object exposing ``.index.tolist()`` and ``.tolist()``
            (e.g. the result of ``value_counts()``).
        labelx: Text for the x-axis label.
        labely: Text for the y-axis label.
        title: Plot title.
        switch_axis: When True, put the index on x and the values on y.
    """
    categories = data.index.tolist()
    values = data.tolist()
    if switch_axis:
        horizontal, vertical = categories, values
    else:
        horizontal, vertical = values, categories

    sns.barplot(x=horizontal, y=vertical)
    plt.title(title)
    plt.xlabel(labelx)
    plt.ylabel(labely)
    plt.show()

In [None]:
# Ten most frequent values of the `location` column, most common first.
locations_map = (
    df["location"].value_counts().sort_values(ascending=False).head(10)
)
plotBarGraph(locations_map, "Frequency", "Countries", "Tweet locations")

In [None]:
# Top 20 keywords among the tweets labelled as disasters (target == 1)
keywords_map = (
    df.loc[df["target"] == 1, "keyword"]
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)
plotBarGraph(keywords_map, "Frequency", "Keywords", "Top 20 disaster related keywords")

In [None]:
# Top 20 keywords among the tweets labelled as non-disasters (target == 0)
keywords_non_disaster_map = (
    df.loc[df["target"] == 0, "keyword"].value_counts(ascending=False).head(20)
)
plotBarGraph(
    keywords_non_disaster_map,
    "Frequency",
    "Keywords",
    "Top 20 non-disaster related keywords",
)

In [None]:
# Written observation about the two keyword charts above (printed as cell output).
print(
    "In the graphs(Top 20 disaster keywords and non-disaster keywords) above\n"
    " we see that disaster keywords are natural or artificial calamilites.\n"
    "While non-disaster keywords are too generalized keywords and not sufficient to describe a disaster."
)

In [None]:
# Class balance: number of rows per value of the `target` column.
sns.countplot(data=df, x="target")
plt.show()

In [None]:
# Count duplicated vs. first-seen tweet texts (nothing is removed in this cell)
text_data_duplicate_map = df["text"].duplicated().value_counts()
print(text_data_duplicate_map)
plotBarGraph(
    data=text_data_duplicate_map,
    labelx="Duplicate",
    labely="Frequency",
    title="Text Data Duplicate or Not",
    switch_axis=True,
)

In [None]:
# Keep only the first occurrence of each distinct tweet text.
# NOTE: `df` is modified in place, so re-running earlier cells will show the reduced frame.
print("Original dataframe shape: ", df.shape)
df.drop_duplicates(subset="text", keep="first", inplace=True)
print("Dropping duplicate rows and keeping original value shape: ", df.shape)
# Sanity check: value counts of the duplicated() flag after the drop.
df.text.duplicated().value_counts()

In [None]:
def search_text_data(query, column="text"):
    """Return the values of ``df[column]`` that contain ``query``.

    ``query`` is handed to ``Series.str.contains`` with its default settings, so it
    is treated as a case-sensitive regular expression.

    Rows whose value is missing (NaN) count as "no match" (``na=False``). Without
    that, a column with missing values produces a mask containing NaN, which
    pandas refuses to use for boolean indexing.

    Args:
        query: Pattern to look for.
        column: Name of the column of the notebook-level ``df`` to search.

    Returns:
        The matching entries of ``df[column]``.
    """
    matches = df[column].str.contains(query, na=False)
    return df.loc[matches, column]


print(search_text_data("volcano")[:5])

In [None]:
# Lower case text data
def lower_case_data(data=""):
    """Return ``data`` converted to lower case."""
    return data.lower()

In [None]:
# Handle Emojis
def sentences_with_emojis(id_texts):
    """Collect the sentences that contain at least one emoji.

    Args:
        id_texts: Indexable pair; element 0 holds the identifiers and element 1
            the matching texts. The two are iterated in parallel.

    Returns:
        A list of ``(index, sentence)`` tuples for the sentences containing an
        emoji, or a fixed message string when no sentence contains one.
    """
    indices, texts = id_texts[0], id_texts[1]
    # emoji.emoji_count() is used for detection because emoji.get_emoji_regexp()
    # was removed in emoji 2.0 (and was being re-fetched for every sentence).
    sentences = [
        (index, sentence)
        for index, sentence in zip(indices, texts)
        if emoji.emoji_count(sentence) > 0
    ]
    if not sentences:
        return "Sentences are clean and don't have emojis!"
    return sentences


# Source: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def clean_emojis(text):
    """Drop every whitespace-separated token of ``text`` that contains an emoji.

    The surviving tokens are re-joined with single spaces, so runs of whitespace
    are collapsed as a side effect.

    Detection uses ``emoji.emoji_count`` rather than membership in
    ``emoji.UNICODE_EMOJI``: that mapping became keyed by language code in later
    emoji 1.x releases (so single characters never matched it) and was removed in
    emoji 2.0. As a bonus, multi-codepoint emojis (flags, keycaps, ...) are
    recognised too.

    Args:
        text: Input string.

    Returns:
        ``text`` without the tokens that contain an emoji.
    """
    return " ".join(
        token for token in text.split() if emoji.emoji_count(token) == 0
    )

In [None]:
# Clean urls
def clean_urls(text):
    """Strip every ``http://`` / ``https://`` URL (up to the next whitespace) from ``text``."""
    url_pattern = r"https?://\S+"
    return re.sub(url_pattern, "", text)

In [None]:
# Remove all sorts of special characters and punctuations.
def removeSpecialChar(text):
    """Keep only alphanumeric characters and word separators from ``text``.

    Alphanumeric characters (per ``str.isalnum``) are kept unchanged and every
    whitespace character becomes a plain space. Everything else (punctuation,
    symbols) is dropped.

    Newlines and tabs are mapped to a space instead of being discarded, so that
    words separated only by a line break are not glued together
    (``"fire!\\nnow"`` -> ``"fire now"``, not ``"firenow"``).

    Args:
        text: Input string.

    Returns:
        The filtered string.
    """
    kept = []
    for char in text:
        if char.isalnum():
            kept.append(char)
        elif char.isspace():
            kept.append(" ")
    return "".join(kept)

In [None]:
# Check any html text in df.text (not cleaned yet) column
def checkHtml(text):
    """Return True when the HTML parser finds at least one tag in ``text``."""
    first_tag = BeautifulSoup(text, "html.parser").find()
    return first_tag is not None


# Flag, for every raw tweet, whether it contains any HTML tag, then report.
html_sentence_map = df["text"].apply(checkHtml).tolist()
if any(html_sentence_map):
    print("There is some html text!")
else:
    print("No text containing html found!")

In [None]:
# Remove accented text
def remove_accented_chars(text):
    """Replace accented characters by their ASCII base and drop what has none.

    The text is NFKD-normalised (splitting letters from their combining marks),
    then encoded to ASCII while ignoring everything that cannot be encoded.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8", "ignore")

In [None]:
# Explore Stopwords list from nltk
def remove_words_having_not(words=None):
    """Remove every entry containing the substring ``"not"`` from ``words``.

    The match is a plain substring test, so ``"not"``, ``"do not"`` and
    ``"cannot"`` are all removed.

    The list is filtered in one pass and then updated in place. Removing items
    while iterating over the same list makes the loop skip the element that
    follows each removal, which left some ``"not"`` entries behind.

    Args:
        words: List of strings; it is modified in place. When omitted, a fresh
            ``[""]`` list is used (no shared mutable default).

    Returns:
        The same list object, without the entries that contain ``"not"``.
    """
    if words is None:
        words = [""]
    words[:] = [word for word in words if "not" not in word]
    return words


# Start from NLTK's English stopword list.
stopwords_list = stopwords.words("english")
print(stopwords_list)

# Expand contractions and strip punctuation from every stopword, then drop the
# entries containing "not".
stopwords_list = [
    removeSpecialChar(contractions.fix(word)) for word in stopwords_list
]
stopwords_list = remove_words_having_not(stopwords_list)
print("\n---------------Stopwords after preprocessing---------------\n")
print(stopwords_list)

In [None]:
# Remove stopwords from nltk corpus
def remove_stopwords(text):
    """Drop the whitespace-separated words of ``text`` found in ``stopwords_list``.

    Every kept word is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    return "".join(
        word + " " for word in text.split() if word not in stopwords_list
    )

In [None]:
# Remove numbers from text
def remove_numbers(text=""):
    """Strip digit characters from every whitespace-separated word of ``text``.

    Each word (even one left empty after stripping) is followed by a single
    space in the result.
    """
    stripped_words = (
        "".join(char for char in word if not char.isdigit())
        for word in text.split()
    )
    return "".join(stripped + " " for stripped in stripped_words)

In [None]:
# lemmatization
def lemmatize(text):
    """Lemmatize every whitespace-separated word of ``text`` with WordNet.

    Each lemmatized word is followed by a single space in the result.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return "".join(
        wordnet_lemmatizer.lemmatize(token) + " " for token in text.split()
    )

In [None]:
# Final data cleaning step is removing non-essential whitespaces
def remove_white_space(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)

In [None]:
# Text cleaning pipeline
def clean_sentence_pipeline(text):
    """Run ``text`` through every cleaning step, in order, and return the result.

    Order: lower-casing, emoji removal, URL removal, contraction expansion,
    special-character removal, accent removal, stopword removal, digit removal,
    lemmatization and finally whitespace normalisation.
    """
    cleaning_steps = (
        lower_case_data,
        clean_emojis,
        clean_urls,
        contractions.fix,
        removeSpecialChar,
        remove_accented_chars,
        remove_stopwords,
        remove_numbers,
        lemmatize,
        remove_white_space,
    )
    for step in cleaning_steps:
        text = step(text)
    return text


def clean_text_column(col_name):
    """Store the cleaned version of ``df[col_name]`` in ``df["clean_text"]``.

    Every value of the source column goes through ``clean_sentence_pipeline``.
    The notebook-level ``df`` is modified in place; nothing is returned.

    Args:
        col_name: Name of the ``df`` column holding the raw text.
    """
    # The column is created directly from the result: the former
    # `df["clean_text"] = ""` placeholder was overwritten on the very next line.
    df["clean_text"] = df[col_name].apply(clean_sentence_pipeline)


clean_text_column("text")

In [None]:
# Comparison of cleaned texts
df[["text", "clean_text"]]

In [None]:
# Bert Model Implementation
'''
BERT: Bidirectional Encoder Representations from Transformers

Vocab:
a. Masked Language Model: Mask random words in a sentence(fill in the blanks) 
    to understand bidirectional context.
b. Next sentence prediction: Given two sentences, detect whether the second sentence follows the first.

Process:
Token inputs -> BERT Transformer Encoders -> Output as next sentence prediction and mask language modeling
-> Pass to sigmoid for classification.

1. Token Inputs
To generate token inputs, we need token embeddings(WordPiece Embedding) + Segment Embeddings(distinguish sentences)
+ Position Embeddings(position of word within a sentence encoded as a vector).

token embeddings: [CLS] token in the beginning of the sentence and [SEP] token at the end of the sentence
segment embeddings: distinguish sentences with tokens assigned to each sentence
position embedding: unique tokens assigned to each word in the sentence.
 
'''


'''
Embedding space=> Map words in sentence to a word cluster(embedding space)(Glove) to generate a vector for the word
Positional Encoder=> Context of the word with respect to the sentence vector
Embedding space + Positional encoding = encoding of word with context= EC

Encoder Block:feed input ("The red dog")
EC -> Multi-headed attention layer -> Feed forward layer

Attention layer: How relevant is the ith word with respect to the other words in the same sentence
Feed forward nets: pass attention vectors into a simple neural net

Decoder Block: feed output ("le chien rouge")
Take the EC of output
EC -> multi-headed attention -> multi-headed attention (encoder-decoder attention mapping; English to French translation happens here)
-> Feed forward layer -> linear -> softmax

'''

In [18]:
# !wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

zsh:1: command not found: wget


In [None]:
# Bert config
# Load the uncased English BERT encoder (12 layers, hidden size 768, 12 heads — per the
# model name in the URL) from TF Hub as a trainable Keras layer.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True)
# Read the vocabulary file path and the lower-casing flag shipped with the loaded model...
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# ...and build the tokenizer from them with the imported `tokenization` module.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)