# NLP_DT

I am using this notebook as a guide: https://www.kaggle.com/donmarch14/disaster-tweets-prediction-nlp-guide.

In [None]:
import numpy as np 
import pandas as pd 
import os

import string
import emoji
import re
from wordcloud import WordCloud
import nltk #For Stemming, NLTK is needed
from nltk.stem.snowball import SnowballStemmer
import spacy
nlp = spacy.load('en_core_web_lg')

#SKlearn
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import backend as K
from tensorflow.keras.layers import Dense, Input
from keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

import matplotlib.pyplot as plt # matplotlib and seaborn for plotting
import seaborn as sns

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')


# Inspection (EDA)

In [None]:
# Load the competition's train and test CSV files into DataFrames
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts)
train_df.info()

In [None]:
# Show 10 randomly sampled training rows
train_df.sample(10)

In [None]:
# Report the (rows, columns) shape of the train and test frames
for label, frame in (("Train dataset shape : ", train_df),
                     ("Test dataset shape : ", test_df)):
    print(label, frame.shape)

In [None]:
# Look at the number of each type of target

# Count once and pass the data by keyword: recent seaborn releases no longer
# accept the x/y vectors of barplot() as positional arguments.
target_counts = train_df['target'].value_counts()
sns.barplot(x=target_counts.index, y=target_counts, palette='rocket')
plt.title('Targets')
plt.show()

In [None]:
# Plot the 25 most frequent keywords as horizontal bars
top_keywords = train_df['keyword'].value_counts()[:25]
sns.barplot(
    y=top_keywords.index,
    x=top_keywords,
    orient='horizontal',
    palette='viridis',
)
plt.title('Keywords')
plt.show()

In [None]:
# Split the train data into the disaster and non-disaster tweets

disaster_tweets = train_df.loc[train_df['target'] == 1, 'text']
non_disaster_tweets = train_df.loc[train_df['target'] == 0, 'text']

In [None]:
# Draw one word cloud per class, side by side

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[26, 8])

wordcloud1 = WordCloud(background_color='white', width=600, height=400)
wordcloud1 = wordcloud1.generate(" ".join(disaster_tweets))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Disaster Tweets', fontsize=40)

wordcloud2 = WordCloud(background_color='white', width=600, height=400)
wordcloud2 = wordcloud2.generate(" ".join(non_disaster_tweets))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Non Disaster Tweets', fontsize=40);

# Data Preprocessing

In [None]:
# Cleaning helpers: remove tokens and characters that might skew the models

def removeStopwords(text):
    """Return *text* with every token spaCy flags as a stop word dropped.

    The text is tokenised with the module-level ``nlp`` pipeline. Each kept
    token is prefixed with a single space and the result starts with one
    extra space, so the output always begins with whitespace.
    """
    kept = [str(token) for token in nlp(text) if not token.is_stop]
    return ' ' + ''.join(" " + piece for piece in kept)

def removePunctuations(text):
    """Return *text* with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def removeEmojis(text):
    """Drop every whitespace-separated word of *text* that contains an emoji.

    A word is discarded as a whole when any one of its characters is a known
    emoji. The surviving words are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.
    """
    # emoji >= 2.0 removed ``UNICODE_EMOJI`` in favour of ``EMOJI_DATA``;
    # fall back to the old table so older installations keep working.
    known_emojis = getattr(emoji, "EMOJI_DATA", None)
    if known_emojis is None:
        known_emojis = emoji.UNICODE_EMOJI["en"]

    kept_words = [
        word for word in text.split()
        if not any(char in known_emojis for char in word)
    ]
    return ' '.join(kept_words)

def removeNumbers(text):
    """Return *text* with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def removeLinks(text):
    """Return *text* with http(s):// and www. links removed.

    A link is matched from its prefix up to the next whitespace character;
    the whitespace around it is left in place.
    """
    # Raw string: "\S" and "\." are regex escapes, not Python string escapes,
    # and a non-raw literal triggers an invalid-escape warning.
    #   https?  matches both "http" and "https"
    #   A|B     matches either alternative
    #   \S+     matches one or more non-whitespace characters
    clean_text = re.sub(r'https?://\S+|www\.\S+', '', text)
    return clean_text

def clean(text):
    """Normalise a raw tweet for the downstream vectorisers.

    The text is lower-cased, then links, stop words, punctuation, emojis and
    digits are removed, in that order.
    """
    text = text.lower()
    # Links must be removed before punctuation: once ":", "/" and "." are
    # stripped, "https://t.co/x" becomes "httpstcox" and the URL pattern in
    # removeLinks can no longer match it.
    text = removeLinks(text)
    text = removeStopwords(text)
    text = removePunctuations(text)
    text = removeEmojis(text)
    text = removeNumbers(text)
    return text


In [None]:
# Run the cleaning pipeline over the tweet text of both data sets
train_df['text'] = train_df['text'].apply(clean)
test_df['text'] = test_df['text'].apply(clean)

In [None]:
# Word cloud over all cleaned training tweets (axes left visible)

tweets = train_df['text']
fig, ax1 = plt.subplots(1, figsize=[26, 8])
wordcloud1 = WordCloud(background_color='white', width=600, height=400)
wordcloud1 = wordcloud1.generate(" ".join(tweets))
ax1.imshow(wordcloud1)
ax1.axis('on')
ax1.set_title('Tweets', fontsize=40);

Thanks to https://www.kaggle.com/rftexas/text-only-bert-keras?scriptVersionId=31186559 for pointing out that some labels are wrong. For example, the training rows with ids $328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226$ have a target of 1 even though they are clearly not related to a disaster, so their target should be 0.

We change it to 0.

In [None]:
# Ids of training rows whose target is labelled 1 but should be 0
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]

# `.at` addresses a single cell by label; assigning through a boolean row
# mask is what `.loc` is for.
mislabelled = train_df['id'].isin(ids_with_target_error)
train_df.loc[mislabelled, 'target'] = 0
train_df[mislabelled]

# Text normalisation

Let's convert all the abbreviations to their full forms. Thanks to https://www.kaggle.com/rftexas/text-only-bert-keras?scriptVersionId=31186559.

In [None]:
# Lookup table from an abbreviation to its expanded form.
# Keys must be lower-case: convert_abbrev lower-cases a token before looking
# it up, so a key containing upper-case letters could never match.
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}


In [None]:
# Converts abbreviations to their full text
def convert_abbrev(word):
    """Expand every known abbreviation found in *word*.

    *word* may be a single token or a whole tweet. Each whitespace-delimited
    token is lower-cased and looked up in ``abbreviations``; a match is
    replaced by its expansion and anything else is left untouched. The
    whitespace between tokens is preserved.

    Looking up the whole string at once only expands it when the entire text
    is itself an abbreviation, which is never the case for a full tweet.
    """
    def _expand(match):
        token = match.group()
        return abbreviations.get(token.lower(), token)

    return re.sub(r'\S+', _expand, word)

In [None]:
# Expand abbreviations in the tweet text of both data sets
train_df['text'] = train_df['text'].apply(convert_abbrev)
test_df['text'] = test_df['text'].apply(convert_abbrev)

## Stemming

In [None]:
# Demo: print each word of the training tweet labelled 1 next to its stem
stemmer = SnowballStemmer(language='english')

tokens = train_df['text'][1].split()
clean_text = ' '

for token in tokens:
    print(' --> '.join((token, stemmer.stem(token))))

In [None]:
# Replaces each word with its stem

def stemWord(text):
    """Return *text* with each whitespace-separated word replaced by its stem.

    Stems come from an English Snowball stemmer. Every stem is prefixed with
    a single space and the result starts with one extra space.
    """
    snowball = SnowballStemmer(language='english')
    stems = [snowball.stem(word) for word in text.split()]
    return ' ' + ''.join(" " + stem for stem in stems)

In [None]:
# Stem the tweet text of both data sets
train_df['text'] = train_df['text'].apply(stemWord)
test_df['text'] = test_df['text'].apply(stemWord)

## Lemmatisation

In [None]:
# Outputs the word associated with the root

def lemmatizeWord(text):
    """Return *text* with each spaCy token replaced by its lemma.

    The text is tokenised with the module-level ``nlp`` pipeline. Every
    lemma is prefixed with a single space and the result starts with one
    extra space.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return ' ' + ''.join(" " + lemma for lemma in lemmas)


In [None]:
# Lemmatise the tweet text of both data sets
train_df['text'] = train_df['text'].apply(lemmatizeWord)
test_df['text'] = test_df['text'].apply(lemmatizeWord)

# Transforming tokens to a vector

## CountVectorizer

In [None]:
# Bag-of-words counts: learn the vocabulary on the training text only,
# then encode the test text with that same vocabulary
count_vectorizer = CountVectorizer()
train_bag = count_vectorizer.fit_transform(train_df.text)
test_bag = count_vectorizer.transform(test_df.text)

## TF-IDF
TF-IDF (term frequency–inverse document frequency) weights each term by two factors. The term frequency is the number of appearances of a term $t$ in a document divided by the number of terms in that document. The inverse document frequency is a score of how rare a given term is across the corpus: $\mathrm{IDF} = 1 + \log\left(\tfrac{N}{n}\right)$, where $N$ is the number of documents and $n$ is the number of documents containing the term $t$.

In [None]:
# TF-IDF over unigrams and bigrams; terms seen in fewer than 2 documents or
# in more than half of them are ignored. Fit on train, reuse for test.
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.5,
    ngram_range=(1, 2),
)
train_tfidf = tfidf.fit_transform(train_df.text)
test_tfidf = tfidf.transform(test_df.text)

## Word Vectors / Word embeddings

In [None]:
# Only the document vectors are needed here, so switch off every pipeline
# component while embedding. Calling disable_pipes() without any names
# disables nothing and leaves the whole pipeline running for each tweet.
# NOTE(review): assumes the loaded model ships static word vectors (as the
# "lg" models do), so the vectors do not depend on the disabled components.
with nlp.disable_pipes(*nlp.pipe_names):
    train_vectors = np.array([nlp(text).vector for text in train_df.text])
    test_vectors = np.array([nlp(text).vector for text in test_df.text])

# Building a Text Classification Model

## Support Vector Machines

In [None]:
# Linear SVM on the word vectors; dual=False speeds up training and the dual
# formulation is not needed here

svc_wordEmbed = LinearSVC(
    random_state=42,
    dual=False,
    max_iter=10000,
)
svc_wordEmbed.fit(train_vectors, train_df["target"])

In [None]:
# 3-fold cross-validated F1 score of the SVM on the word vectors

scores = model_selection.cross_val_score(
    svc_wordEmbed,
    train_vectors,
    train_df["target"],
    cv=3,
    scoring="f1",
)
scores

## XGBoost

In [None]:
# Gradient-boosted trees on the word vectors, scored with 3-fold F1
xgb_wordEmbed = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
)
scores = model_selection.cross_val_score(
    xgb_wordEmbed,
    train_vectors,
    train_df["target"],
    cv=3,
    scoring="f1",
)
scores

## Naive Bayes

In [None]:
# Multinomial Naive Bayes on the bag-of-words counts:
# cross-validate with 3-fold F1, then fit on the full training set
clf_NB = MultinomialNB()
scores = model_selection.cross_val_score(
    clf_NB,
    train_bag,
    train_df["target"],
    cv=3,
    scoring="f1",
)
clf_NB.fit(train_bag, train_df["target"])

In [None]:
# Multinomial Naive Bayes on the TF-IDF features:
# cross-validate with 3-fold F1, then fit on the full training set
clf_NB_TFIDF = MultinomialNB()
scores = model_selection.cross_val_score(
    clf_NB_TFIDF,
    train_tfidf,
    train_df["target"],
    cv=3,
    scoring="f1",
)
clf_NB_TFIDF.fit(train_tfidf, train_df["target"])

## Logistic Regression

In [None]:
# Logistic regression on the bag-of-words counts:
# cross-validate with 3-fold F1, then fit on the full training set
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(
    clf,
    train_bag,
    train_df["target"],
    cv=3,
    scoring="f1",
)
clf.fit(train_bag, train_df["target"])

In [None]:
# Fitting a simple Logistic Regression on TFIDF
clf_tfidf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf_tfidf, train_tfidf, train_df["target"], cv=3, scoring="f1")
# Fit on the TF-IDF matrix, the same features the cross-validation above
# used; fitting on train_bag would train this model on the raw counts.
clf_tfidf.fit(train_tfidf, train_df["target"])

## Neural Network

In [None]:
# Inspect the dimensions of the training word-vector matrix
train_vectors.shape

In [None]:
# Functions for evaluating the NN

def recall_m(y_true, y_pred):
    """Recall metric: rounded true positives over rounded actual positives.

    Both products are clipped to [0, 1] and rounded before summing, and
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric: rounded true positives over rounded predicted positives.

    Values are clipped to [0, 1] and rounded before summing, and
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio


In [None]:
# Learning rate

# Halve the learning rate when the validation F1 has not improved for 3
# epochs, down to a floor of 1e-5. F1 is a higher-is-better score, so the
# direction has to be given explicitly: the default mode='auto' only treats
# accuracy-like metric names as something to maximise.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_f1_m',
                                            mode='max',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)


early_stopping = EarlyStopping(
    min_delta=0.001, # minimum amount of change to count as an improvement
    patience=5, # how many epochs to wait before stopping
    restore_best_weights=True,
)


In [None]:
#  Neural Network

# Two hidden layers of 256 ReLU units with dropout, and a sigmoid output for
# the binary target. input_shape describes ONE sample and must not include
# the number of samples, so it is the embedding width taken from the data
# rather than the hard-coded [7613, 300] of the whole training matrix.
nn = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=[train_vectors.shape[1]]),
    layers.Dropout(0.4),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(1,activation='sigmoid')
])

nn.compile(loss='binary_crossentropy',optimizer='adam',metrics=[f1_m])
# Hold out the last 10% of the training rows for validation
history=nn.fit(
    train_vectors,train_df["target"],
    validation_split=0.1,
    batch_size=128,
    epochs=25,
    callbacks=[early_stopping,learning_rate_reduction])

In [None]:
# Plot the training/validation F1 and loss curves recorded per epoch
history_frame = pd.DataFrame(history.history)
history_frame[['f1_m', 'val_f1_m']].plot()
history_frame[['loss', 'val_loss']].plot();

In [None]:
# Predict on the test vectors, then threshold the outputs at 0.5 in place
pred = nn.predict(test_vectors)

above_half = pred > 0.5
at_most_half = pred <= 0.5
pred[above_half] = 1
pred[at_most_half] = 0

## BERT

BERT stands for Bidirectional Encoder Representations from Transformers. Unlike conventional text classifiers, BERT does not read the text from left to right. Instead, it is non-directional, meaning it reads the whole string before making any judgements.

In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
# Download the official BERT tokenization module from the TensorFlow Models repository

In [None]:
import tokenization

# Helper function for BERT
def bert_encode(texts, tokenizer, max_len=512):
    """Encode *texts* into fixed-length BERT input arrays.

    Each text is tokenised, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` markers, converted to ids and right-padded with 0
    up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row.

    Returns:
        Tuple of three arrays (token ids, padding masks, segment ids), one
        row per text. The mask is 1 for real tokens and 0 for padding; the
        segment ids are all 0.
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # Keep two slots free for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        id_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
# Builds a BERT model

def build_model(bert_layer, max_len = 128, lr = 1e-5):
    """Build and compile a binary classifier on top of a BERT layer.

    Args:
        bert_layer: callable layer that takes ``[word ids, mask, segment ids]``
            and returns ``(pooled_output, sequence_output)``.
        max_len: length of each of the three integer input sequences.
        lr: learning rate passed to the Adam optimizer.

    Returns:
        A compiled model with a single sigmoid output, trained with binary
        cross-entropy and reporting the ``f1_m`` metric.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name="segment_ids")

    # Only the pooled output feeds the classifier head.
    pooled_output, _ = bert_layer([input_word_ids, input_mask, segment_ids])
    # The previous version also built a Dense(1, relu) + Dropout(0.8) branch
    # here that was never connected to the output; it had no effect on the
    # model and has been dropped.
    out = Dense(1,activation="sigmoid")(pooled_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    adam = tf.keras.optimizers.Adam(lr)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=[f1_m])

    return model

# Making the Submission

In [None]:
# Load the sample submission file and inspect its structure and first rows
ss = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
ss.info()
ss.head()

In [None]:
# Preview the first rows of the processed test frame
test_df.head()

In [None]:
# Predict with the neural network, label every output above 0.5 as 1 and
# the rest as 0, then write the submission file
pred = nn.predict(test_vectors)
target = [1 if p > 0.5 else 0 for p in pred]

ss['target'] = target
ss.to_csv('nlpdt_nn.csv', index=False)