In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
sns.set(style='whitegrid')

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report,confusion_matrix

from collections import defaultdict
from collections import Counter

import re
import gensim
import string

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras.initializers import Constant

import warnings
warnings.simplefilter('ignore')

In [None]:
import nltk
# NOTE(review): 'all' fetches every NLTK corpus and model. The cells visible in this
# section only read the 'stopwords' corpus (which is downloaded again further down);
# confirm whether later cells need anything else before narrowing this call.
nltk.download('all')

## Exploratory Data Analysis:

In [None]:
df=pd.read_csv('../input/nlp-getting-started/train.csv')
df_test=pd.read_csv('../input/nlp-getting-started/test.csv')
sample_submission=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
df.shape, df_test.shape

In [None]:
df

In [None]:
df.loc[:,['text','target']]

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.target.value_counts()

As shown above, the class distribution is slightly imbalanced, so we should expect the predictions to be somewhat biased towards class 0 (non-disaster).

In [None]:
# Count the tweets per class under readable labels and draw them as a pie chart.
# The two columns of `pie1` are named explicitly ('index' = class label, 'target' =
# count) because the names produced by value_counts().reset_index() differ between
# pandas 1.x ('index', 'target') and pandas 2.x ('target', 'count'); relying on the
# implicit names breaks the y='target' / labels=pie1['index'] lookups on pandas 2.x.
df2 = df.copy(deep=True)
pie1 = (
    df2['target']
    .map({1: 'disaster', 0: 'non-disaster'})
    .value_counts()
    .rename_axis('index')
    .reset_index(name='target')
)
pie1.plot(kind='pie', title='Pie chart of Disaster/Non-disaster tweets', y='target',
          autopct='%1.1f%%', shadow=False, labels=pie1['index'], legend=False, fontsize=14, figsize=(12,12))

It would be interesting to see the number of characters contained in the text of each class, even better showing them as histograms so as to see if there is a specific pattern for any class:

In [None]:
sns.set(style='whitegrid')
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(25, 8))

# One panel per class: (axis, target value, bar colour, panel title).
char_panels = (
    (ax1, 0, 'b', 'Non-disaster tweets'),
    (ax2, 1, 'r', 'Disaster tweets'),
)
for axis, class_value, colour, panel_title in char_panels:
    # Number of characters in every tweet of this class.
    char_counts = df.loc[df['target'] == class_value, 'text'].str.len()
    axis.hist(char_counts, bins=30, color=colour)
    axis.set_title(panel_title)

f.suptitle('Histogram number of characters in tweets')

The two histograms above look very similar, so character counts alone do not reveal a feature that distinguishes either class. It is more informative to repeat the analysis at the word level.

For this reason, we are going to compute and display a histogram of the number of words in the tweets of each class:

In [None]:
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(25, 8))

# One panel per class: (axis, target value, bar colour, panel title).
word_panels = (
    (ax1, 0, 'b', 'Non-disaster tweets'),
    (ax2, 1, 'r', 'Disaster tweets'),
)
for axis, class_value, colour, panel_title in word_panels:
    # Number of whitespace-separated words in every tweet of this class.
    word_counts = df.loc[df['target'] == class_value, 'text'].str.split().map(len)
    axis.hist(word_counts, bins=29, color=colour)
    axis.set_title(panel_title)

f.suptitle('Histogram number of words in tweets')

For the instances that correspond to a disaster, I will append the content of the keyword feature to the end of the tweet, so that the sentence contains more impactful words and the model can more easily find the words that disaster tweets have in common. To do this, we first obtain the indexes of the instances with target = 1 ('Disaster'):

In [None]:
disaster_index=df[df['target']==1].index.values

In [None]:
len(disaster_index)

As an example let us print the keyword followed by the tweet of the instance 63 which does correspond to disaster:

In [None]:
print(df.iloc[63,1])
print(df.iloc[63,3])

And after applying the function the text for such instance should be as follows:

In [None]:
df.iloc[63,3] + ' ' + df.iloc[63,1]

Now, let's apply this function to all instances in the list 'disaster_index':

In [None]:
# Append each disaster tweet's keyword to the end of its text in one vectorised step.
# - Rows are selected by index label and columns by name, so the result no longer
#   depends on the frame having a 0..n-1 index or on the column order.
# - Rows without a keyword are skipped instead of getting the literal string "nan".
# - This cell is not idempotent: running it twice appends the keyword twice.
# NOTE(review): only rows with target == 1 are modified and the visible code never does
# the same for df_test, so the appended keyword acts as a label marker; validation
# scores computed on a split of `df` are likely optimistic — confirm this is intended.
rows_to_extend = df['keyword'].notna() & df.index.isin(disaster_index)
df.loc[rows_to_extend, 'text'] = (
    df.loc[rows_to_extend, 'text'].astype(str)
    + ' '
    + df.loc[rows_to_extend, 'keyword'].astype(str)
)

In [None]:
df.info()

## Cleaning:

The tweets contained in the dataset are almost raw, which means we have to get rid of all 'impurities' such as tags, symbols, punctuation, emojis, etc. These do not add significant information to the prediction and, moreover, make our sentences more subjective. This process comprises 7 key steps which will make our sentences partially suitable for training the model.

### Removing URLs: 
Some tweets, whether disaster or non-disaster, include links (URLs) to videos or other web pages containing key information about the subject they are trying to communicate. As we want to clean the sentences, we must get rid of them. The function that applies this step is called remove_URL:

In [None]:
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http://`` / ``https://`` link and ``www.`` address deleted.

    A link is taken to run up to the next whitespace character. Nothing is inserted in
    its place, so the whitespace that surrounded a removed link is left untouched.
    """
    link_pattern = r'https?://\S+|www\.\S+'
    return re.sub(link_pattern, '', text)

remove_URL(example)

In [None]:
df['text']=df['text'].apply(lambda x : remove_URL(x))

In [None]:
df_test['text']=df_test['text'].apply(lambda x : remove_URL(x))

### Removing HTML tags:

We have to consider that some tweets were obtained using web scraping; with this method the components of a publication are accompanied by special tags identifying them. As such tags are not useful, we must get rid of them to keep only the text. The function that applies this step is called remove_html:

In [None]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag deleted.

    A tag is the shortest run from a ``<`` to the next ``>`` on the same line (the
    pattern's ``.`` does not cross newlines); the text between tags is kept as is.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)
    
print(remove_html(example))

In [None]:
df['text']=df['text'].apply(lambda x : remove_html(x))

In [None]:
df_test['text']=df_test['text'].apply(lambda x : remove_html(x))

### Removing Emojis:

Emojis are an efficient way to show the feeling of the publishers in the message, we could translate the meaning of them to words and help to improve the scope of the message. These could be useful or confuse the algorithm when finding the same feeling for disaster and non-disaster tweets, because of this we prefer to get rid of them, the function which applies such step will be called remove_emoji:

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with every character inside the code-point ranges below deleted.

    Note that the last range (U+24C2 to U+1F251) contains all the other ranges and also
    many non-emoji characters that lie between those two code points (CJK ideographs,
    for example), so those are removed as well.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    return re.sub("[" + code_point_ranges + "]+", '', text, flags=re.UNICODE)

remove_emoji("Omg another Earthquake 😔😔")

In [None]:
df['text']=df['text'].apply(lambda x: remove_emoji(x))

In [None]:
df_test['text']=df_test['text'].apply(lambda x: remove_emoji(x))

### Removing numbers:

In the current project we will only focus on words in order to classify the tweets, obviously the number could be useful because they can mean coordinates of a disaster, code of an accident, hour of accident, number of people killed or also can mean the release year of a videogame, number of followers of an influencer, etc. The use of these numbers in the model can be pending for a next project.

In [None]:
# Delete every run of digits. regex=True is passed explicitly because pandas >= 2.0
# treats the pattern as a literal string by default, which would leave all digits in
# place; the raw string keeps '\d' from being read as a (deprecated) string escape.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)

In [None]:
# Delete every run of digits. regex=True is passed explicitly because pandas >= 2.0
# treats the pattern as a literal string by default, which would leave all digits in
# place; the raw string keeps '\d' from being read as a (deprecated) string escape.
df_test['text'] = df_test['text'].str.replace(r'\d+', '', regex=True)

### Contractions and acronyms:
People world-wide make use of acronyms to speed-up the publishing of a tweet, some of them can be miswritten and others can be decomposed creating words that make sense, this process is exhaustive and requires investing a long time searching the meaning of each one, the function which replaces the contractions and acronyms by the words they stand for will be called cleaner: 


In [None]:
def _standalone(token):
  """Build a pattern that matches ``token`` only when no ASCII letter touches either side."""
  return r"(?<![A-Za-z])" + token + r"(?![A-Za-z])"


# Pass 1 - mis-decoded apostrophes become a real "'" so the contraction rules below
# also catch spellings such as "don\x89Ûªt" or "I\x89Ûªm".
_APOSTROPHE_RULES = (
  (r"\x89[Ûû]ª", "'"),
)

# Pass 2 - contractions. These must run while the apostrophes are still in the text,
# i.e. before the special-character pass replaces "'" with a space.
_CONTRACTION_RULES = (
  (r"he's", "he is"),
  (r"there's", "there is"),
  (r"We're", "We are"),
  (r"That's", "That is"),
  (r"won't", "will not"),
  (r"they're", "they are"),
  (r"Can't", "Cannot"),
  (r"wasn't", "was not"),
  (r"aren't", "are not"),
  (r"isn't", "is not"),
  (r"What's", "What is"),
  (r"haven't", "have not"),
  (r"hasn't", "has not"),
  (r"There's", "There is"),
  (r"He's", "He is"),
  (r"It's", "It is"),
  (r"You're", "You are"),
  (r"I'M", "I am"),
  (r"shouldn't", "should not"),
  (r"wouldn't", "would not"),
  (r"i'm", "I am"),
  (r"I'm", "I am"),
  (r"Isn't", "is not"),
  (r"Here's", "Here is"),
  (r"you've", "you have"),
  (r"we're", "we are"),
  (r"what's", "what is"),
  (r"couldn't", "could not"),
  (r"we've", "we have"),
  (r"who's", "who is"),
  (r"y'all", "you all"),
  (r"would've", "would have"),
  (r"it'll", "it will"),
  (r"we'll", "we will"),
  (r"We've", "We have"),
  (r"he'll", "he will"),
  (r"Y'all", "You all"),
  (r"Weren't", "Were not"),
  (r"Didn't", "Did not"),
  (r"they'll", "they will"),
  (r"they'd", "they would"),
  (r"DON'T", "DO NOT"),
  (r"they've", "they have"),
  (r"i'd", "I would"),
  (r"should've", "should have"),
  (r"where's", "where is"),
  (r"we'd", "we would"),
  (r"i'll", "I will"),
  (r"weren't", "were not"),
  (r"They're", "They are"),
  (r"let's", "let us"),
  (r"it's", "it is"),
  (r"can't", "can not"),
  (r"don't", "do not"),
  (r"you're", "you are"),
  (r"i've", "I have"),
  (r"that's", "that is"),
  (r"doesn't", "does not"),
  (r"didn't", "did not"),
  (r"ain't", "am not"),
  (r"you'll", "you will"),
  (r"I've", "I have"),
  (r"Don't", "do not"),
  (r"I'll", "I will"),
  (r"I'd", "I would"),
  (r"Let's", "Let us"),
  (r"you'd", "You would"),
  (r"Ain't", "am not"),
  (r"Haven't", "Have not"),
  (r"Could've", "Could have"),
  (r"donå«t", "do not"),
)

# Pass 3 - acronyms, glued-together words and apostrophe-less contractions. Short tokens
# go through _standalone() so they are rewritten only when they stand alone and not when
# they occur inside a longer word ("pm" in "shipment", "nan" in "finance", "rly" in
# "early", "RT" in "ALERT", "Im" in "Image").
_WORD_RULES = (
  (r"Typhoon-Devastated", "typhoon devastated"),
  (r"TyphoonDevastated", "typhoon devastated"),
  (r"typhoondevastated", "typhoon devastated"),
  (r"MH370", "Malaysia Airlines Flight"),
  (_standalone(r"MH"), "Malaysia Airlines Flight"),
  (r"mh370", "Malaysia Airlines Flight"),
  (r"year-old", "years old"),
  (r"yearold", "years old"),
  (r"yr old", "years old"),
  (_standalone(r"PKK"), "Kurdistan Workers Party"),
  (_standalone(r"MP"), "madhya pradesh"),
  (_standalone(r"rly"), "railway"),
  (_standalone(r"CDT"), "Central Daylight Time"),
  (r"sensorsenso", "sensor senso"),
  (_standalone(r"pm"), ""),
  (_standalone(r"PM"), ""),
  (_standalone(r"nan"), " "),
  (r"terrorismturn", "terrorism turn"),
  (r"epicente", "epicenter"),
  (r"epicenterr", "epicenter"),  # undoes the doubled "r" the previous rule adds to "epicenter"
  (r"WAwildfire", "Washington Wildfire"),
  (r"prebreak", "pre break"),
  (r"nowplaying", "now playing"),
  (_standalone(r"RT"), "retweet"),
  (r"EbolaOutbreak", "Ebola Outbreak"),
  (r"LondonFire", "London Fire"),
  (r"IDFire", "Idaho Fire"),
  (r"withBioterrorism&use", "with Bioterrorism & use"),
  (r"NASAHurricane", "NASA Hurricane"),
  (r"withweapons", "with weapons"),
  (r"NuclearPower", "Nuclear Power"),
  (r"WhiteTerrorism", "White Terrorism"),
  (r"MyanmarFlood", "Myanmar Flood"),
  (r"ExtremeWeather", "Extreme Weather"),
  (r"16yr", "16 year"),
  (_standalone(r"Im"), "I am"),
  (_standalone(r"cant"), "can not"),
  (_standalone(r"dont"), "do not"),
  (_standalone(r"youve"), "you have"),
)

# Pass 4 - special characters. Word-specific repairs come first; only then are the
# generic mis-decoded sequences removed, so the generic rules cannot eat the prefix a
# specific rule needs. A "\x89Û" / "\x89û" marker is replaced together with its optional
# trailing symbol by one space, which leaves no stray symbol behind and never glues two
# words together. The apostrophe is stripped last, after the contractions were expanded.
_SPECIAL_CHARACTER_RULES = (
  (r"fromåÊwounds", "from wounds"),
  (r"JapÌ_n", "Japan"),
  (r"SuruÌ¤", "Suruc"),
  (r"å£3million", "3 million"),
  (r"Ì©", "e"),
  (r"\x89[Ûû][_ÒòÓÏ÷\x9d¢]?", " "),
  (r"å[_ÊÈ¨ÇÀ]", ""),
  (r"%20", " "),
  (r"[%@#']", " "),
)

# Compiled once at definition time; cleaner() only walks this list.
_CLEANER_RULES = tuple(
  (re.compile(pattern), replacement)
  for rule_table in (_APOSTROPHE_RULES, _CONTRACTION_RULES, _WORD_RULES, _SPECIAL_CHARACTER_RULES)
  for pattern, replacement in rule_table
)


def cleaner(tweet):
  """Expand contractions and acronyms and strip special characters from one tweet.

  The rewrite rules are applied in four passes, each rule over the whole string:
  mis-decoded apostrophes -> contractions -> acronyms / glued words -> special
  characters. The order matters: contractions can only be recognised while their
  apostrophe is still present, and word-specific character repairs must run before
  the generic removal of mis-decoded sequences.

  Args:
    tweet: the tweet text as a ``str``.

  Returns:
    The rewritten text. Runs of spaces produced by the replacements are not collapsed
    here.
  """
  for pattern, replacement in _CLEANER_RULES:
    tweet = pattern.sub(replacement, tweet)
  return tweet

In [None]:
df['text'] = df['text'].apply(lambda s : cleaner(s))

In [None]:
df_test['text'] = df_test['text'].apply(lambda s : cleaner(s))

### Removing punctuations:

At this point only a few of the cleaned tweets still contain symbols and punctuation. As they don't add key information to the message, we will get rid of them; the function that applies this step is called remove_punct:


In [None]:
def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Only the ASCII punctuation characters of ``string.punctuation`` are dropped; every
    other character, including non-ASCII symbols, is kept in its original order.
    """
    return ''.join(char for char in text if char not in string.punctuation)

example="I am a #king"
print(remove_punct(example))

In [None]:
df['text']=df['text'].apply(lambda x : remove_punct(x))

In [None]:
df_test['text']=df_test['text'].apply(lambda x : remove_punct(x))

### Removing multiple spaces:

Now, some of the cleaned sentences contain different kinds of extra whitespace. Obviously these don't add anything to the corpus, so we will get rid of them with the following lines:

In [None]:
# Turn em/en dashes into spaces first, then collapse every run of whitespace (spaces,
# tabs, newlines, non-breaking spaces) into a single space. Doing it in this order means
# the dash replacement cannot reintroduce double spaces, and the \s+ pattern handles
# runs of any length rather than only the fixed widths 2, 3 and 5.
df['text'] = (
    df['text']
    .str.replace('[—–]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

In [None]:
# Turn em/en dashes into spaces first, then collapse every run of whitespace (spaces,
# tabs, newlines, non-breaking spaces) into a single space. Doing it in this order means
# the dash replacement cannot reintroduce double spaces, and the \s+ pattern handles
# runs of any length rather than only the fixed widths 2, 3 and 5.
df_test['text'] = (
    df_test['text']
    .str.replace('[—–]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

## Analysis of common words per class:

As we will select the most common words from each topic we have to make sure to avoid selecting the denominated 'stop words' because these will not be relevant in this case, this is why we will import them by downloading from nltk tool and english dictionary:

In [None]:
nltk.download("stopwords")

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = set(stopwords.words("english"))

In the following step we will plot the top most common words in each set we have, disaster tweets, non-disaster tweets and testing tweets:

In [None]:
from collections import defaultdict,Counter

In [None]:
# The 100 most frequent whitespace-separated tokens across all disaster tweets.
disaster_corpus = " ".join(df.loc[df['target'] == 1, 'text'])
word_count = Counter(disaster_corpus.split()).most_common(100)

# Drop stop words (compared case-insensitively); x holds the words, y their counts.
non_stop = [(word, count) for word, count in word_count if word.casefold() not in stop_words]
x = [word for word, _ in non_stop]
y = [count for _, count in non_stop]

sns.barplot(x=y[:20], y=x[:20])
plt.title('20 most common words in Disaster tweets')

In [None]:
# The 100 most frequent whitespace-separated tokens across all non-disaster tweets.
nondisaster_corpus = " ".join(df.loc[df['target'] == 0, 'text'])
word_count = Counter(nondisaster_corpus.split()).most_common(100)

# Drop stop words (compared case-insensitively); x holds the words, y their counts.
non_stop = [(word, count) for word, count in word_count if word.casefold() not in stop_words]
x = [word for word, _ in non_stop]
y = [count for _, count in non_stop]

sns.barplot(x=y[:20], y=x[:20])
plt.title('20 most common words in Non-disaster tweets')

In [None]:
# The 100 most frequent whitespace-separated tokens across all test tweets.
word_count = Counter(" ".join(df_test['text']).split()).most_common(100)

# Drop stop words (compared case-insensitively); x holds the words, y their counts.
x=[]
y=[]
for word,count in word_count:
    if (word.casefold() not in stop_words) :
        x.append(word)
        y.append(count)

# Chart the same number of words as the two training-set charts (20) and derive the
# number in the title from the bars actually drawn, so title and plot cannot disagree.
top_words = x[:20]
top_counts = y[:20]
sns.barplot(x=top_counts,y=top_words)
plt.title(f'{len(top_words)} most common words in Test tweets')

Now let's compute the N-grams in each of the sets already mentioned; the following function, generate_ngrams, will help us with the process:

In [None]:
# Define ngram generator function
def generate_ngrams(text, n_gram):
    """Return the n-grams of ``text`` as a list of space-joined strings.

    ``text`` is lower-cased and split on single spaces; empty pieces and members of the
    module-level ``stop_words`` collection are discarded before the n-grams are formed,
    so an n-gram can bridge words that a removed stop word used to separate.

    Args:
        text: the sentence to process.
        n_gram: how many consecutive words make up one n-gram.

    Returns:
        One string per n-gram, in order of appearance.
    """
    words = [piece for piece in text.lower().split(' ') if piece and piece not in stop_words]
    # n_gram views of the word list, each shifted one position further to the right;
    # zipping them lines up every run of n_gram consecutive words.
    shifted_views = [words[offset:] for offset in range(n_gram)]
    return [' '.join(gram) for gram in zip(*shifted_views)]

The number of N-grams to compute:

In [None]:
N=50

### Bi-grams:

Bigrams analysis of test tweets:

In [None]:
# Bigrams
# Count how often every bigram occurs across the test tweets.
testing_bigrams = defaultdict(int)
for tweet in df_test['text']:
    for bigram in generate_ngrams(tweet, n_gram=2):
        testing_bigrams[bigram] += 1

# Most frequent first: a stable ascending sort that is then reversed, so bigrams with
# equal counts come out in reverse order of first appearance.
ranked_bigrams = sorted(testing_bigrams.items(), key=lambda pair: pair[1])
ranked_bigrams.reverse()
df_testing_bigrams = pd.DataFrame(ranked_bigrams)

In [None]:
fig_dims = (25, 30)
fig, ax3 = plt.subplots(figsize=fig_dims)

# Slice with N instead of a literal 50 so the number of bars always matches the count
# announced in the title. Column 0 holds the bigram, column 1 its frequency.
sns.barplot(y=df_testing_bigrams[0].values[:N], x=df_testing_bigrams[1].values[:N], color='y', ax=ax3)
ax3.tick_params(axis='x', labelsize=13)
ax3.tick_params(axis='y', labelsize=13)

ax3.set_title(f'Top {N} most common bigrams in testing tweets', fontsize=15)

plt.show()

Bigrams analysis of disaster and non-disaster tweets:

In [None]:
# Bigrams
disaster_bigrams = defaultdict(int)
nondisaster_bigrams = defaultdict(int)

for instance in df[df['target']==1]['text']:
    for word in generate_ngrams(instance, n_gram=2):
        disaster_bigrams[word] += 1

for instance in df[df['target']==0]['text']:
    for word in generate_ngrams(instance, n_gram=2):
        nondisaster_bigrams[word] += 1 
   
df_disaster_bigrams = pd.DataFrame(sorted(disaster_bigrams.items(), key=lambda x: x[1])[::-1])
df_nondisaster_bigrams = pd.DataFrame(sorted(nondisaster_bigrams.items(), key=lambda x: x[1])[::-1])

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(25,30), dpi=80)

# Left panel: disaster tweets. Column 0 holds the bigram, column 1 its frequency.
sns.barplot(y=df_disaster_bigrams[0].values[:N], x=df_disaster_bigrams[1].values[:N], ax=ax1, color='r')
ax1.spines['right'].set_visible(False)
ax1.tick_params(axis='x', labelsize=13)
ax1.tick_params(axis='y', labelsize=13)

# Right panel: non-disaster tweets.
sns.barplot(y=df_nondisaster_bigrams[0].values[:N], x=df_nondisaster_bigrams[1].values[:N], ax=ax2, color='b')
ax2.spines['right'].set_visible(False)
ax2.tick_params(axis='x', labelsize=13)
ax2.tick_params(axis='y', labelsize=13)

ax1.set_title(f'Top {N} most common bigrams in Disaster tweets', fontsize=15)
ax2.set_title(f'Top {N} most common bigrams in Non-disaster tweets', fontsize=15)

# Lay the panels out once, after all labels exist and before the figure is shown;
# a tight_layout() call after plt.show() no longer affects the rendered figure.
fig.tight_layout()
plt.show()

### Tri-grams:

Trigrams analysis of test tweets:

In [None]:
# Trigrams
testing_trigrams = defaultdict(int)

for instance in df_test['text']:
    for word in generate_ngrams(instance, n_gram=3):
        testing_trigrams[word] += 1
   
df_testing_trigrams = pd.DataFrame(sorted(testing_trigrams.items(), key=lambda x: x[1])[::-1])

In [None]:
fig_dims = (25, 30)
fig, ax3 = plt.subplots(figsize=fig_dims)

# Slice with N instead of a literal 50 so the number of bars always matches the count
# announced in the title. Column 0 holds the trigram, column 1 its frequency.
sns.barplot(y=df_testing_trigrams[0].values[:N], x=df_testing_trigrams[1].values[:N], color='y', ax=ax3)
ax3.tick_params(axis='x', labelsize=13)
ax3.tick_params(axis='y', labelsize=13)

ax3.set_title(f'Top {N} most common trigrams in testing tweets', fontsize=15)

plt.show()

Trigrams analysis of disaster and non-disaster tweets:

In [None]:
# Trigrams
# NOTE: the *_bigrams names are reused on purpose - the plotting cell that follows reads
# df_disaster_bigrams / df_nondisaster_bigrams. After this cell they hold TRIGRAM
# counts and the earlier bigram results are overwritten.
def _count_and_rank(texts, n_gram):
    """Count the n-grams of ``texts``; return the counter and its items, most frequent first."""
    counts = defaultdict(int)
    for text in texts:
        for ngram in generate_ngrams(text, n_gram=n_gram):
            counts[ngram] += 1
    # Stable ascending sort, then reversed: ties come out in reverse order of first appearance.
    ranked = sorted(counts.items(), key=lambda pair: pair[1])
    ranked.reverse()
    return counts, ranked

disaster_bigrams, disaster_ranked = _count_and_rank(df.loc[df['target'] == 1, 'text'], 3)
nondisaster_bigrams, nondisaster_ranked = _count_and_rank(df.loc[df['target'] == 0, 'text'], 3)

df_disaster_bigrams = pd.DataFrame(disaster_ranked)
df_nondisaster_bigrams = pd.DataFrame(nondisaster_ranked)

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(25,20), dpi=80)

# Left panel: disaster tweets. Column 0 holds the n-gram, column 1 its frequency.
sns.barplot(y=df_disaster_bigrams[0].values[:N], x=df_disaster_bigrams[1].values[:N], ax=ax1, color='r')
ax1.spines['right'].set_visible(False)
ax1.tick_params(axis='x', labelsize=13)
ax1.tick_params(axis='y', labelsize=13)

# Right panel: non-disaster tweets.
sns.barplot(y=df_nondisaster_bigrams[0].values[:N], x=df_nondisaster_bigrams[1].values[:N], ax=ax2, color='b')
ax2.spines['right'].set_visible(False)
ax2.tick_params(axis='x', labelsize=13)
ax2.tick_params(axis='y', labelsize=13)

ax1.set_title(f'Top {N} most common trigrams in Disaster tweets', fontsize=15)
ax2.set_title(f'Top {N} most common trigrams in Non-disaster tweets', fontsize=15)

# Lay the panels out once, after all labels exist and before the figure is shown;
# a tight_layout() call after plt.show() no longer affects the rendered figure.
fig.tight_layout()
plt.show()

## Tokenizing

In [None]:
df[['text','target']]

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Create a new column in train file containing instances without stopwords:

In [None]:
# Drop stop words from every tweet. The stop-word set is lower-case, so each word is
# case-folded for the membership test (as in the word-frequency cells above); without
# it, capitalised stop words such as "The" or "I" would be kept.
df['text_without_stopwords'] = df['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word.casefold() not in stop_words])
)

In [None]:
df[['text','text_without_stopwords']]

Create a new column in test file containing instances without stopwords:

In [None]:
# Drop stop words from every tweet. The stop-word set is lower-case, so each word is
# case-folded for the membership test (as in the word-frequency cells above); without
# it, capitalised stop words such as "The" or "I" would be kept.
df_test['text_without_stopwords'] = df_test['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word.casefold() not in stop_words])
)

In [None]:
df_test[['text','text_without_stopwords']]

### Train-test split:
As we have only 7613 instances in our train dataframe we have to keep a reasonable proportion for validation set because it will not contain a huge  number of instances, 80%-20% would be nice.

In [None]:
training_portion=0.80

In [None]:
train_size = int(df.shape[0]*training_portion)

# Shuffle before splitting and stratify on the label, so that both parts have the same
# class balance and the split does not depend on the order of the rows in the CSV
# (a plain head/tail slice silently assumes that order is random). random_state makes
# the split reproducible. The index is reset so each part is labelled 0..len-1.
train_sentences, validation_sentences, train_labels, validation_labels = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df['text_without_stopwords'],
        df['target'],
        train_size=train_size,
        stratify=df['target'],
        random_state=42,
    )
)

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

Let us tokenize all words in our dataset so as to analyze and get an appropriate vocab_size and max_length:

In [None]:
tokenizer0 = Tokenizer()
tokenizer0.fit_on_texts(df['text'])
word_index = tokenizer0.word_index
len(word_index)

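There are 16871 unique words in the cleaned tweets. Note that the tokenizer above is fitted on `df['text']`, which still contains the stop words, so this count includes them; counting over the `text_without_stopwords` column would give a somewhat lower number.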

Let's set the vocab size to 16000 so as not to slow down too much the model training later:

In [None]:
vocab_size = 16000
oov_tok = '<OOV>'

In [None]:
# Fit the vocabulary on the training sentences only - the same stop-word-free text
# that is later converted to sequences. Fitting on df['text'] would (a) let validation
# tweets shape the vocabulary and (b) spend the most frequent of the `vocab_size` slots
# on stop words that never occur in the model's input.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

Now, we can compute the length of each instance tokenized of the training set and see its distribution in order to choose maximum length which does not drop significant information at the end of the sequence:

In [None]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [None]:
# Number of tokens in each tokenized training tweet, in the order of train_sequences.
lengths = [len(sequence) for sequence in train_sequences]

In [None]:
pd.DataFrame(lengths, columns=['Lenghts']).describe()

In [None]:
plt.hist(lengths, bins=27, alpha=0.5)
plt.show()

Let us set such argument to 20, truncation type 'post' and padding type 'post' as follows:

In [None]:
max_length = 20
trunc_type = 'post'
padding_type = 'post'

In [None]:
train_padded = pad_sequences(train_sequences,maxlen=max_length,padding=padding_type,truncating=trunc_type)

We can print the length of the sequences before and after padding, let's see their corresponding for 3 instances of training set:

In [None]:
print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

The same process we just did to training set we have to do in validation set, but this will be done using the tokenizer function fitted in training:

In [None]:
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences,padding=padding_type,maxlen=max_length,truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

As training and validation sets are ready to be used in the training of the model, it's the right moment to do exactly the same for the instances in test set in order to have them properly processed to predict their classes.

In [None]:
test_sequences = tokenizer.texts_to_sequences(df_test['text_without_stopwords'])
test_padded = pad_sequences(test_sequences,padding=padding_type,maxlen=max_length,truncating=trunc_type)

print(len(test_sequences))
print(test_padded.shape)

In [None]:
np.unique(train_labels)

## Modeling:

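In this step we are going to build one model from scratch and one on top of pre-trained embeddings, as can be seen below:

- Neural network trained from scratch (embedding, global average pooling and dense layers; the model built below contains no recurrent layer).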
- Pre-trained 100 dimensional Glove.

Before building the models we have to create two constraints or 'callbacks'. The first reduces the learning rate (halving it) whenever the validation accuracy has not increased for one epoch, so that the optimizer takes smaller steps as it approaches a minimum of the loss. The second is early stopping, which stops the training if the validation accuracy has not improved by at least 0.005 for 3 consecutive epochs.

In [None]:
from keras.callbacks import ReduceLROnPlateau
lr_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                 patience=1, 
                                 verbose=1, 
                                 factor=0.5, 
                                 min_lr=0.000001)

from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_accuracy', 
                               min_delta=0.005,
                               patience=3, 
                               verbose=1, 
                               mode='auto')

### Neural network from scratch

In [None]:
embedding_dim = 32
model = tf.keras.Sequential([
              tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
              tf.keras.layers.GlobalAveragePooling1D(),
              tf.keras.layers.Dense(24,activation='relu'),
              tf.keras.layers.Dropout(0.1),
              tf.keras.layers.Dense(1,activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Observation: The callback which reduces the learning rate was useful as the training accuracy increases whenever the lr is reduced, however the validation accuracy stays the same in the last epochs, this is why the early stopping will not be used so as not to limit the performance of the model:

In [None]:
num_epochs = 15
history = model.fit(train_padded,train_labels,epochs=num_epochs,
                    validation_data=(validation_padded,validation_labels),verbose=1,
                    callbacks=[lr_reduction])

The first model took 15 seconds to train reaching accuracies of: 98.3% train/83.1% val.

In [None]:
def plot_metrics(history):
  """Plot training-vs-validation accuracy and loss per epoch, one figure each.

  Every curve pair is drawn on its own freshly created figure, so the plots never land
  on a figure left over from an earlier cell and no empty trailing figure is produced.

  Args:
    history: the object returned by ``model.fit``; its ``history`` dict must contain
      the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
  """
  metrics = history.history
  epochs = range(1, len(metrics['accuracy']) + 1)  # epochs are numbered from 1

  # (training key, validation key, figure title, y-axis label, legend entries)
  panels = (
      ('accuracy', 'val_accuracy', 'Training and validation accuracy',
       "Accuracy", ["Accuracy", "Validation Accuracy"]),
      ('loss', 'val_loss', 'Training and validation loss',
       "Loss", ["Loss", "Validation Loss"]),
  )

  for train_key, val_key, title, y_label, legend_entries in panels:
    _, ax = plt.subplots()
    ax.plot(epochs, metrics[train_key], 'r')
    ax.plot(epochs, metrics[val_key], 'b')
    ax.set_title(title)
    ax.set_xlabel("Epochs")
    ax.set_ylabel(y_label)
    ax.legend(legend_entries)

In [None]:
plot_metrics(history)

In [None]:
predicted_val = model.predict(validation_padded, batch_size=32)

In [None]:
predicted_val[:5]

Once we have these probabilities we have to convert them to discrete values, either 1 or 0; this can be achieved with the np.round function:

In [None]:
class_pred_val= np.round(predicted_val)
class_pred_val[:5]

In [None]:
validation_labels[:5]

Classification report and confusion matrix:

In [None]:
from sklearn.metrics import classification_report

# classification_report takes (y_true, y_pred) in that order. With the arguments
# swapped, precision and recall trade places and "support" counts predictions
# instead of true labels.
print(classification_report(validation_labels, class_pred_val))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones, which is what the "True label" / "Predicted
# label" axes drawn by ConfusionMatrixDisplay assume.
cm_val = confusion_matrix(y_true=validation_labels, y_pred=class_pred_val)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_val,
                              display_labels=np.unique(train_labels))
disp.plot(cmap='Blues')
# Switch off the grid lines on the current axes so they do not cross the cells.
plt.grid(False)

### Pre-trained model:

We will use the pre-trained GloVe word embeddings (100 dimensions) for text classification. After predicting the classes for the validation set, we will compare the results with those obtained from the model previously built from scratch.

As a first step we have to tokenize the sentences again, this time without limiting the vocabulary size (the sequences are still padded/truncated to `max_length`), as follows:

In [None]:
# New tokenizer created with default arguments (no `num_words` cap) and fitted
# on the whole `text` column of `df`.
# NOTE(review): the test sequences further down are built from
# `text_without_stopwords`; confirm that fitting on the raw `text` column is intended.
tokenizer_glove = Tokenizer()
tokenizer_glove.fit_on_texts(df['text'])
# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer_glove.word_index
len(word_index)

In [None]:
# Encode the training sentences with the GloVe tokenizer, then pad/truncate
# every sequence to `max_length`.
train_glove_sequences = tokenizer_glove.texts_to_sequences(train_sentences)
train_glove_padded = pad_sequences(
    train_glove_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

train_glove_padded.shape

In [None]:
# Encode the validation sentences with the GloVe tokenizer, then pad/truncate
# every sequence to `max_length`.
val_glove_sequences = tokenizer_glove.texts_to_sequences(validation_sentences)
val_glove_padded = pad_sequences(
    val_glove_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

val_glove_padded.shape

In [None]:
# Encode the test tweets (stop-word-free column) with the GloVe tokenizer,
# then pad/truncate every sequence to `max_length`.
test_glove_sequences = tokenizer_glove.texts_to_sequences(
    df_test['text_without_stopwords']
)
test_glove_padded = pad_sequences(
    test_glove_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

test_glove_padded.shape

In [None]:
# Number of entries in the GloVe tokenizer's word index.
vocab_size_glove=len(word_index)

Now, as mentioned above, we load the weights of the 100-dimensional GloVe vectors published by Stanford:

In [None]:
# Parse the GloVe text file into a {token: vector} lookup. Each line holds a
# token followed by its coefficients, separated by whitespace.
embeddings_index = {}
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, which can fail on the non-ASCII tokens in the file.
with open('../input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

The following lines create a matrix containing the weights; its shape is (new vocabulary size + 1) by 100, the embedding dimension. It will be loaded as the weights of the first (embedding) layer, which is frozen by setting the argument trainable=False.

In [None]:
# Row i receives the GloVe vector of the token whose tokenizer index is i.
# Rows that are never assigned (tokens absent from GloVe) stay all-zero.
embeddings_matrix = np.zeros((vocab_size_glove + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # token unknown to GloVe: keep the zero row
    embeddings_matrix[i] = embedding_vector

In [None]:
# Number of rows in the embedding matrix (vocab_size_glove + 1).
print(len(embeddings_matrix))

Time now to build the model using the Glove-weights, take into account that we will consider the same layers included in the previous model:

In [None]:
from tensorflow.keras.optimizers import Adam

# Embedding -> SpatialDropout1D -> LSTM -> sigmoid Dense. The embedding layer
# starts from the GloVe matrix and is frozen (trainable=False).
model_glove2 = Sequential([
    Embedding(
        vocab_size_glove + 1,
        100,
        weights=[embeddings_matrix],
        input_length=max_length,
        trainable=False,
    ),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

optimizer = Adam(learning_rate=5e-3)
model_glove2.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)
model_glove2.summary()

Above we can see that this new model contains around 1.7 million parameters. Training all of them would take a long time, but since the embedding weights are pre-loaded and frozen, only 80,501 of them are trainable. In contrast, the scratch model contains over 576 thousand parameters, all of which have to be trained.

Observation: In this model the callback that reduces the learning rate was not useful, as the accuracies did not improve when the learning rate was reduced, which is why it is not used here. The change in batch size also affected the performance:

In [None]:
# Train the GloVe model; no callbacks are passed to this run.
num_epochs = 15
history_glove2 = model_glove2.fit(
    train_glove_padded,
    train_labels,
    batch_size=32,
    epochs=num_epochs,
    validation_data=(val_glove_padded, validation_labels),
    verbose=1,
)

The model took 4 minutes and 33 seconds to train reaching accuracies of: 97.0% train/89.9% val.

In [None]:
# Accuracy and loss curves for the GloVe model's training run.
plot_metrics(history_glove2)

In [None]:
# Score the validation set with the GloVe model.
# NOTE: this reuses the name `predicted_val`, overwriting the scratch model's outputs.
predicted_val = model_glove2.predict(val_glove_padded, batch_size=32)

In [None]:
# Peek at the first five raw outputs of the GloVe model.
predicted_val[:5]

Once we have these probabilities, we have to convert them to discrete values, either 1 or 0; this can be done with the np.round function:

In [None]:
# Round each GloVe-model output to the nearest integer to get the predicted class.
class_pred_val_glove= np.round(predicted_val)
class_pred_val_glove[:5]

In [None]:
# First five validation labels, to compare against the predictions above.
validation_labels[:5]

Classification report and confusion matrix:

In [None]:
# classification_report is already imported in the notebook's first cell.
# scikit-learn expects the ground truth first and the predictions second;
# with the two swapped, precision and recall are reported under each other's
# name. Passing them by keyword makes the roles explicit.
print(classification_report(y_true=validation_labels, y_pred=class_pred_val_glove))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones, which is what the "True label" / "Predicted
# label" axes drawn by ConfusionMatrixDisplay assume.
cm_val_glove = confusion_matrix(y_true=validation_labels, y_pred=class_pred_val_glove)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_val_glove,
                              display_labels=np.unique(train_labels))
disp.plot(cmap='Blues')
# Switch off the grid lines on the current axes so they do not cross the cells.
plt.grid(False)

Predicting classes for test instances:

In [None]:
# Hard 0/1 predictions for the test set: round the GloVe model's outputs and
# cast them to integers.
test_prediction = (
    model_glove2.predict(test_glove_padded)
    .round()
    .astype('int')
)

I would welcome any feedback on how to increase the performance of the models; please also tell me if you have found a different one that works even better!

If you liked this notebook, I would really appreciate your upvote, especially if you want to see more projects/tutorials like this one. I encourage you to take a look at my project portfolio; I'm sure you will love it.

Thank you!