# Disaster or Not Disaster Tweets

## NLP:
* EDA (with WordCloud)
* Bag of Words
* TF IDF
* GloVe
* PCA visualization for the main models
* Showing Confusion Matrices for GloVe

Reference - https://www.kaggle.com/vbmokin/nlp-eda-bag-of-words-tf-idf-glove-bert

## About Data
Each sample in the train and test set has the following information:

* The text of a tweet
* A keyword from that tweet
* The location the tweet was sent from

## Import necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from nltk.corpus import stopwords
from nltk.util import ngrams

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report,confusion_matrix

from collections import defaultdict
from collections import Counter
plt.style.use('ggplot')
stop_words=set(stopwords.words('english'))

import re
from nltk.tokenize import word_tokenize
import gensim
import string

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

import warnings
warnings.simplefilter('ignore')

## Load data

In [None]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
train.head()

In [None]:
test.head()

## Class Distribution
Let's check the class distribution.

In [None]:
sns.countplot(x = 'target', data=train)

So the classes are slightly imbalanced: there are somewhat more Non-Disaster tweets than Disaster tweets.

## Calculate number of words in texts


In [None]:
train['num_words'] = train['text'].apply(lambda x: len(str(x).split()))

In [None]:
train.head()

## Plot number of words to check

In [None]:
sns.distplot(x=train[train['target'] == 0]['num_words'], label='Not Disaster')
sns.distplot(x=train[train['target'] == 1]['num_words'], label='Disaster')
plt.legend()

So here we can see that the word counts of both categories follow a roughly normal (bell-shaped) distribution.

## Let's calculate the average word length

In [None]:
train['avg_len_words'] = train['text'].apply(lambda x: str(x).split())
train['avg_len_words'] = train['avg_len_words'].apply(lambda x: [len(word) for word in x])

In [None]:
train['avg_len_words'] = train['avg_len_words'].apply(lambda x: np.mean(x))

In [None]:
sns.distplot(x=train[train['target'] == 0]['avg_len_words'], label='Not Disaster')
sns.distplot(x=train[train['target'] == 1]['avg_len_words'], label='Disaster')
plt.legend()

## Number of characters in the tweet

In [None]:
train['length'] = train['text'].apply(lambda x: len(x))

In [None]:
plt.figure(figsize=(15, 6))
sns.distplot(x=train[train['target'] == 1]['length'], label='Disaster')
sns.distplot(x=train[train['target'] == 0]['length'], label='Not Disaster')
plt.legend()

The distributions of both classes seem to be almost the same. Tweets of 120 to 140 characters are the most common in both.

## Create function for Disaster and Non-Disaster tweet words

In [None]:
def create_corpus(target):
    """Collect every whitespace-separated token from the tweets of one class.

    Reads the module-level ``train`` DataFrame, keeps the rows whose
    ``target`` column equals *target*, and returns the tokens of their
    ``text`` column as one flat list (tweet order and duplicates are kept).
    """
    tweets = train[train['target'] == target]['text']
    return [token for tokens in tweets.str.split() for token in tokens]

### First we will analyze tweets with class 0

In [None]:
corpus = create_corpus(0)

# Tally how often each English stop word occurs among the class-0 tokens.
dic = defaultdict(int)
for stop_word in (token for token in corpus if token in stop_words):
    dic[stop_word] += 1

# Ten most frequent stop words, highest count first.
top = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)[:10]

### Common stopwords

In [None]:
# stop_words is a set; np.array(<set>) would wrap the whole set in a single
# 0-d object array, so sort it into a list first to get an array of words.
np.array(sorted(stop_words))

In [None]:
plt.figure(figsize=(10, 5))
x, y = zip(*top)
plt.bar(x, y, color='green')
plt.title('Top 10 stop words in Non-Disaster Tweets', fontsize=20)

## Now create corpus for class 1

In [None]:
corpus1 = create_corpus(1)

# Tally how often each English stop word occurs among the class-1 tokens.
dic1 = defaultdict(int)
for stop_word in (token for token in corpus1 if token in stop_words):
    dic1[stop_word] += 1

# Ten most frequent stop words, highest count first.
top1 = sorted(dic1.items(), key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
plt.figure(figsize=(10, 5))
x, y = zip(*top1)
plt.bar(x, y, color='red')
plt.title('Top 10 stop words in Disaster Tweets', fontsize=20)

In both of them, "the" dominates, followed by "a" in class 0 and "in" in class 1.

## Analyzing Punctuations

### for class 0

In [None]:
# Count the class-0 tokens that are a single punctuation character.
# Membership is tested against a set of characters: testing against the
# string.punctuation string itself is a substring test and would also accept
# multi-character tokens such as "()".
punctuation_marks = set(string.punctuation)

dic = defaultdict(int)
for word in corpus:
    if word in punctuation_marks:
        dic[word] += 1

# Most frequent punctuation mark first.
top = sorted(dic.items(), key = lambda x: x[1], reverse=True)

plt.figure(figsize=(10, 5))
x, y = zip(*top)
plt.bar(x, y, color='green')
plt.title('Punctuation in Non Disaster Tweets', fontsize=20)

### for class 1

In [None]:
# Count the class-1 tokens that are a single punctuation character.
# Membership is tested against a set of characters: testing against the
# string.punctuation string itself is a substring test and would also accept
# multi-character tokens such as "()".
punctuation_marks = set(string.punctuation)

dic = defaultdict(int)
for word in corpus1:
    if word in punctuation_marks:
        dic[word] += 1

# Most frequent punctuation mark first.
top = sorted(dic.items(), key = lambda x: x[1], reverse=True)

plt.figure(figsize=(10, 5))
x, y = zip(*top)
plt.bar(x, y, color='red')
plt.title('Punctuation in Disaster Tweets', fontsize=20)

## Common words in tweets

In [None]:
counter = Counter(corpus)
common_words = counter.most_common()

# Among the 40 most frequent tokens keep those that are neither stop words
# nor found in string.punctuation, preserving the frequency order.
kept = [
    (word, count)
    for word, count in common_words[:40]
    if not (word in stop_words or word in string.punctuation)
]
x = [word for word, _ in kept]
y = [count for _, count in kept]

plt.figure(figsize=(8, 8))
sns.barplot(x=y, y=x)
plt.title('Common words in Non Disaster tweets', fontsize=20)

A lot of cleaning is needed!

### Let's first clean the tweets

In [None]:
df = pd.concat([train, test])

In [None]:
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

In [None]:
def remove_urls(text):
    '''Strip URLs from a tweet.

    Anything starting with ``http://``, ``https://`` or ``www.`` is removed
    up to the next whitespace character; the rest of the text is untouched.
    '''
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
remove_urls(example)

In [None]:
df['text'] = df['text'].apply(lambda x: remove_urls(x))

In [None]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [None]:
def remove_html(text):
    '''Strip HTML tags from a tweet.

    Every ``<...>`` span (shortest match, not crossing a line break) is
    deleted; the text between the tags is kept as it is.
    '''
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)

print(remove_html(example))

In [None]:
df['text'] = df['text'].apply(lambda x: remove_html(x))

In [None]:
def remove_emojis(text):
    '''Removes emojis from the tweets.

    Only code points from symbol/pictograph blocks are stripped. The
    character class deliberately avoids one huge catch-all range (such as
    U+24C2..U+1F251), because a range like that also contains CJK
    ideographs, Hangul, fullwidth forms and many other scripts, and would
    delete ordinary non-English text along with the emojis.

    The list of blocks is not an exhaustive emoji definition.

    Returns the text with every run of matching characters removed.
    '''
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags, enclosed ideographs
                           u"\U00002600-\U000026FF"  # miscellaneous symbols
                           u"\U00002702-\U000027B0"  # dingbats
                           u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                           u"\U000024C2"             # circled latin capital letter M
                           u"\U0000FE0F"             # emoji variation selector
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emojis('Omg another Earthquake 😔😔')

In [None]:
df['text'] = df['text'].apply(lambda x: remove_emojis(x))

In [None]:
def remove_punct(text):
    '''Delete every ASCII punctuation character from a tweet.

    The characters removed are exactly those in ``string.punctuation``;
    letters, digits and whitespace are left untouched.
    '''
    # Mapping a code point to None tells str.translate to drop that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_punctuation)

example="I am ,.a #king"
print(remove_punct(example))

In [None]:
df['text'] = df['text'].apply(lambda x: remove_punct(x))

## N-grams analysis

In [None]:
def get_top_ngrams(corpus, n_grams, n=None):
    """Return the most frequent n-grams of a collection of texts.

    A ``CountVectorizer`` restricted to n-grams of exactly *n_grams* words is
    fitted on *corpus*, and the occurrences of every n-gram are summed over
    all texts.

    Returns a list of ``(ngram, total_count)`` tuples sorted by descending
    count, cut to the first *n* entries (all entries when *n* is None).
    """
    vectorizer = CountVectorizer(ngram_range=(n_grams, n_grams))
    vectorizer.fit(corpus)
    # Column sums of the document-term matrix give one total per n-gram.
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = [
        (ngram, totals[0, column])
        for ngram, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

### Bigram analysis

In [None]:
plt.figure(figsize=(10, 10))

top_tweet_ngrams = get_top_ngrams(df['text'], 2, 10)
x, y = map(list, zip(*top_tweet_ngrams))
sns.barplot(x=y, y=x)

### Let's see tri-gram analysis for disaster and non-disaster tweets

In [None]:
plt.figure(figsize=(10, 10))

top_tweet_ngrams = get_top_ngrams(df[df['target'] == 0]['text'], 3, 10)
x, y = map(list, zip(*top_tweet_ngrams))
sns.barplot(x=y, y=x)

In [None]:
plt.figure(figsize=(10, 10))

top_tweet_ngrams = get_top_ngrams(df[df['target'] == 1]['text'], 3, 10)
x, y = map(list, zip(*top_tweet_ngrams))
sns.barplot(x=y, y=x)

Now we can see the clear difference in words between disaster and non-disaster tweets.

## WordCloud

In [None]:
from wordcloud import WordCloud

In [None]:
def create_corpus_df(df, target):
    """Collect every whitespace-separated token from the tweets of one class.

    Keeps the rows of *df* whose ``target`` column equals *target* and
    returns the tokens of their ``text`` column as one flat list (row order
    and duplicates are kept). Rows whose target is NaN never match.
    """
    tweets = df[df['target'] == target]['text']
    return [token for tokens in tweets.str.split() for token in tokens]

## Non-Disaster Tweets

In [None]:
corpus_new0 = create_corpus_df(df, 0)
len(corpus_new0)

In [None]:
corpus_new0[:10]

In [None]:

# Generate the word cloud from the first 50 tokens of the non-disaster corpus
plt.figure(figsize=(12, 8))
wordcloud = WordCloud(background_color='black', max_font_size=80).generate(" ".join(corpus_new0[:50]))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

### Disaster tweets

In [None]:
corpus_new1 = create_corpus_df(df, 1)

In [None]:
corpus_new1[:10]

In [None]:
plt.figure(figsize=(12, 8))
wordcloud = WordCloud(background_color='black', max_font_size=80).generate(" ".join(corpus_new1[:50]))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
df.head(10)

## Bag of Words counts

In [None]:
def cv(data):
    """Fit a bag-of-words vectorizer on *data* and vectorize it.

    Returns a tuple ``(counts, vectorizer)``: the matrix produced by
    ``CountVectorizer.fit_transform(data)`` and the fitted vectorizer, which
    can be reused to transform other texts with the same vocabulary.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(data)
    return counts, vectorizer

# df is the concatenation of train and test. Rows that came from the test
# file have no label (NaN in 'target'), so they must be left out of a
# supervised split and of the label-coloured plots built from it.
labeled = df[df['target'].notna()]
list_corpus = labeled['text'].tolist()
list_labels = labeled['target'].astype(int).tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2,
                                                    random_state=1)

# Fit the vocabulary on the training part only, then reuse it for the held-out part.
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

## Visualizing the Embeddings

In [None]:
def plot_LSA(test_data, test_labels, savepath='PCA_demp.csv', plot=True):
    """Scatter-plot the data projected onto its first two LSA components.

    A two-component ``TruncatedSVD`` is fitted on *test_data* and the
    projected points are drawn on the current matplotlib figure, coloured by
    *test_labels*.

    Parameters
    ----------
    test_data : matrix accepted by ``TruncatedSVD`` (one row per sample).
    test_labels : one numeric label per row, passed directly as the colour
        value of each point. With the 0/1 labels used in this file, 0 is
        drawn orange ("Non Disaster") and 1 blue ("Disaster").
    savepath : not used by the function; kept so existing calls stay valid.
    plot : when False, nothing is drawn.

    Returns
    -------
    None
    """
    lsa = TruncatedSVD(n_components=2)
    lsa.fit(test_data)
    lsa_scores = lsa.transform(test_data)
    if not plot:
        return

    colors = ['orange', 'blue']
    plt.scatter(lsa_scores[:, 0], lsa_scores[:, 1], s=8, alpha=0.8, c=test_labels, cmap=matplotlib.colors.ListedColormap(colors))
    # Legend patches mirror the two colormap entries above.
    orange_patch = mpatches.Patch(color='orange', label='Non Disaster')
    blue_patch = mpatches.Patch(color='blue', label='Disaster')
    plt.legend(handles=[orange_patch, blue_patch], prop={'size': 30})
        
fig = plt.figure(figsize=(16, 16))
plot_LSA(X_train_counts, y_train)
plt.show()

## TF IDF vectorizer

In [None]:
def tfidf(data):
    """Fit a TF-IDF vectorizer on *data* and vectorize it.

    Returns a tuple ``(weights, vectorizer)``: the matrix produced by
    ``TfidfVectorizer.fit_transform(data)`` and the fitted vectorizer, which
    can be reused to transform other texts with the same vocabulary.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(data)
    return weights, vectorizer

X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
fig = plt.figure(figsize=(16,16))
plot_LSA(X_train_tfidf, y_train)
plt.show()

## GloVe Model
GloVe method is built on an important idea,
  
  **"You can derive semantic relationships between words   from the co-occurrence matrix."**
  
Given a corpus having V words, the co-occurrence matrix X will be a V x V matrix, where the entry in the i-th row and j-th column of X, X_ij, denotes how many times word i has co-occurred with word j. An example co-occurrence matrix might look as follows.

![Glove-matrix](https://miro.medium.com/max/347/1*QWcK8CIDs8kMkOwsOxvywA.png)

Read more about GloVe - https://towardsdatascience.com/light-on-math-ml-intuitive-guide-to-understanding-glove-embeddings-b13b4f19c010

Here we will use a pretrained GloVe model to represent our words. It is available in 3 varieties: 50D, 100D and 200D (dimensional). We will try 100D here.

In [None]:
def create_corpus_new(df):
    """Tokenize every tweet of *df* into lower-cased tokens.

    Each entry of ``df['text']`` is split with ``word_tokenize`` and every
    token is lower-cased. Returns one token list per tweet, in row order.
    A tqdm progress bar is shown while iterating.
    """
    return [
        [token.lower() for token in word_tokenize(text)]
        for text in tqdm(df['text'])
    ]

In [None]:
corpus = create_corpus_new(df)

### Create embedding dictionary to store vectors

In [None]:
# Map every GloVe token to its vector. Each line of the file is the token
# followed by its space-separated vector components.
embedding_dict={}
# The encoding is given explicitly because the file is UTF-8 and contains
# non-ASCII tokens; the `with` block closes the file, so no f.close() is needed.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt','r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

## Tokenize the text using Tokenizer()

In [None]:
Max_len = 50
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

tweet_pad = pad_sequences(sequences, maxlen=Max_len, truncating='post', padding='post')

In [None]:
word_index = tokenizer_obj.word_index
print('Number of unique words : ', len(word_index))

## Create embedding matrix of words

In [None]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer index
# is i. One extra row is allocated because the indices in word_index start at
# 1; row 0 (the value pad_sequences pads with) and rows of words missing from
# GloVe stay all-zero.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

# Every index in word_index is below num_words by construction, so no bound
# check is needed before writing the row.
for word, i in tqdm(word_index.items()):
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

In [None]:
tweet_pad[0][0:]

## Baseline Model with GloVe results

In [None]:
# Baseline classifier: GloVe embedding layer -> spatial dropout -> LSTM -> sigmoid output.
model = Sequential()

# The embedding weights are initialised from embedding_matrix and frozen
# (trainable=False), so the pretrained vectors are not updated during fit.
embedding = Embedding(num_words, 100, embeddings_initializer=Constant(embedding_matrix), input_length=Max_len, trainable=False)
model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# A single sigmoid unit: one score in [0, 1] per tweet for the binary target.
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=3e-4)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
model.summary()

## Split the data into train and test

In [None]:
train1 = tweet_pad[:train.shape[0]]
test1 = tweet_pad[train.shape[0]:]

In [None]:
# Hold out 20% of the labelled tweets for validation. A fixed seed keeps the
# split identical between runs, like the bag-of-words split earlier in the file.
X_train, X_test, y_train, y_test = train_test_split(train1, train['target'].values, test_size=0.2,
                                                    random_state=1)
print('Shape of train', X_train.shape)
print('Shape of validation', X_test.shape)

In [None]:
train.head()

In [None]:
fig = plt.figure(figsize=(16, 16))
plot_LSA(train1, train['target'])
plt.show()

## Fit the model

In [None]:
history = model.fit(X_train, y_train, batch_size=4, epochs=10,
                    validation_data=(X_test, y_test), verbose=2)

In [None]:
train_pred_Glove = model.predict(train1)
train_pred_Glove_int = train_pred_Glove.round().astype('int')

## Plot Confusion matrix

In [None]:
def plot_cm(y_true, y_pred, title, figsize=(5, 5)):
    """Draw a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_true : ground-truth labels; its unique values define the rows and
        columns of the matrix.
    y_pred : predicted labels, one per entry of *y_true*.
    title : title of the plot.
    figsize : size of the created matplotlib figure.

    Each cell is annotated with the share of its row (the actual class) in
    percent and the raw count; diagonal cells also show the row total as
    ``count/total``. Off-diagonal cells with a zero count are left blank.

    Returns
    -------
    None
    """
    # Computed once and reused for the matrix and for both axes.
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    # Object dtype: a fixed-width string array could silently truncate long annotations.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Index down to a scalar; formatting a 1-element array with %d
                # relies on a deprecated array-to-scalar conversion.
                s = cm_sum[i, 0]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)

    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    plt.title(title)
    sns.heatmap(cm, cmap='YlGnBu', annot=annot, fmt='', ax=ax)

In [None]:
# plot_cm expects (y_true, y_pred): ground-truth labels first, predictions second.
# The predictions come from a Dense(1) output, so flatten them to 1-D.
plot_cm(train['target'].values, train_pred_Glove_int.ravel(), 'Confusion matrix for Glove model', figsize=(7, 7))

It's my first notebook on Kaggle.
I hope you find this notebook useful and enjoyable.

Your comment and feedback are most welcome.