# Introduction

This notebook will focus on text classification and sentiment analysis. We will go through all major NLP and data analysis techniques, some of which include:

* LSTMs
* Transformers (such as BERT)
* Naive Bayes
* XGBoost 

and much more...

The first half of the notebook is focused on cleaning and pre-processing the data, while the second half builds and compares different models with the techniques mentioned above.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

from collections import defaultdict
from collections import Counter

import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import LSTM, Embedding,BatchNormalization, Dense, TimeDistributed, Dropout, Bidirectional, Flatten, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report,accuracy_score

In [None]:
# Defining the global variables for the color schemes we will incorporate
pblue = "#496595"
pb2 = "#85a1c1"
pb3 = "#3f4d63"
pg = "#c6ccd8"
pb = "#202022"
pbg = "#f4f0ea"

pgreen = px.colors.qualitative.Plotly[2]

In [None]:
# NOTE(review): the commonly distributed spam.csv is Latin-1 encoded and fails to
# decode with pandas' default UTF-8 codec — confirm against the attached file.
# Latin-1 can decode any byte sequence, so this read cannot raise UnicodeDecodeError.
df = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

In [None]:
df.dropna(axis=1, inplace=True)
df.head()

In [None]:
df.rename(columns={"v1":"label", "v2":"text"}, inplace=True)
df.head()

In [None]:
# Finding maximum length of text message

np.max(df['text'].apply(lambda x: len(x.split())).values)

<h2>Exploratory data analysis</h2>

In [None]:
# Class balance: number of rows per label, in the alphabetical order
# that groupby produces (ham first, then spam)
label_counts = df.groupby('label').count()
grouped_df = label_counts.to_numpy().ravel()
grouped_df

The 'text' property is a string and must be specified as:

      - A string
      - A number that will be converted to a string
      - A tuple, list, or one-dimensional numpy array 
      
The 'x' and 'y' property is an array that may be specified as a tuple,
    list, numpy array, or pandas Series

In [None]:
# One bar per class; grouped_df holds the counts in (ham, spam) order
fig = go.Figure()

bar_specs = (
    ('ham', 'Safe', grouped_df[0], pblue),
    ('spam', 'Spam', grouped_df[1], pg),
)
for category, legend_name, n_messages, colour in bar_specs:
    fig.add_trace(go.Bar(
        x=[category],
        y=[n_messages],
        name=legend_name,
        text=[n_messages],
        textposition='auto',
        marker_color=colour,
    ))

fig.update_layout(title='Class distribution in the dataset')

fig.show()

In [None]:
def _length_distribution(messages):
    """Return a Series mapping word count -> number of messages, ordered by word count."""
    word_counts = messages.apply(lambda msg: len(msg.split()))
    return word_counts.value_counts().sort_index()

# Index = message length in words, value = how many messages have that length
len_df_ham = _length_distribution(df.loc[df['label'] == 'ham', 'text'])
len_df_spam = _length_distribution(df.loc[df['label'] == 'spam', 'text'])

In [None]:
len_df_ham

In [None]:
len_df_spam

In [None]:
# X-axis: message length in words
# Y-axis: number of messages with that length

fig = go.Figure()

area_specs = (
    (len_df_ham, 'Safe', pblue),
    (len_df_spam, 'Spam', pg),
)
for length_counts, legend_name, colour in area_specs:
    fig.add_trace(go.Scatter(
        x=length_counts.index,
        y=length_counts.values,
        name=legend_name,
        fill='tozeroy',
        marker_color=colour,
    ))

fig.update_layout(title='Frequency of SMS lengths')
fig.update_xaxes(range=[0, 80])
fig.show()

We can see that the safe SMS messages are much shorter than the spam messages.

<h2>Data preprocessing</h2>

In [None]:
def cleaning(text):
    """Normalise one raw message before tokenisation.

    The input is coerced to ``str`` and lower-cased, then the following are
    removed, in this order: text inside square brackets, URLs
    (``http(s)://...`` and ``www....``), HTML-like tags, punctuation,
    newline characters and every token that contains a digit.

    Whitespace left behind by removed fragments is not collapsed, so the
    result may contain consecutive or trailing spaces.

    Parameters
    ----------
    text : Any
        Value to clean; non-strings are converted with ``str()``.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Raw string literals keep regex escapes such as \[ , \S and \d away from
    # Python's own string-escape processing (the non-raw forms are invalid
    # escape sequences and trigger warnings on current interpreters).
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
df['text']

In [None]:
df['text'] = df['text'].apply(cleaning)
df['text']

In [None]:
# Removing stop words
# Stored as a set: the membership test below runs once per token of every
# message, and a set answers it in constant time instead of scanning a list.
more = ['u', 'im', 'c']
stop_words = set(stopwords.words('english')).union(more)


def sw_rem(text):
    """Return ``text`` with every space-separated token found in ``stop_words`` removed."""
    return ' '.join(word for word in text.split(' ') if word not in stop_words)

df['text'] = df['text'].apply(sw_rem)
df['text']

<h2>Stemming and lemmatization</h2>

Documents and other forms of text use different forms of the same words, such as play, playing, played. There are families of derivationally related words that have similar meanings. Our main task with stemming and lemmatization is to reduce all these derived words into the parent/family word, therefore reducing the total vocabulary while retaining information.

* **Stemming** - Chops off the ends of words to reach the goal; this works correctly **most of the time** and can also remove derivational suffixes.

* **Lemmatization** - Works with a vocabulary and morphological analysis of words, removing inflectional endings only and returning the base (dictionary) form of a word.

As we do not require much emphasis on words, we will focus more on stemming than lemmatization.

<h3>Stemming algorithms</h3>

We have multiple algorithms to achieve our stemming goals, some of them are as follows:

* PorterStemmer - Fast and efficient. Strips off the end (suffix) to produce the stems. It does not follow linguistics but rather a set of 5 rules for different cases. 

* SnowballStemmer - Generate a set of rules for any language. These are useful for non-english stemming tasks.

* LancasterStemmer - Iterative algorithm, uses about 120 rules, it tries to find an applicable rule by the last character of each word. The last character may be omitted or replaced.

In [None]:
stems = nltk.SnowballStemmer('english')

def stemming(text):
    """Stem every whitespace-separated token of ``text`` and re-join them with single spaces."""
    stemmed_tokens = [stems.stem(token) for token in text.split()]
    return ' '.join(stemmed_tokens)

In [None]:
df['text'] = df['text'].apply(stemming)
df.head()

In [None]:
# Creating a pipeline

def pipeline(text):
    """Run the full preprocessing chain on one raw message.

    Steps, in order: ``cleaning`` (lower-casing and regex clean-up), removal
    of tokens listed in ``stop_words``, then Snowball stemming with ``stems``.

    Parameters
    ----------
    text : Any
        Raw message; ``cleaning`` converts it to ``str``.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces.
    """
    text = cleaning(text)
    # split() with no argument discards the empty tokens that runs of spaces
    # (left behind by cleaning) would otherwise produce, which is also how
    # stemming() tokenises.
    kept_tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(stems.stem(word) for word in kept_tokens)

In [None]:
# NOTE(review): df['text'] has already been overwritten by cleaning, sw_rem and
# stemming in the cells above, so this runs the same three steps a second time
# on already-processed text — confirm whether the repeated pass is intended.
df['text'] = df['text'].apply(pipeline)
df.head()

In [None]:
# Encoding the categorical target variable as integers
from sklearn.preprocessing import LabelEncoder

# fit_transform learns the label -> integer mapping and applies it in one call
le = LabelEncoder()
df['label_num'] = le.fit_transform(df['label'])
df.head()

<h2>Visualizing tokens</h2>

In [None]:
# This will combine all the text values for safe sms
#' '.join(text for text in df[df['label']=='ham'].text)

In [None]:
# Loading the image whose shape is used as the word cloud mask
twitter_mask = np.array(Image.open('/kaggle/input/masksforwordclouds/twitter_mask3.jpg'))

# All safe messages concatenated into one string
ham_corpus = ' '.join(df.loc[df['label'] == 'ham', 'text'])

wc = WordCloud(background_color='white', max_words=200, mask=twitter_mask)
wc.generate(ham_corpus)

plt.figure(figsize=(15, 10))
plt.imshow(wc)
plt.title('Top words for safe messages', fontdict={'size':22})
plt.axis('off')
plt.show()

In [None]:
# Word cloud for spam, reusing the mask image loaded for the previous cloud
spam_corpus = ' '.join(df.loc[df['label'] == 'spam', 'text'])

wc = WordCloud(background_color='white', max_words=200, mask=twitter_mask)
wc.generate(spam_corpus)

plt.figure(figsize=(15, 10))
plt.imshow(wc)
plt.title('Top words for Spam messages', fontdict={'size':22})
plt.axis('off')
plt.show()

<h2>Vectorization</h2>

We currently have each text record in string format. We need to convert each of those records into a vector that our models can work with. We will first do this using the bag-of-words model.

We will use two major approaches here

* **CountVectorizer** - Working on frequency of each word in the given string.

* **Term frequency-inverse document frequency (TF-IDF)** - Works on frequency divided by the appearance of the given word in the total documents.

In [None]:
x = df['text']
y = df['label_num']

len(x), len(y)

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed makes the split (and every score reported below) reproducible
# across re-runs; stratify keeps the ham/spam ratio equal in both partitions.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, stratify=y)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

In [None]:
# First working with count vectorizer

from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
count = CountVectorizer()
# Learn the vocabulary from the training split only, so that nothing about the
# held-out messages leaks into the feature space.
count.fit(x_train)

# Words that occur only in the test split are ignored by transform()
x_train_num = count.transform(x_train)
x_test_num = count.transform(x_test)

The CountVectorizer model can be tuned in a variety of ways:

* Stop words - Extremely common words can be omitted by the model by setting this parameter to the language corresponding to the text.

* ngram_range - It pairs up words together as features. If we consider bigrams and we have a sentence "I am happy", we will have two features - ["I am", "am happy"]. We can define a range of ngrams, so if we have the same sentence with a range from 1 to 2, our features will be:  `["I", "am", "happy", "I am", "am happy"]`. This increase in features helps to fine-tune the model.

* min_df, max_df - Minimum and maximum frequencies of words or n-grams that can be used as features. If either condition is not met, the feature will be omitted.

* max_features - Choose the most frequent words and drop everything else.

In [None]:
# Example of a tuned model
count_tuned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)

In [None]:
# Working with TF-IDF now
from sklearn.feature_extraction.text import TfidfTransformer
# TfidfTransformer re-weights an existing count matrix;
# a TfidfVectorizer would take the raw text directly instead
tfidf = TfidfTransformer()

# Learn the IDF weights from the training counts and apply them in a single step
x_train_tfidf = tfidf.fit_transform(x_train_num)

x_train_tfidf

<h2>Working with Embeddings - GloVe</h2>

In [None]:
text = df['text']
label = df['label_num']

In [None]:
# Calculating the total vocabulary
tk = Tokenizer()
tk.fit_on_texts(text)

vocab = len(tk.word_index)+1
vocab

Now we will proceed with converting the text to numerical values and also padding the vectors so each of them are of equal length. 

In [None]:
# Maximum length
max_len = np.max(df['text'].apply(lambda x: len(x.split())).values)
max_len

In [None]:
text

In [None]:
def embedding(text):
    """Map an iterable of strings to lists of integer token ids using the fitted tokenizer ``tk``."""
    return tk.texts_to_sequences(text)

# Pad to the longest message (max_len, computed above) instead of a hard-coded 80,
# so the array width always equals the input_length the LSTM model is built with.
train_padded = pad_sequences(embedding(text), maxlen=int(max_len), padding='post')
train_padded

<h2>GloVe Embeddings</h2>

These embeddings are based on the principle that we can derive semantic relationships between words from their co-occurrence matrix. This embedding focuses on word co-occurrences over the whole corpus. 

They are a form of word representation that try to merge human understanding of languages into their structure. They have a learned representation in an n-dimension space, where words with similar meanings have similar embeddings. Two similar words are represented by almost similar vectors that are at a small distance in the vector space.

When using a vector space, all the words are represented as vectors in a predefined N-dimension vector space. Each word is mapped to a vector and the vector values are learned in a way that resembles a neural network.

In [None]:
# Loading the pre-trained GloVe vectors into a {word: vector} dictionary

embedding_dict = dict()
embedding_dim = 100

# Each word is represented in one line in the text file
# Format - Word val1 val2 val3......val-n for n-dimension vector space

# The encoding is stated explicitly so the read does not depend on the platform
# default, and the file is iterated line by line instead of readlines(), which
# would first load every line into memory.
with open('../input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as fp:
    for line in fp:
        records = line.split()
        if not records:
            # blank line: nothing to parse
            continue
        word = records[0]
        vector = np.asarray(records[1:], dtype='float32')
        embedding_dict[word] = vector

In [None]:
# Build the (vocab x embedding_dim) weight matrix for the Embedding layer.
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows stay all-zero for words GloVe does not know and for index 0,
# which the tokenizer does not assign to any word (it is used for padding).
embedding_matrix = np.zeros((vocab, embedding_dim))

for token, row in tk.word_index.items():
    if token in embedding_dict:
        embedding_matrix[row] = embedding_dict[token]

embedding_matrix

In [None]:
# We will be creating seaborn and plotly confusion matrices
import plotly.figure_factory as ff
x_axes = ['Safe','Spam']
y_axes = ['Spam', 'Safe']

def conf_matrix(z, x=x_axes, y=y_axes):
    """Build a plotly annotated heat-map from a confusion matrix.

    ``z`` is flipped along its first axis before plotting; ``x`` and ``y``
    are the tick labels for the predicted and real axes respectively.
    Returns the plotly figure (it is not shown here).
    """
    flipped = np.flip(z, 0)
    # Cell values as strings so they can be drawn as annotations
    annotations = [[str(cell) for cell in row] for row in flipped]
    fig = ff.create_annotated_heatmap(flipped, x=x, y=y, annotation_text=annotations)

    fig.update_layout(
        title_text='Confusion matrix',
        xaxis=dict(title='Predicted Value'),
        yaxis=dict(title='Real value'),
    )

    # Display the colour bar next to the heat-map
    fig['data'][0]['showscale'] = True
    return fig

In [None]:
from sklearn.metrics import confusion_matrix
categories=['Safe', 'Spam']
def seaborn_conf(y, ypred, labels=('Safe', 'Spam')):
    """Plot a seaborn heat-map of a binary confusion matrix.

    Parameters
    ----------
    y : array-like
        True class ids (0 or 1).
    ypred : array-like
        Predicted class ids (0 or 1).
    labels : sequence of two str, optional
        Display names for class 0 and class 1, in that order. Defaults to
        ``('Safe', 'Spam')``; pass other names when plotting a different
        dataset.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    names = list(labels)
    # labels=[0, 1] pins the matrix to 2x2 even when one class is missing from
    # y / ypred; otherwise a 1x1 result could not be given two row/column names.
    cf = confusion_matrix(y, ypred, labels=[0, 1])
    df_cm = pd.DataFrame(cf, columns=names, index=names)
    plt.figure(figsize=(8,6))
    sns.heatmap(df_cm, annot=True, fmt='g')
    plt.title('Confusion matrix')
    plt.xlabel('Predicted value')
    plt.ylabel('Real value')
    plt.show()

<h2>Model creation and prediction</h2>

We will first start with the **naive Bayes classifier**, which comes from a family of simple "probabilistic classifiers" based on applying Bayes' theorem with strong independence assumptions between features.

The model is highly scalable, with number of parameters being linear with number of variables. 

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

# Train the model - CountVectorizer model
nb.fit(x_train_num, y_train)

In [None]:
# Class and probability predictions
yp_class = nb.predict(x_test_num)
yp_prob = nb.predict_proba(x_test_num)[:, 1]

In [None]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, yp_class))
seaborn_conf(y_test, yp_class)

In [None]:
metrics.roc_auc_score(y_test, yp_prob)

<h2>Working with Naive Bayes + TF-IDF</h2>

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', MultinomialNB())])

In [None]:
pipe.fit(x_train, y_train)
yp_class = pipe.predict(x_test)
print(metrics.accuracy_score(y_test, yp_class))
seaborn_conf(y_test, yp_class)

<h2>XGBoost</h2>

In [None]:
import xgboost as xgb
pipe = Pipeline([
    ('bow', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=6,
        n_estimators=90,
        use_label_encoder=False,
        eval_metric='auc'
    ))
    ]
)

In [None]:
pipe.fit(x_train, y_train)
yp_class_test = pipe.predict(x_test)
yp_class_train = pipe.predict(x_train)

print('Training accuracy score: {}'.format(metrics.accuracy_score(y_train, yp_class_train)))
print('Testing accuracy score: {}'.format(metrics.accuracy_score(y_test, yp_class_test)))

seaborn_conf(y_test, yp_class_test)

<h2>LSTMs and GloVE embeddings</h2>

In [None]:
train_padded.shape

In [None]:
# A fixed seed keeps the split, and therefore the reported scores, reproducible;
# stratify preserves the ham/spam ratio in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    train_padded, label, test_size=0.2, random_state=42, stratify=label
)

In [None]:
# Bidirectional LSTM on top of the GloVe-initialised embedding layer
model = Sequential()
# input_length is read from the padded training array itself, so the layer
# always matches the width of the data that is actually passed to fit().
model.add(Embedding(input_dim=embedding_matrix.shape[0], 
                   output_dim=embedding_matrix.shape[1],
                   weights=[embedding_matrix],
                   input_length=x_train.shape[1]
                   )
         )
# max_len is used below only as the number of units in each layer
model.add(Bidirectional(LSTM(max_len, return_sequences=True, recurrent_dropout=0.15)))
model.add(GlobalMaxPool1D())
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(max_len, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(max_len, activation='relu'))
model.add(Dropout(0.3))
# Single sigmoid unit: output is the predicted probability of class 1
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Defining Callbacks
# Checkpoint: write the model to disk whenever validation loss improves
checkpoints = ModelCheckpoint('ck_model.h5', monitor='val_loss', verbose=1, save_best_only=True)
# Multiply the learning rate by `factor` when validation loss has not improved
# for 5 epochs, never going below min_lr.
# (The keyword was previously misspelled `vactor`, so it was not a valid argument.)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, verbose=1, patience=5, min_lr=0.0001)

In [None]:
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test), verbose=1, callbacks=[reduce_lr, checkpoints])

In [None]:
# Plotting the results
def learning_curve(history, arr):
    """Draw two side-by-side panels of training curves.

    ``history`` is the object returned by ``model.fit``; ``arr`` holds two
    pairs of keys into ``history.history``. Panel ``i`` plots both series
    named by ``arr[i]`` against the epoch number.
    """
    fig, ax = plt.subplots(1, 2, figsize=(20, 5))
    for idx, panel in enumerate(ax):
        first_metric = arr[idx][0]
        second_metric = arr[idx][1]
        panel.plot(history.history[first_metric])
        panel.plot(history.history[second_metric])
        panel.legend([first_metric, second_metric])
        panel.set_xlabel('Epochs')
        panel.set_ylabel('Value')
        panel.set_title(first_metric + ' X ' + second_metric)

In [None]:
learning_curve(history, [['loss', 'val_loss'], ['accuracy', 'val_accuracy']])

In [None]:
yp = model.predict(x_test)
yp

In [None]:
yp = (model.predict(x_test)>0.5).astype('int32')
yp

In [None]:
seaborn_conf(y_test, yp)

<h2>Transformers - BERT</h2>

BERT has revolutionized the world of NLP by providing state-of-the-art results on many NLP tasks. BERT stands for Bidirectional Encoder Representations from Transformers. It is a state-of-the-art embedding model published by Google. It has created a major breakthrough in the field of NLP by providing better results in many NLP tasks, such as question answering, text generation, sentence classification, and many more besides. One of the major reasons for the success of BERT is that it is a context-based embedding model, unlike other popular embedding models, such as word2vec, which are context-free.

In [None]:
!pip install transformers

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

In [None]:
from transformers import BertTokenizer
# Tokenizer from the same checkpoint family as the model loaded later in this
# notebook (bert-base-uncased), so tokenizer and model stay consistent.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bert_encode(data, maximum_length):
    """Tokenise texts for BERT and return fixed-width id and mask arrays.

    Parameters
    ----------
    data : iterable of str
        Texts to encode.
    maximum_length : int
        Exact number of tokens per encoded text (special tokens included).
        Shorter texts are padded and longer texts are truncated to this
        length, so every row has the same width.

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, attention_masks)``, each of shape
        ``(len(data), maximum_length)``.
    """
    input_ids=[]
    attention_masks=[]
    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=maximum_length,
            # padding='max_length' replaces the deprecated pad_to_max_length=True
            padding='max_length',
            # without truncation, over-long texts would yield rows of unequal length
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [None]:
np.max(df['text'].apply(lambda x: len(x.split())).values)

In [None]:
bt_text = df['text']
bt_label = df['label_num']

bt_ids, bt_masks = bert_encode(bt_text, 80)

In [None]:
from transformers import TFBertModel
def create_model(bert_model, max_length=80, learning_rate=1e-5):
    """Wrap a BERT encoder with a small binary-classification head.

    Parameters
    ----------
    bert_model : TFBertModel
        Pre-trained encoder called on ``[input_ids, attention_masks]``.
    max_length : int, optional
        Width of the two integer input tensors. It must equal the length the
        texts were encoded to. Defaults to 80.
    learning_rate : float, optional
        Learning rate for the Adam optimizer. Defaults to 1e-5.

    Returns
    -------
    tf.keras.Model
        Model compiled with binary cross-entropy loss and an accuracy metric.
    """
    input_ids = tf.keras.Input(shape=(max_length,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_length,),dtype='int32')

    output = bert_model([input_ids,attention_masks])
    # The second element of the encoder output is fed to the classification head
    output = output[1]
    output = tf.keras.layers.Dense(32,activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(1,activation='sigmoid')(output)
    
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    model.compile(Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [None]:
model = create_model(bert_model)
model.summary()

In [None]:
history = model.fit([bt_ids, bt_masks], bt_label, validation_split=0.25, epochs=3, batch_size=10)

In [None]:
learning_curve(history, [['loss', 'val_loss'],['accuracy', 'val_accuracy']])

<h2>Working with disaster tweets - Dataset</h2>

In [None]:
df = pd.read_csv('../input/nlp-getting-started/train.csv')
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')

# The training frame is named `df` (which every later cell uses); the original
# referenced an undefined `df_train` here and raised NameError.
# Drop every column that contains missing values.
df = df.dropna(axis=1)
df.head()

In [None]:
df.groupby('target').count()

In [None]:
# We can use .agg('count').values also
class_counts = df.groupby('target').id.count().values
class_counts

In [None]:
fig = go.Figure()

fig.add_trace(go.Bar(
        x=['Fake disaster'],
        y=[class_counts[0]],
        name='Fake',
        text=[class_counts[0]],
        textposition='auto',
        marker_color=pblue
)
             )
fig.add_trace(go.Bar(
        x=['Real disaster'],
        y=[class_counts[1]],
        name='Real',
        text=[class_counts[1]],
        textposition='auto',
        marker_color=pg
))

fig.update_layout(
    title='Class distribution in the dataset')

fig.show()

In [None]:
len_real = df[df['target']==1].text.apply(lambda x: len(x.split())).value_counts().sort_index()
len_fake = df[df['target']==0].text.apply(lambda x: len(x.split())).value_counts().sort_index()

In [None]:
# X-axis: tweet length in words; Y-axis: number of tweets with that length
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=len_real.index,
    y=len_real.values,
    name='Real disaster',
    fill='tozeroy',
    marker_color=pblue,
))
fig.add_trace(go.Scatter(
    x=len_fake.index,
    y=len_fake.values,
    name='Fake disaster',
    fill='tozeroy',
    marker_color=pg,
))
# The title now describes this chart (the previous text belonged to an unrelated figure)
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Frequency of tweet lengths</span>'
)
fig.show()

<h2>Data pre-processing and cleaning</h2>

In [None]:
def remove_url(text):
    """Return ``text`` with ``http://``, ``https://`` and ``www.`` links deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def remove_emoji(text):
    """Return ``text`` with every run of characters from the listed Unicode ranges deleted."""
    code_point_ranges = (
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
    )
    matcher = re.compile('[' + code_point_ranges + ']+', flags=re.UNICODE)
    return matcher.sub(r'', text)


def remove_html(text):
    """Return ``text`` with ``<...>`` tags and ``&name;`` / ``&#123;`` / ``&#x1f;`` entities deleted."""
    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.sub(tag_or_entity, '', text)

def clean_text(text):
    '''Normalise one raw tweet.

    The input is coerced to ``str`` and lower-cased, then the following are
    removed in order: text in square brackets, links, HTML-like tags,
    punctuation, newline characters and words containing digits. Finally
    ``remove_url``, ``remove_emoji`` and ``remove_html`` are applied.

    Whitespace left behind by removed fragments is not collapsed.

    Returns the cleaned string.
    '''
    # Raw string literals keep regex escapes such as \[ , \( and \S away from
    # Python's own string-escape processing (the non-raw forms are invalid
    # escape sequences and trigger warnings on current interpreters).
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
        '', 
        text
    )
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    
    # By now every character in string.punctuation (including ':', '/', '.', '<',
    # '>', '&' and ';') has been stripped, so remove_url and remove_html have
    # nothing left to match; remove_emoji is the step that still has an effect.
    text = remove_url(text)
    text = remove_emoji(text)
    text = remove_html(text)
    
    return text

In [None]:
# Testing the function
remove_emoji("Omg another Earthquake 😔😔")

In [None]:
# Stop words are kept in a set: the membership test below runs once per token of
# every tweet, and a set answers it in constant time instead of scanning a list.
more = ['u', 'im', 'c']
stopw = set(stopwords.words('english')).union(more)

stemmer = nltk.SnowballStemmer('english')

def data_cleaning(text):
    """Clean one raw tweet, drop stop words and stem the remaining tokens.

    Parameters
    ----------
    text : Any
        Raw tweet; ``clean_text`` converts it to ``str``.

    Returns
    -------
    str
        Stemmed, non-stop-word tokens joined by single spaces.
    """
    text = clean_text(text)
    # split() with no argument discards the empty tokens that runs of spaces
    # left behind by clean_text would otherwise produce.
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stopw)

In [None]:
df['cleaned_text'] = df['text'].apply(data_cleaning)
df_test['cleaned_text'] = df_test['text'].apply(data_cleaning)

In [None]:
df.head()

<h2>WordCloud analysis</h2>

In [None]:
def corpus(df, label):
    """Return one flat list with every whitespace-separated token of
    ``cleaned_text`` from the rows of ``df`` whose ``target`` equals ``label``."""
    token_lists = df.loc[df['target'] == label, 'cleaned_text'].str.split()
    return [token for tokens in token_lists for token in tokens]

In [None]:
corpus_reald = corpus(df, 1)

# Counter tallies the tokens in a single pass; most_common(10) returns the ten
# most frequent (word, count) pairs in descending order of count
dic = Counter(corpus_reald)
top = dic.most_common(10)
top

In [None]:
twitter_mask = np.array(Image.open('/kaggle/input/masksforwordclouds/twitter_mask3.jpg'))

wc = WordCloud(
    background_color='white', 
    max_words=200, 
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 1, 'cleaned_text']))
plt.figure(figsize=(18,10))
plt.title('Wordcloud for real disasters', 
          fontdict={'size': 22,  'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()

In [None]:
corpus_faked = corpus(df, 0)

# Counter tallies the tokens in a single pass; most_common(10) returns the ten
# most frequent (word, count) pairs in descending order of count
dic = Counter(corpus_faked)
top = dic.most_common(10)
top

In [None]:
twitter_mask = np.array(Image.open('/kaggle/input/masksforwordclouds/twitter_mask3.jpg'))

wc = WordCloud(
    background_color='white', 
    max_words=200, 
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 0, 'cleaned_text']))
plt.figure(figsize=(18,10))
plt.title('Wordcloud for fake disasters', 
          fontdict={'size': 22,  'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()

<h2>Model creation and testing</h2>

In [None]:
x = df['cleaned_text']
y = df['target']

from sklearn.model_selection import train_test_split

# A fixed seed makes the split reproducible; stratify keeps the class ratio
# the same in the training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, stratify=y)
len(x_train), len(y_train), len(x_test), len(y_test)

In [None]:
pipe = Pipeline([
    ('bow', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='auc',
    ))
])

pipe.fit(x_train, y_train)
yp_test = pipe.predict(x_test)
yp_train = pipe.predict(x_train)

print('Training accuracy: {}'.format(metrics.accuracy_score(y_train, yp_train)))
print('Testing accuracy: {}'.format(metrics.accuracy_score(y_test, yp_test)))

seaborn_conf(y_test, yp_test)

<h3>GloVE - LSTM</h3>

In [None]:
training = df['cleaned_text'].values
testing = df_test['cleaned_text'].values
# Target labels
labels = df['target'].values

In [None]:
# Word tokenizer
tk = Tokenizer()
tk.fit_on_texts(training)

vocab = len(tk.word_index)+1
vocab

In [None]:
def metric_calculation(y_test, y_pred):
    """Print F1, precision, recall, accuracy and a classification report.

    NOTE(review): every sklearn call below is given ``(y_pred, y_test)``, i.e.
    this function's ``y_pred`` argument goes into sklearn's first positional
    slot (documented by sklearn as ``y_true``). The call in this notebook passes
    ``(preds, y_test)``, which swaps the two once more, so the true labels do
    end up in the ``y_true`` slot. Changing the order here or at the call site
    alone would exchange precision and recall — fix both together or neither.
    """
    print("F1-score: ", f1_score(y_pred, y_test))
    print("Precision: ", precision_score(y_pred, y_test))
    print("Recall: ", recall_score(y_pred, y_test))
    print("Acuracy: ", accuracy_score(y_pred, y_test))
    # visual separator before the per-class report
    print("-"*50)
    print(classification_report(y_pred, y_test))
    
def embeddings(corpus):
    """Convert each text in ``corpus`` to a list of integer token ids with the fitted tokenizer ``tk``."""
    sequences = tk.texts_to_sequences(corpus)
    return sequences

In [None]:
# Longest tweet measured in tokens, not characters: pad_sequences pads lists of
# token ids, so the character count would pad every row far wider than needed.
len_train = int(df['cleaned_text'].apply(lambda x: len(x.split())).max())
len_train

In [None]:
train_padded_sentences = pad_sequences(
    embeddings(training), 
    len_train, 
    padding='post'
)
test_padded_sentences = pad_sequences(
    embeddings(testing), 
    len_train,
    padding='post'
)

train_padded_sentences

In [None]:
# As we've already created a GloVe dictionary in the SMS-dataset, we will start with the matrix

embedding_matrix = np.zeros((vocab, embedding_dim))

for word, index in tk.word_index.items():
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

In [None]:
x_train, x_test, y_train, y_test = train_test_split(train_padded_sentences, labels, test_size=0.20)

In [None]:
# Bidirectional LSTM on top of the GloVe-initialised embedding layer
model = Sequential()
# input_length is read from the padded training array itself. `max_len` was
# computed for the SMS dataset and does not describe the width of these tweets.
model.add(Embedding(input_dim=embedding_matrix.shape[0], 
                   output_dim=embedding_matrix.shape[1],
                   weights=[embedding_matrix],
                   input_length=x_train.shape[1]
                   )
         )
# max_len is used below only as the number of units in each layer
model.add(Bidirectional(LSTM(max_len, return_sequences=True, recurrent_dropout=0.15)))
model.add(GlobalMaxPool1D())
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(max_len, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(max_len, activation='relu'))
model.add(Dropout(0.3))
# Single sigmoid unit: output is the predicted probability of class 1
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Callbacks
# Save the model to disk whenever validation loss improves
checkpoint = ModelCheckpoint('model.h5', monitor = 'val_loss', verbose = 1, save_best_only = True)
# min_lr must be below the optimizer's starting rate for a reduction to be possible.
# The model is compiled with optimizer='adam' (Keras default learning rate 0.001),
# so the previous min_lr of 0.001 left the scheduler no room to lower it.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, verbose = 1, patience = 5, min_lr = 0.0001)

history = model.fit(x_train, y_train, epochs = 7,batch_size = 32,validation_data = (x_test, y_test),verbose = 1,callbacks = [reduce_lr, checkpoint])

In [None]:
learning_curve(history, [['loss', 'val_loss'],['accuracy', 'val_accuracy']])

In [None]:
# Sequential.predict_classes is not available in current Keras/TensorFlow;
# thresholding the sigmoid output at 0.5 gives the same class ids
preds = (model.predict(x_test) > 0.5).astype('int32')
# Argument order is kept as (preds, y_test) on purpose: metric_calculation passes
# its arguments to sklearn in reverse, so the true labels reach the y_true slot.
metric_calculation(preds, y_test)