This kernel demonstrates binary classification of tweets using a sequence model (an LSTM built with Keras).


Let's start with importing all the necessary packages.

In [None]:
import pandas as pd
import numpy as np
import re
import string
from collections import Counter, namedtuple

from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding
from keras.optimizers import Nadam,adam

# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; the Keras backend may use its own
# RNG, so training runs are not necessarily reproducible - confirm.
np.random.seed(1)

#### Load data

In [None]:
# Read the training CSV into a DataFrame; every later cell works on `data`.
data = pd.read_csv('../input/nlp-getting-started/train.csv')

Let's see what the data looks like.

In [None]:
# Display the first rows of the raw data.
data.head()

Check for class imbalance

In [None]:
# Only the tweet text and its label are used from here on, so the other
# columns are dropped from `data` in place.
data.drop(columns=['id', 'keyword', 'location'], inplace=True)

# np.bincount returns one count per label value, in ascending label order;
# unpacking into two names relies on the target being binary.
neg, pos = np.bincount(data['target'].to_numpy())
print(f'Total: {len(data)} \nPositive: {pos} \nNegative: {neg}')

There is no class imbalance problem.

Check for null values in data.

In [None]:
# Number of missing values in each remaining column.
data.isna().sum()

### Let's work with tweets

Clean the text by removing URLs, HTML tags, emojis, punctuation and stopwords, and by lowercasing the remaining words.

In [None]:
# The patterns and the translation table never change, so they are built once
# here instead of on every call to clean_text.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_PATTERN = re.compile(r'<.*?>')
_EMOJI_PATTERN = re.compile(pattern = "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "]+", flags = re.UNICODE)
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it on first use only."""
    global _STOPWORDS
    if _STOPWORDS is None:
        # Loaded lazily so a missing NLTK corpus still fails at call time,
        # exactly as it did when the list was loaded inside clean_text.
        _STOPWORDS = frozenset(stopwords.words('english'))
    return _STOPWORDS


def clean_text(text):
    """Return `text` stripped of noise and reduced to lowercase content words.

    The steps run in this order: URLs, HTML tags, emojis and ASCII
    punctuation are removed, the text is split on whitespace, each word is
    lowercased, and English stop words are dropped.

    Args:
        text: The raw tweet as a string.

    Returns:
        The remaining lowercase words joined by single spaces.
    """
    text = _URL_PATTERN.sub(r'', text)
    text = _HTML_PATTERN.sub(r'', text)
    text = _EMOJI_PATTERN.sub(r'', text)

    # Punctuation is deleted (not replaced by a space), so "a.b" becomes "ab".
    text = text.translate(_PUNCTUATION_TABLE)

    # NOTE(review): punctuation is stripped before this check, so stop words
    # that contain an apostrophe in the NLTK list can no longer match the
    # text (e.g. "don't" has already become "dont") - confirm this is wanted.
    stop = _english_stopwords()
    lowered = (word.lower() for word in text.split())
    return ' '.join(word for word in lowered if word not in stop)

In [None]:
# Replace every tweet with its cleaned form.
data['text'] = data['text'].apply(clean_text)

In [None]:
# Display the first rows again to inspect the cleaned text.
data.head()

We cannot feed raw text directly into our sequence model. We first need to map each word in a tweet to an integer; the Keras Embedding layer can then turn each integer into a dense vector.

Let's find the vocabulary size first.

In [None]:
def word_counter(text):
    """Count how often each whitespace-separated word occurs across all texts.

    Args:
        text: An iterable of strings, for example a pandas Series of tweets.

    Returns:
        A collections.Counter mapping each word to its number of occurrences.
    """
    # Iterating `text` directly (rather than `text.values`) yields the same
    # strings for a Series and also accepts plain lists or other iterables.
    return Counter(word for entry in text for word in entry.split())

text = data['text']
counter = word_counter(text)

# The Keras Tokenizer numbers words from 1 (0 is the padding value) and, when
# given num_words=n, keeps only indices below n. Sizing the index space as
# "distinct words + 1" therefore keeps every word and also matches what the
# Embedding layer expects (largest index + 1).
vocab_size = len(counter) + 1

We need a fixed-size input for the model; here I use a maximum length of 20 tokens per tweet. Try different values to find the best one. Usually a smaller value is recommended since it makes the input less sparse when padded with zeros.

In [None]:
# Number of tokens per tweet fed to the model; this value is passed as
# `maxlen` when the token sequences are padded/truncated.
max_len = 20

To map the words to unique integer values, we will be using keras Tokenizer.

The Keras Tokenizer can be used to get the integer sequence for each tweet. It maps each word to an integer: the word's index in the tokenizer's `word_index` dictionary.

In [None]:
# Build the word -> integer mapping from the cleaned tweets.
# NOTE(review): the tokenizer is fitted on every row of `data`, which
# includes the rows later held out for testing - confirm this is intended.
t = Tokenizer(num_words=vocab_size)
t.fit_on_texts(data['text'])

word_index = t.word_index

# Show the first ten (word, index) pairs as a small sample of the mapping.
{word: index for word, index in list(word_index.items())[:10]}

We will use this tokenizer later on train and test tweets.

Let's take the first 7,500 examples for training and validation, and keep the remaining ones for testing.

In [None]:
# The first 7500 rows (by position) form the training/validation pool.
df = data.iloc[:7500]

### Let's build a sequential model using keras.


In [None]:
# Layer stack, in order: 200-dimensional word embeddings, an 80-unit LSTM,
# 50% dropout, and one sigmoid unit that outputs the positive-class score.
model = Sequential([
    Embedding(vocab_size, 200, input_length=max_len),
    LSTM(80),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output.
nadam = Nadam(learning_rate=0.0001)
model.compile(
    loss='binary_crossentropy',
    optimizer=nadam,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Features (cleaned tweets) and labels for cross-validation.
X, y = df['text'], df['target']

# Five stratified folds over the training/validation pool.
skf = StratifiedKFold(n_splits=5)

In [None]:
accuracy = []
# Train on 5 folds and record the validation accuracy of each one.
# NOTE(review): the same `model` object is fitted in every iteration without
# being re-initialised, so from the second fold on it has already been trained
# on tweets that are now in the validation split. The per-fold accuracies are
# therefore optimistic - confirm whether a fresh model per fold is wanted.
for train_index, test_index in skf.split(X, y):

    # skf.split yields positional indices, so rows are selected by position
    # (.iloc) rather than by index label.
    train_x, test_x = X.iloc[train_index], X.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]
    print("Tweet before tokenization: ", train_x.iloc[0])

    # Tokenize the tweets using the fitted tokenizer.
    train_tweets = t.texts_to_sequences(train_x)
    test_tweets = t.texts_to_sequences(test_x)
    print("Tweet after tokenization: ", train_tweets[0])

    # Pad or truncate every sequence at its end to exactly max_len tokens.
    train_tweets_padded = pad_sequences(train_tweets, maxlen=max_len, padding='post', truncating='post')
    test_tweets_padded = pad_sequences(test_tweets, maxlen=max_len, padding='post', truncating='post')
    print('Tweet after padding: ', train_tweets_padded[0])

    # Train the model on the processed tweets.
    history = model.fit(train_tweets_padded, train_y, epochs=5, validation_data = (test_tweets_padded,test_y))

    # Sequential.predict_classes is no longer available in recent Keras
    # releases; thresholding the sigmoid output at 0.5 gives the same labels.
    pred_y = (model.predict(test_tweets_padded) > 0.5).astype('int32').ravel()

    # Score once, then both print and store the result.
    fold_accuracy = accuracy_score(test_y, pred_y)
    print("Validation accuracy : ", fold_accuracy)
    accuracy.append(fold_accuracy)

In [None]:
# Report the mean of the per-fold validation accuracies stored in `accuracy`.
print("Validation accuracy of the model :", np.mean(accuracy))

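The model reaches a mean validation accuracy of about 91%. Note that the same model instance keeps training from fold to fold, so later folds are validated on tweets the model has already been trained on, and this figure is optimistic. Let's see how it performs on unseen tweets from the test data.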

In [None]:
# The training/validation pool is data[:7500], so the hold-out set must start
# at position 7500; starting at 7501 would leave row 7500 out of both sets.
test_df = data[7500:]

# Apply the same tokenization and padding that were used for training.
tokenized_tweets = t.texts_to_sequences(test_df['text'])
padded_tweets = pad_sequences(tokenized_tweets, maxlen=max_len, padding='post', truncating='post')
test_y = test_df['target']

# Sequential.predict_classes is no longer available in recent Keras releases;
# thresholding the sigmoid output at 0.5 gives the same class labels.
pred_y = (model.predict(padded_tweets) > 0.5).astype('int32').ravel()

In [None]:
# Accuracy on the hold-out tweets, with arguments in scikit-learn's
# (y_true, y_pred) order; the score is the same either way round.
accuracy_score(test_y, pred_y)

We achieved 92% test accuracy!! 🎉

References:

https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

https://www.youtube.com/watch?v=j7EB7yeySDw