# Sentiment Analysis
1. Text input
2. Stemming / Lemmatization
3. Tokenization
4. Classification
5. Stopword filtering
6. Sentiment class
7. Negation

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import re

# Report the TensorFlow version used for this run.
print("Tensorflow Version : ", tf.__version__)

# Data preprocessing

In [None]:
# Load the Sentiment140 training CSV. header=None: no row of the file is used
# as a header, so the columns carry integer labels until they are renamed.
df = pd.read_csv('../input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin', header=None)
# Preview the first rows.
df.head()

In [None]:
# Give the six positional columns descriptive names.
df.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

We need only the 'text' and 'sentiment' columns for sentiment analysis.

In [None]:
# Keep only 'sentiment' and 'text'; the other four columns are discarded.
df = df.drop(columns=['id', 'date', 'query', 'user_id'])

In [None]:
# Raw numeric label -> human-readable class name.
lab_to_sentiment = {
    0: 'Negative',
    4: 'Positive',
}


def decode_label(label):
    """Translate a raw numeric label into its class name.

    Raises KeyError when ``label`` is not a key of ``lab_to_sentiment``.
    """
    sentiment_name = lab_to_sentiment[label]
    return sentiment_name

# Replace every numeric label in the 'sentiment' column with its class name.
df['sentiment'] = df['sentiment'].apply(decode_label)
df.head()

In [None]:
# Distinct values now present in the 'sentiment' column.
df.sentiment.unique()

We visualize the distribution of the dataset.

In [None]:
# Count the rows of each sentiment class and draw the counts as a bar chart.
val_count = df['sentiment'].value_counts()
plt.figure(figsize=(8, 4))
plt.bar(x=val_count.index, height=val_count.values)
plt.title("Sentiment Data Distribution")

The distribution shows no skew between the classes.

Now we explore the data further.

In [None]:
import random

# Pick 10 random row labels to eyeball some raw tweets.
# random.randrange(n) yields 0..n-1, which matches the default integer index
# that read_csv assigned. random.randint(1, n) is inclusive on both ends: it
# could return n (a label that does not exist, so df.loc raises KeyError) and
# could never return row 0.
random_idx_list = [random.randrange(len(df)) for _ in range(10)]
df.loc[random_idx_list, :]

The text values contain many punctuation marks (문장부호) and other unwanted words. We need to get rid of (제거하다) them.

# Text preprocessing

## Stemming / Lemmatization
Goal: reduce inflectional forms, and sometimes derivationally (파생적으로) related forms, of a word to a common base form.

* Stemming  
Stemming usually refers to a process that chops off (chop off : 잘라내다) the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes (affix : 접사).

* Lemmatization  
Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis (형태소 분석) of words, normally aiming to remove inflectional endings (굴절 어미) only and to return the base or dictionary form of a word.

## Hyperlinks and Mentions
ex) @arunrk7, https://keras.io

## Stopwords
The nltk library has functions to perform text processing tasks.

In [None]:
"""
Regex quick reference for the cleaning pattern:
+ : one or more occurrences
| : or
? : zero or one occurrence
"""
# English Snowball stemmer used by preprocess() when stem=True.
stemmer = SnowballStemmer('english')
# Alternative stemmer, kept for reference:
# stemmer = PorterStemmer()
# Matches @mentions, http/https links, and any single non-alphanumeric character.
# Each alternative must be separated by "|": without the separator the two URL
# branches fuse into one pattern that needs "http(s):" to appear twice, so links
# were never removed as a unit and their fragments leaked into the tokens.
# Raw string so that "\S" reaches the regex engine without escape warnings.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]"
# English stopwords as a set: preprocess() tests every token of every tweet for
# membership, and a set answers that in constant time instead of scanning a list.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess(text, stem=False):
    """Clean one tweet and return its kept tokens joined by single spaces.

    The input is stringified and lower-cased, every match of
    ``text_cleaning_re`` is replaced by a space, and the result is split on
    whitespace. Tokens found in ``stop_words`` are dropped. With ``stem=True``
    each remaining token is passed through ``stemmer.stem``.
    """
    cleaned = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [None]:
# Clean every tweet in place (stemming stays off: preprocess defaults to stem=False).
df['text'] = df['text'].apply(preprocess)

In [None]:
# Show 10 randomly chosen rows after cleaning.
df.sample(n=10)

# Word cloud

## Positive words

In [None]:
# Text of the rows labelled 'Positive'.
df[df.sentiment=='Positive'].text

In [None]:
from wordcloud import WordCloud

# Join all positive tweets into one string and render its word cloud.
positive_text = " ".join(df[df.sentiment=='Positive'].text)
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1600, height=800).generate(positive_text)
plt.imshow(wc, interpolation='bilinear')

## Negative words

In [None]:
# Join all negative tweets into one string and render its word cloud.
negative_text = " ".join(df[df.sentiment=='Negative'].text)
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1600, height=800).generate(negative_text)
plt.imshow(wc, interpolation='bilinear')

# Train and test split

In [None]:
# Fraction of rows used for training; the remainder becomes the test split.
TRAIN_SIZE = 0.8
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably a vocabulary cap; confirm or remove.
MAX_NB_WORDS = 100000
# Length every tokenized tweet is padded/truncated to (the maxlen given to pad_sequences).
MAX_SEQUENCE_LENGTH = 30

In [None]:
# Split the frame into train and test parts; random_state fixes the shuffle
# so the split is the same on every run.
train_data, test_data = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=7)
print('Train data size : ', len(train_data))
print('Test data size : ', len(test_data))

In [None]:
# Preview the first rows of the training split.
train_data.head()

# Tokenization

'tokenizer' creates a token for every word in the data corpus and maps each one to an index using a dictionary.  

'word_index' contains the index for each word.  

'vocab_size' represents the total number of distinct words in the data corpus (the code adds one to the size of 'word_index').

In [None]:
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.text)
# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index
# One more than the number of indexed words; used later as the row count of
# the embedding matrix, which is addressed directly by word index.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size : ', vocab_size)

In [None]:
# Show the first ten (word, index) pairs of the tokenizer vocabulary.
for k, v in list(word_index.items())[:10]:
    print(k, v)

Now we have a 'tokenizer' object, which can be used to convert any word into its key (a number) in the dictionary.

Since we are going to build a sequence model, we should feed it sequences of numbers. We should also ensure there is no variance in the input shapes of the sequences: they should all be of the same length. But the texts in tweets have different numbers of words. To handle this, we use 'pad_sequences', which brings every sequence to one constant length, 'MAX_SEQUENCE_LENGTH'.

In [None]:
# Integer sequences of the first ten training texts (before padding).
tokenizer.texts_to_sequences(train_data.text)[:10]

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Convert each text to its integer sequence, then bring every sequence to
# MAX_SEQUENCE_LENGTH. The tokenizer was fitted on the training texts only and
# is reused unchanged for the test texts.
x_train = pad_sequences(tokenizer.texts_to_sequences(train_data.text),
                       maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data.text),
                       maxlen=MAX_SEQUENCE_LENGTH)

print('Training X shape : ', x_train.shape)
print('Test X shape : ', x_test.shape)

In [None]:
# First five padded training sequences.
x_train[:5]

In [None]:
# Distinct sentiment values present in the training split.
train_data.sentiment.unique()

In [None]:
# Same distinct values as a plain Python list.
labels = train_data.sentiment.unique().tolist()
labels

## Label encoding

In [None]:
# Fit the label encoder on the training labels only, then apply the same
# mapping to both splits.
encoder = LabelEncoder()
encoder.fit(train_data.sentiment.to_list())

y_train = encoder.transform(train_data.sentiment.to_list())
y_test = encoder.transform(test_data.sentiment.to_list())

# Reshape to a single column: one label per row.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print('y_train shape : ', y_train.shape)
print('y_test shape : ', y_test.shape)

In [None]:
# First five encoded training labels.
y_train[:5]

# Word embedding

Using GloVe for embedding.

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

In [None]:
# Path of the GloVe vectors; only the commented-out GloVe cells refer to it.
GLOVE_EMB = '/kaggle/working/glove.6B.300d.txt'
# Width of each embedding vector (column count of the embedding matrix).
EMBEDDING_DIM = 300
# Learning rate handed to the Adam optimizer.
LR = 1e-3
# Batch size and epoch count handed to model.fit.
BATCH_SIZE = 1024
EPOCHS = 10
# NOTE(review): not referenced by any visible cell, and the leading '...' is not
# a standard relative-path component — confirm the intended location.
MODEL_PATH = '.../output/kaggle/working/best_model.hdf5'

In [None]:
# embeddings_index = {}

# f = open(GLOVE_EMB)
# for line in f:
#     values = line.split()
#     word = value = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors' %len(embeddings_index))
# # for i, (k, v) in enumerate(embeddings_index.items()):
# #     if i == 5:
# #         break
# #     print(k, v)

In [None]:
# embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [None]:
# embedding_layer = tf.keras.layers.Embedding(vocab_size,
#                                            EMBEDDING_DIM,
#                                            weights=[embedding_matrix],
#                                            input_length=MAX_SEQUENCE_LENGTH,
#                                            trainable=False)

Using word2vec for embedding.

In [None]:
import gensim
from urllib.request import urlretrieve

In [None]:
import os

WORD2VEC_URL = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_PATH = "/kaggle/working/GoogleNews-vectors-negative300.bin.gz"

# Download the archive only when it is not already on disk, so re-running the
# notebook does not fetch it again. The download goes to a temporary name and
# is renamed once complete, so an interrupted transfer is never mistaken for
# a finished file on the next run.
if not os.path.exists(WORD2VEC_PATH):
    partial_path = WORD2VEC_PATH + ".part"
    urlretrieve(WORD2VEC_URL, filename=partial_path)
    os.replace(partial_path, WORD2VEC_PATH)

# Load the pretrained vectors from the binary word2vec file.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:
# Shape of the loaded word2vec vector table.
word2vec_model.vectors.shape

In [None]:
def get_vector(word):
    """Return the word2vec vector stored for ``word``, or None when the model has no entry for it."""
    return word2vec_model[word] if word in word2vec_model else None

In [None]:
# First ten rows of the word2vec vector table.
word2vec_model.vectors[:10]

In [None]:
# One row per vocabulary index. Rows stay all-zero for words that have no
# word2vec vector.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = get_vector(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Embedding layer initialised with the pretrained matrix; trainable=False keeps
# those weights fixed during training.
embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                           EMBEDDING_DIM,
                                           weights=[embedding_matrix],
                                           input_length=MAX_SEQUENCE_LENGTH,
                                           trainable=False)

# Model training - LSTM

In [None]:
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Input: one padded sequence of MAX_SEQUENCE_LENGTH integer word indices.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Look up the (frozen) pretrained embedding for every index.
embedding_sequences = embedding_layer(sequence_input)
x = SpatialDropout1D(0.2)(embedding_sequences)
# 1D convolution over the sequence, followed by a bidirectional LSTM.
x = Conv1D(64, 5, activation='relu')(x)
x = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(x)
# Dense classifier head with dropout between the two hidden layers.
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
# Single sigmoid unit: one score per input sequence.
outputs = Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(sequence_input, outputs)

In [None]:
from tensorflow.keras.utils import plot_model

# Render a diagram of the model's layers.
plot_model(model)

* ReduceLROnPlateau  
It changes the learning rate during training to achieve a better result. In this notebook, the callback monitors the validation loss and multiplies the learning rate by 0.1 when that loss stops improving.

* ModelCheckpoint  
It saves the best model during training based on a chosen metric, for example the model with the minimum validation loss. Note that the training cell below does not currently pass a ModelCheckpoint callback.

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(optimizer=Adam(learning_rate=LR),
             loss='binary_crossentropy',
             metrics=['accuracy'])

# min_lr is the floor the learning rate may be reduced to, so it must lie below
# the starting rate LR. The previous floor of 0.01 was above LR, which left the
# callback with nothing to reduce. The floor is now two factor-0.1 steps below LR.
# NOTE: the instance is deliberately bound to the name `ReduceLROnPlateau`
# (shadowing the class) because the training cell passes that name in `callbacks`.
ReduceLROnPlateau = ReduceLROnPlateau(factor=0.1,
                                     min_lr=LR * 1e-2,
                                     monitor='val_loss',
                                     verbose=1)

In [None]:
# Report whether TensorFlow can see a GPU. tf.test.is_gpu_available() is
# deprecated; listing the physical GPU devices is the supported check.
if tf.config.list_physical_devices('GPU'):
    print("Training on GPU...")
else:
    print("Training on CPU...")

In [None]:
# Train the model and keep the per-epoch metrics in `history`.
# NOTE(review): the test split doubles as validation data here, and its
# val_loss drives the learning-rate callback — confirm this is intended.
history = model.fit(x_train, y_train, 
                    batch_size=BATCH_SIZE, 
                    epochs=EPOCHS, 
                    validation_data=(x_test, y_test), 
                    callbacks=[ReduceLROnPlateau])

# Model evaluation

In [None]:
# Training vs. validation curves per epoch: accuracy on top, loss below.
# (The leftover debug prints of the figure/axes objects were removed.)
fig, (ax_acc, ax_loss) = plt.subplots(2, 1)

ax_acc.plot(history.history['accuracy'], c='b')
ax_acc.plot(history.history['val_accuracy'], c='r')
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['LSTM_train', 'LSTM_val'], loc='upper left')

ax_loss.plot(history.history['loss'], c='m')
ax_loss.plot(history.history['val_loss'], c='c')
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['train', 'val'], loc='upper left')

# Keep the top panel's x-label from colliding with the bottom panel's title.
fig.tight_layout()

In [None]:
def decode_sentiment(score):
    """Map a model score to a class name: 'Positive' strictly above 0.5, otherwise 'Negative'."""
    if score > 0.5:
        return 'Positive'
    return 'Negative'

# Score every test sequence, then turn each score into a class name.
scores = model.predict(x_test, verbose=1, batch_size=10000)
y_pred_1d = [decode_sentiment(score) for score in scores]

In [None]:
# First ten predicted class names.
y_pred_1d[:10]

In [None]:
from sklearn.metrics import classification_report

# Compare the true class names of the test split with the predicted ones.
print(classification_report(list(test_data.sentiment), y_pred_1d))