# Sentiment Analysis With Bi-LSTM - Word2Vec

## Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from gensim.models import Word2Vec



nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing

### Read Train and Test Dataset

In [2]:
cols = ["no" , "name" , "result" , "review"]
train = pd.read_csv("twitter_training.csv" , names = cols)
test = pd.read_csv("twitter_validation.csv" , names = cols)

### Concat Train and Test Dataset

In [3]:
dataset = pd.concat([train,test])
dataset

Unnamed: 0,no,name,result,review
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


### Remove Column "No"

In [4]:
dataset_new = dataset.drop(["no", "name"] , axis = 1)
dataset_new

Unnamed: 0,result,review
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
995,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,Positive,Today sucked so it’s time to drink wine n play...
998,Positive,Bought a fraction of Microsoft today. Small wins.


### Remove Class "Irrelevant" (I just use Positive, Neutral, Negative)

In [5]:
def remove_irrelevant(dataset):
    condition = (dataset["result"] == "Irrelevant")
    dataset = dataset[~condition]
    return dataset

In [6]:
dataset_clean = remove_irrelevant(dataset_new)
dataset_clean

Unnamed: 0,result,review
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
993,Negative,Please explain how this is possible! How can t...
994,Positive,Good on Sony. As much as I want to see the new...
997,Positive,Today sucked so it’s time to drink wine n play...
998,Positive,Bought a fraction of Microsoft today. Small wins.


### Delete NaN

In [7]:
dataset_clean = dataset_clean.dropna(subset=['review'])

### Reset Index

In [8]:
dataset_clean = dataset_clean.reset_index()
dataset_clean = dataset_clean.drop(["index"] , axis = 1)
dataset_clean

Unnamed: 0,result,review
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
61944,Negative,Please explain how this is possible! How can t...
61945,Positive,Good on Sony. As much as I want to see the new...
61946,Positive,Today sucked so it’s time to drink wine n play...
61947,Positive,Bought a fraction of Microsoft today. Small wins.


### Change Class Positive:1, Neutral:0, Negative:2

In [9]:
dataset_clean["result"] = dataset_clean["result"].map({"Positive":1,"Neutral":0,"Negative":2})
dataset_clean

Unnamed: 0,result,review
0,1,im getting on borderlands and i will murder yo...
1,1,I am coming to the borders and I will kill you...
2,1,im getting on borderlands and i will kill you ...
3,1,im coming on borderlands and i will murder you...
4,1,im getting on borderlands 2 and i will murder ...
...,...,...
61944,2,Please explain how this is possible! How can t...
61945,1,Good on Sony. As much as I want to see the new...
61946,1,Today sucked so it’s time to drink wine n play...
61947,1,Bought a fraction of Microsoft today. Small wins.


### Case Folding, Remove of Punctuation, Tokenization, Stopwords

In [10]:
def preprocessing(dataset):
  dataset['review'] = dataset['review'].str.lower() #CaseFolding
  dataset['review'] = dataset['review'].str.replace(r'[^\w\s]+', '') # Penghapusan Tanda Baca
  dataset["review"] = dataset["review"].apply(lambda text : nltk.word_tokenize(text))
  return dataset

In [11]:
dataset_clean = preprocessing(dataset_clean)

  dataset['review'] = dataset['review'].str.replace(r'[^\w\s]+', '') # Penghapusan Tanda Baca


In [12]:
dataset_clean

Unnamed: 0,result,review
0,1,"[im, getting, on, borderlands, and, i, will, m..."
1,1,"[i, am, coming, to, the, borders, and, i, will..."
2,1,"[im, getting, on, borderlands, and, i, will, k..."
3,1,"[im, coming, on, borderlands, and, i, will, mu..."
4,1,"[im, getting, on, borderlands, 2, and, i, will..."
...,...,...
61944,2,"[please, explain, how, this, is, possible, how..."
61945,1,"[good, on, sony, as, much, as, i, want, to, se..."
61946,1,"[today, sucked, so, its, time, to, drink, wine..."
61947,1,"[bought, a, fraction, of, microsoft, today, sm..."


### Split Dataset

In [13]:
x_train, x_test, y_train, y_test = train_test_split(dataset_clean.review, dataset_clean.result, test_size=0.25, shuffle=True)

## Prepadding and Word2Vec

In [14]:
w2v = Word2Vec([sentence for sentence in dataset_clean.review.values])

maxlen = 100
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(dataset_clean.review)

word_index = tokenizer.word_index

x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

In [15]:
x_train = pad_sequences(x_train_sequences,
                                maxlen=100,
                                padding='pre')

x_test = pad_sequences(x_test_sequences,
                             maxlen=100,
                             padding='pre')

In [16]:
embedding_matrix = np.zeros((len(word_index)+1, 100))
for word,i in word_index.items():
    try:
        embedding_matrix[i] = w2v.wv[i]
    except:
        embedding_matrix[i] = np.zeros(100)

## Model Bi-LSTM

In [27]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(embedding_matrix), 100, input_length=100, weights=[embedding_matrix]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])


In [28]:
model.compile(optimizer='adam', metrics=['acc'], loss='sparse_categorical_crossentropy')
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc661d942b0>