In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Embedding, Dense, GlobalMaxPool1D
from tensorflow.keras.optimizers import RMSprop
from keras.layers import LeakyReLU
import tensorflow as tf

In [2]:
# Load the tweet data from a CSV file located in the working directory.
dataset = pd.read_csv('Tweets.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
dataset.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
# Drop every row that contains a missing value. The explicit copy gives
# `dataset` its own data, so the column added in the label-encoding cell is not
# reported as a write to a slice of the original frame
# (SettingWithCopyWarning on pandas versions that track copies).
dataset = dataset.dropna().copy()

In [5]:
# Preview the frame again after removing incomplete rows.
dataset.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
# Encode each sentiment string as an integer id, numbered by order of first appearance.
dataset['label_id'] = pd.factorize(dataset['sentiment'])[0]

# One row per distinct (sentiment, label_id) pair, ordered by id.
cat_id = (
    dataset[['sentiment', 'label_id']]
    .drop_duplicates()
    .sort_values('label_id')
)

# Lookup tables in both directions, built from the same pairs.
cat_to_id = {sentiment: label_id for sentiment, label_id in cat_id.values}
id_to_cat = {label_id: sentiment for sentiment, label_id in cat_id.values}

# Display the id -> sentiment mapping.
id_to_cat

{0: 'neutral', 1: 'negative', 2: 'positive'}

In [7]:
# Check that the new label_id column lines up with the sentiment column.
dataset.head()

Unnamed: 0,textID,text,selected_text,sentiment,label_id
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,1
2,088c60f138,my boss is bullying me...,bullying me,negative,1
3,9642c003ef,what interview! leave me alone,leave me alone,negative,1
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,1


In [8]:
# (rows, columns) of the cleaned frame, including the added label_id column.
dataset.shape

(27480, 5)

In [9]:
# Targets: the integer class ids created in the label-encoding cell.
label = dataset['label_id'].values
# Inputs: the raw strings fed to the tokenizer.
# NOTE(review): this reads `selected_text`, not the full `text` column. In the
# preview rows, selected_text is a sub-span of text (e.g. "Sooo SAD"); if that
# span was chosen with knowledge of the sentiment, training on it leaks label
# information - confirm this column choice is intended.
text = dataset['selected_text'].values

In [10]:
# Sanity check: the label array should contain only the integer class ids.
print(label)

[0 1 1 ... 2 2 0]


In [11]:
# Sanity check: the input array should contain the raw tweet strings.
print(text)

['I`d have responded, if I were going' 'Sooo SAD' 'bullying me' ...
 'Yay good for both of you.' 'But it was worth it  ****.'
 'All this flirting going on - The ATG smiles. Yay.  ((hugs)']


In [12]:
# Vocabulary cap handed to the tokenizer as num_words; the model cell reuses it
# as the Embedding input_dim.
maxfeatures = 10000
tokenizer = Tokenizer(num_words = maxfeatures)
# Fit the tokenizer on the whole corpus.
# NOTE(review): this runs before the train/test split, so the vocabulary is
# built from test-set strings as well - confirm that is acceptable.
tokenizer.fit_on_texts(text)

In [13]:
# Longest input measured in whitespace-separated words; used downstream as the
# padded sequence length and as the Embedding input_length.
# NOTE(review): the count comes from str.split, which may differ from the
# tokenizer's own token count - confirm no sequence is truncated by padding.
maxseqlen = max(len(tweet.split()) for tweet in text)
print(maxseqlen)

33


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# `keras.utils.np_utils` has been removed from recent Keras releases; the
# tensorflow.keras.utils path is the supported location and keeps every Keras
# import in this cell on the same (tensorflow.keras) package.
from tensorflow.keras.utils import to_categorical

# Convert each string to a list of integer word ids.
X = tokenizer.texts_to_sequences(text)
# Pad every sequence to the same length; `maxlen` is passed by keyword so the
# intent does not depend on positional argument order.
X = pad_sequences(X, maxlen = maxseqlen)
# One-hot encode the three sentiment class ids.
y = to_categorical(label, num_classes = 3)

print("Shape of Independent variable ['text']:", X.shape)
print("Shape of Dependent variable ['label']:", y.shape)

Shape of Independent variable ['text']: (27480, 33)
Shape of Dependent variable ['label']: (27480, 3)


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split identical across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Report the (features, labels) shapes of each partition.
print(Xtrain.shape, ytrain.shape)
print(Xtest.shape, ytest.shape)

(21984, 33) (21984, 3)
(5496, 33) (5496, 3)


In [31]:
# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling start from the same state
# on every Restart & Run All (as far as the backend is deterministic).
np.random.seed(42)
tf.random.set_seed(42)

# Embedding -> LSTM over the whole sequence -> max over time steps.
model = Sequential()
model.add(Embedding(input_dim = maxfeatures, output_dim = 128, input_length = maxseqlen))
model.add(LSTM(512, return_sequences = True))
model.add(GlobalMaxPool1D())

# Funnel of fully connected stages, each Dense -> LeakyReLU -> Dropout; the
# widths halve from 256 down to 16.
for units in (256, 128, 64, 32, 16):
    model.add(Dense(units))
    model.add(LeakyReLU(alpha = 0.3))
    model.add(Dropout(0.1))

# Three-way softmax, one output per sentiment class.
model.add(Dense(3, activation='softmax'))

# categorical_crossentropy matches the one-hot encoded targets.
opt = RMSprop(learning_rate = 0.0012, rho = 0.7, momentum = 0.5)
model.compile(optimizer = opt, loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 33, 128)           1280000   
                                                                 
 lstm_1 (LSTM)               (None, 33, 512)           1312768   
                                                                 
 global_max_pooling1d_1 (Glo  (None, 512)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 256)               131328    
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 256)               0         
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                      

In [32]:
# from tensorflow.keras.callbacks import EarlyStopping

# #declare checkpoint variable and early stopping to get best model
# early_stop = EarlyStopping(monitor = 'val_accuracy', patience = 3)

In [33]:
# TensorBoard callback configuration, spelled out option by option: logs go to
# the local 'logs' directory and are updated once per epoch.
tensorboard_options = dict(
    log_dir = 'logs',
    histogram_freq = 0,
    write_graph = True,
    write_images = False,
    update_freq = 'epoch',
    profile_batch = 2,
    embeddings_freq = 0,
    embeddings_metadata = None,
)
tensorboard = tf.keras.callbacks.TensorBoard(**tensorboard_options)

In [34]:
# Train for 15 epochs in batches of 100, with 10% of the training data held
# out for validation. `callbacks` is documented as a list of callback
# instances, so the TensorBoard callback is wrapped in a list rather than
# passed bare.
history = model.fit(Xtrain, ytrain,
                    batch_size = 100, epochs = 15, shuffle = True,
                    validation_split = 0.1, verbose = 1,
                    callbacks = [tensorboard])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [36]:
# Register the TensorBoard notebook extension (IPython line magic).
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [37]:
# Open the TensorBoard UI on the 'logs' directory written by the training callback.
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 3184), started 0:29:17 ago. (Use '!kill 3184' to kill it.)

In [38]:
from sklearn.metrics import accuracy_score

# Build the evaluation function
def evaluation(model, X, Y):
    """Return the accuracy of ``model`` on inputs ``X`` against targets ``Y``.

    Both the predictions and ``Y`` are reduced to class ids with an argmax
    along axis 1 before being compared.

    Side effect: the module-level names ``Y_pred`` (raw ``model.predict``
    output) and ``Y_act`` (class ids derived from ``Y``) are overwritten so
    that later cells can reuse them.
    """
    global Y_pred, Y_act

    Y_pred = model.predict(X)
    predicted_ids = np.argmax(Y_pred, axis=1)

    Y_act = np.argmax(Y, axis=1)

    return accuracy_score(Y_act, predicted_ids)

In [39]:
# Score the trained model on the held-out test split. This call also refreshes
# the globals Y_pred and Y_act that the report cell reads.
accuracy = evaluation(model, Xtest, ytest)
# evaluation() returns a fraction, so scale it to a percentage for display.
print('accuracy: %.3f' % (accuracy * 100), '%')

accuracy: 83.151 %


In [40]:
from sklearn.metrics import confusion_matrix, classification_report

# Short class names ordered by label id (0 = neutral, 1 = negative, 2 = positive).
target = ['neu', 'neg', 'pos']

# Reduce the predicted class scores to class ids once and reuse them for both reports.
Y_pred_class = np.argmax(Y_pred, axis = 1)

print(confusion_matrix(Y_act, Y_pred_class))
print(classification_report(Y_act, Y_pred_class, target_names = target))

[[1946  171  119]
 [ 274 1238   60]
 [ 214   88 1386]]
              precision    recall  f1-score   support

         neu       0.80      0.87      0.83      2236
         neg       0.83      0.79      0.81      1572
         pos       0.89      0.82      0.85      1688

    accuracy                           0.83      5496
   macro avg       0.84      0.83      0.83      5496
weighted avg       0.83      0.83      0.83      5496

