<a href="https://colab.research.google.com/github/Shankar-Pankhania/Twitter_Sentiment_Analysis_on_Bitcoin/blob/main/Train_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Mount Google Drive inside the Colab VM at /content/drive/ and make the
# project folder the current working directory. The dataset cell further down
# opens 'bitcoin_tweets.csv' by relative path, so it is resolved against this
# folder.
from google.colab import drive 
drive.mount('/content/drive/')
%cd /content/drive/My Drive/Dissertation/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Dissertation


Import packages

In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
# Sanity check: list the GPU devices TensorFlow can see (shown as the cell
# output; an empty list means no GPU was detected).
# NOTE(review): the network below uses the CuDNNLSTM layer, which presumably
# requires a GPU runtime — confirm before running on a CPU-only machine.
tf.config.list_physical_devices('GPU')   #check GPU connected

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Read dataset

In [20]:
# Load the labelled tweets (resolved relative to the current working directory).
total_dataset = pd.read_csv('bitcoin_tweets.csv')

# Remove neutral tweets (sentiment == 0); only positive and negative sentiments
# are needed. The explicit .copy() makes the filtered frame independent of the
# frame it was sliced from, so the column assignment below does not raise
# pandas' SettingWithCopyWarning.
total_dataset = total_dataset[total_dataset.sentiment != 0].copy()

# Re-label negative sentiment from -1 to 0 for the analysis that follows.
total_dataset['sentiment'] = total_dataset['sentiment'].replace(-1, 0)

Tokenizer

In [21]:
# Vocabulary cap handed to the tokenizer as `num_words`.
# (The spelling of this identifier is left as-is because it is a notebook-level
# name that other cells may refer to.)
max_fatures = 2000
tweet_texts = total_dataset['processed_tweet'].values

# Build the word index from the pre-processed tweets.
tokenizer = Tokenizer(num_words=max_fatures)
tokenizer.fit_on_texts(tweet_texts)

# Encode every tweet as a sequence of integer word ids, then pad the sequences
# to a common length so they form a single 2-D array.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

LSTM Network

In [22]:
# Size of the learned embedding vector for each token.
embed_dim = 128

# Architecture: Embedding -> SpatialDropout1D -> Conv1D -> MaxPooling1D
#               -> CuDNNLSTM -> 2-unit softmax.
model = Sequential()
# The embedding's vocabulary size is taken from the tokenizer cap (max_fatures)
# instead of repeating the literal 2000, so changing the cap cannot leave the
# two values out of sync.
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(Conv1D(64, 5, activation='tanh'))
model.add(MaxPooling1D(pool_size=4))
model.add(CuDNNLSTM(128))
# Two output units, matching the two-column one-hot labels built in the
# splitting cell.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 116, 128)          256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 116, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 112, 64)           41024     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 28, 64)            0         
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 128)               99328     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 396,610
Trainable params: 396,610
Non-trainable params: 0
________________________________________________

Splitting

In [23]:
# One-hot encode the sentiment labels: one column per distinct label value.
Y = pd.get_dummies(total_dataset['sentiment']).values

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=69,
)

# Show the feature/label shapes of the training split, then of the test split.
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(57056, 116) (57056, 2)
(14264, 116) (14264, 2)


K-Fold Cross Validation for the LSTM Network

In [None]:
# Optional 10-fold cross-validation of the network defined above. The code is
# disabled (wrapped in a string literal) because this notebook evaluates with
# the train/test split instead. To try it, uncomment the code, comment out the
# train/test-split training and evaluation cells, and re-run the whole notebook.
#
# NOTE(review) — things to check before enabling this block:
#  - It reads X_train/X_test/Y_train/Y_test and merges them back together, so
#    the splitting cell still has to run first.
#  - X_train/X_test hold the integer word ids produced by the tokenizer cell.
#    Casting them to float and dividing by 255 looks like a leftover from an
#    image pipeline and would no longer give valid indices for the Embedding
#    layer — confirm and remove.
#  - The same `model` object is fitted in every fold without being rebuilt, so
#    each fold continues from weights already trained in the previous folds.

"""
num_folds = 10 
# Parse numbers as floats
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Normalize data
X_train = X_train / 255
X_test = X_test / 255

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []

# Merge inputs and targets
inputs = np.concatenate((X_train, X_test), axis=0)
targets = np.concatenate((Y_train, Y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

fold_no = 1
batch_size = 32
for train, test in kfold.split(inputs, targets):
  
  #fit data to model
  model.fit(inputs[train], targets[train], epochs = 4, batch_size=batch_size, verbose = 2)

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1


print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
"""

Train the Network



In [25]:
# Train the network on the training split for 10 epochs.
# The History object returned by fit() is kept so the per-epoch loss/accuracy
# can be inspected afterwards via `history.history`, instead of only echoing an
# object repr (with a run-specific memory address) as the cell output.
history = model.fit(X_train, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f164995e210>

Accuracy

In [26]:
# Number of rows taken from the end of the held-out data for the per-class
# check performed in the next cell.
validation_size = 2000

# Slice into new names instead of overwriting X_test / Y_test. Overwriting made
# this cell non-idempotent: every re-run trimmed another `validation_size` rows
# off the test set and silently changed the reported accuracy.
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_eval = X_test[:-validation_size]
Y_eval = Y_test[:-validation_size]

# Overall loss/accuracy on the held-out rows that are not in the validation slice.
score, acc = model.evaluate(X_eval, Y_eval)
print("Accuracy with LSTM:" + str(acc))

Accuracy with LSTM:0.9246575236320496


Measure number of correct guesses

In [None]:
# Count, per class, how many validation samples exist and how many of them the
# model classifies correctly. Class index 0 is the negative class, any other
# index is treated as positive (same convention as the label columns).
#
# The whole validation set is predicted in one batched call rather than one
# model.predict() call per sample, which was needlessly slow and flooded the
# cell output with per-call progress lines.
predicted_labels = np.argmax(model.predict(X_validate, verbose=0), axis=1)
true_labels = np.argmax(Y_validate, axis=1)

is_correct = predicted_labels == true_labels
is_negative = true_labels == 0

# Plain Python ints, so the counters behave exactly like the ones the
# sample-by-sample loop used to produce.
negative_count = int(np.sum(is_negative))
positive_count = int(np.sum(~is_negative))
negative_correct = int(np.sum(is_correct & is_negative))
positive_correct = int(np.sum(is_correct & ~is_negative))

In [28]:
# Report, for each class, the percentage of its validation samples that were
# predicted correctly (positive first, then negative).
for label, hits, total in (
    ("positive_accuracy", positive_correct, positive_count),
    ("negative_accuracy", negative_correct, negative_count),
):
    print(label, hits / total * 100, "%")

positive_accuracy 95.2127659574468 %
negative_accuracy 86.29032258064517 %
