<a href="https://colab.research.google.com/github/ThomasWit21/Master-thesis/blob/main/Recurrent_Neural_Network_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Code inspired by Ripamonti (2018):
https://www.kaggle.com/paoloripamonti/twitter-sentiment-analysis

In [None]:
# Lookup table from the dataset's integer sentiment codes to label names.
decode_map = {
    0: "NEGATIVE",
    2: "NEUTRAL",
    4: "POSITIVE",
}


def decode_sentiment(label):
    """Translate a numeric sentiment code into its label name.

    ``label`` is coerced with ``int()`` before the lookup, so numeric strings
    and floats are accepted. A code that is not a key of ``decode_map``
    raises ``KeyError``.
    """
    code = int(label)
    return decode_map[code]

In [None]:
##Setting parameters
# TEXT CLEANING
# Raw string: `\S` is a regex escape, not a valid Python string escape, so a
# non-raw literal triggers an invalid-escape warning on newer Python versions.
# The pattern itself is unchanged. Alternatives, in order:
#   @\S+           "@" followed by non-whitespace characters
#   https?:\S+     "http:" / "https:" followed by non-whitespace characters
#   http?:\S       "htt:" / "http:" followed by one non-whitespace character
#   [^A-Za-z0-9]+  any run of characters that are not ASCII letters or digits
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300      # embedding dimensionality (Word2Vec size and Embedding width)
W2V_WINDOW = 7      # passed as Word2Vec `window`
W2V_EPOCH = 32      # epochs used when training Word2Vec
W2V_MIN_COUNT = 10  # passed as Word2Vec `min_count`

# KERAS
SEQUENCE_LENGTH = 300  # `maxlen` for pad_sequences and Embedding `input_length`
EPOCHS = 8             # epochs for model.fit
BATCH_SIZE = 1024      # batch size for model.fit

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# NOTE(review): not referenced anywhere in the visible cells; presumably the
# (lower, upper) score cut-offs for a NEUTRAL band at prediction time - confirm.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

In [None]:
#Loading in the training data
# pandas is not imported in any other visible cell, so import it here to keep
# the notebook runnable from a fresh kernel.
import pandas as pd

# The CSV has no header row: the previous version let pandas use the first
# tweet as column names (and then dropped/renamed those "names"), which
# silently discarded that first record. Read it header-less instead.
# NOTE(review): assumes the six-column order
# (score, id, date, query, user, text) implied by the old drop/rename - confirm.
RAW_COLUMNS = ["Sentiment Score", "Id", "Date", "Query", "User", "Text"]
Vadercheck = pd.read_csv(
    "/content/drive/My Drive/VADERcheck.csv",
    encoding="ISO-8859-1",
    header=None,
    names=RAW_COLUMNS,
    usecols=["Sentiment Score", "Text"],  # only these two columns were kept before
)

# Fixed seed so the 500k-row subsample (and everything trained on it) is
# reproducible across kernel restarts.
Samplecheck = Vadercheck.sample(n=500000, random_state=42)
# Replace the integer codes with their label names.
Samplecheck['Sentiment Score'] = Samplecheck['Sentiment Score'].apply(decode_sentiment)

In [None]:
#Preprocessing
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')

stop_words = stopwords.words("english")
# Frozen copy used for membership tests: constant-time lookup per token instead
# of scanning the list. Built once, so later edits to `stop_words` require
# re-running this cell.
_stop_word_set = frozenset(stop_words)
stemmer = SnowballStemmer("english")

# NOTE(review): `preprocess` is defined here but never called in the visible
# cells; the Word2Vec input and the Keras Tokenizer both consume the raw
# `Text` column - confirm whether that is intended.
def preprocess(text, stem=False):
    """Clean a piece of text and drop English stop words.

    Parameters
    ----------
    text : any
        Converted with ``str()`` and lower-cased before cleaning.
    stem : bool, default False
        When true, each kept token is passed through the Snowball stemmer.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (empty string if none).
    """
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (token for token in text.split() if token not in _stop_word_set)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

In [None]:
#Train, Test, Split
from sklearn.model_selection import train_test_split

# No test_size is given, so scikit-learn's default proportion applies; the
# fixed random_state makes the partition of Samplecheck repeatable.
df_train, df_test = train_test_split(Samplecheck, random_state=42)

# Report how many rows ended up in each split.
for caption, frame in (("TRAIN size:", df_train), ("TEST size:", df_test)):
    print(caption, len(frame))

In [None]:
%%time
files = [_text.split() for _text in df_train.Text] 

In [None]:
#Word2Vec and building the vocabulary
import gensim

# gensim 4 renamed the `size` argument to `vector_size` and replaced
# `wv.vocab` with `wv.key_to_index`; pick the spelling that matches the
# installed version so the cell runs on both 3.x and 4.x.
_GENSIM_4_PLUS = int(gensim.__version__.split(".")[0]) >= 4
_dimension_kwarg = {("vector_size" if _GENSIM_4_PLUS else "size"): W2V_SIZE}

w2v_model = gensim.models.word2vec.Word2Vec(window=W2V_WINDOW,
                                            min_count=W2V_MIN_COUNT,
                                            workers=8,
                                            **_dimension_kwarg)

# Scan the tokenised training tweets once to build the vocabulary.
w2v_model.build_vocab(files)

if _GENSIM_4_PLUS:
    words = w2v_model.wv.key_to_index.keys()
else:
    words = w2v_model.wv.vocab.keys()
# NOTE: `vocab_size` is reassigned later from the Keras tokenizer's word index.
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
#Training the Word2Vec model
%%time
w2v_model.train(files, total_examples=len(files), epochs=W2V_EPOCH)

In [None]:
#Tokenizing and padding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Learn the word index from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.Text)

# One more than the number of indexed words, so the largest word index is a
# valid row in a zero-based embedding table.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)


def _to_padded_ids(texts):
    """Convert texts to integer sequences padded/truncated to SEQUENCE_LENGTH."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=SEQUENCE_LENGTH)


x_train = _to_padded_ids(df_train.Text)
x_test = _to_padded_ids(df_test.Text)

In [None]:
#Making labels
# Distinct labels found in the training split, with NEUTRAL appended at the end.
labels = [*df_train['Sentiment Score'].unique().tolist(), NEUTRAL]
labels

In [None]:
from sklearn.preprocessing import LabelEncoder

train_labels = df_train['Sentiment Score'].tolist()
test_labels = df_test['Sentiment Score'].tolist()

# The encoder learns its classes from the training labels only.
encoder = LabelEncoder()
encoder.fit(train_labels)

# Encode both splits and reshape each result into a single-column 2-D array.
y_train = encoder.transform(train_labels).reshape(-1, 1)
y_test = encoder.transform(test_labels).reshape(-1, 1)


for caption, target in (("y_train", y_train), ("y_test", y_test)):
    print(caption, target.shape)


In [None]:
#Building the recurrent neural network model
import numpy as np
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM

# Embedding table: one row per tokenizer index, filled with the Word2Vec vector
# when the word is known to the Word2Vec model; all other rows stay zero.
word_vectors = w2v_model.wv
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for token, row in tokenizer.word_index.items():
    if token not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[token]

# Embedding layer initialised from the table above and frozen (trainable=False).
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

# Embedding -> dropout -> LSTM -> single sigmoid output unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# NOTE(review): one sigmoid unit with binary cross-entropy presumes exactly two
# classes in y_train - confirm against the encoded labels.
model.compile(
    loss='binary_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)

In [None]:
#Training the model
%%time
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=1)

In [None]:
#Evaluating the model
# With one compiled metric, evaluate() returns [loss, accuracy].
score = model.evaluate(x_test, y_test)
test_loss, test_accuracy = score[0], score[1]

print()
for caption, value in (("ACCURACY:", test_accuracy), ("LOSS:", test_loss)):
    print(caption, value)