In [5]:
# Mount Google Drive inside the Colab runtime so the project folder under
# /content/drive can be read like a local directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Work from the project folder on Drive so the relative file names used below resolve.
%cd /content/drive/MyDrive/DSBA6162_Project

/content/drive/MyDrive/DSBA6162_Project


In [7]:
import numpy as np
import pandas as pd

# Explore dataset

In [8]:
# Load both splits; the validation file is referred to as `test` for the rest of the notebook.
train = pd.read_csv(filepath_or_buffer="twitter_training.csv")
test = pd.read_csv(filepath_or_buffer="twitter_validation.csv")

In [9]:
# A notebook cell only renders its last expression, so the two shapes have to
# be printed explicitly or they are silently discarded.
print("train shape:", train.shape)
print("test shape:", test.shape)
# Column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   entity     74682 non-null  object
 2   sentiment  74682 non-null  object
 3   content    73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [10]:
# Preview the first 10 training rows.
train.head(10)

Unnamed: 0,id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...


In [11]:
# Preview the first 10 validation rows.
test.head(10)

Unnamed: 0,id,entity,sentiment,content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
5,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
6,7925,MaddenNFL,Positive,Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...
7,11332,TomClancysRainbowSix,Positive,"Rocket League, Sea of Thieves or Rainbow Six: ..."
8,1107,AssassinsCreed,Positive,my ass still knee-deep in Assassins Creed Odys...
9,2069,CallOfDuty,Negative,FIX IT JESUS ! Please FIX IT ! What In the wor...


In [12]:
# Rows that share an `id` look like near-duplicate rewordings of one tweet (see
# the preview above), so count how many distinct ids the training set holds.
print("Number of unique values in the 'id' column training set:", train['id'].nunique())

Number of unique values in the 'id' column training set: 12447


In [13]:
# Same check on the validation set: here every id is expected to appear only once.
print("Number of unique values in the 'id' column testing set:", test['id'].nunique())

Number of unique values in the 'id' column testing set: 1000


In [14]:
# Class balance of the sentiment labels in the training set.
train['sentiment'].value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: sentiment, dtype: int64

In [15]:
# Class balance of the sentiment labels in the validation set.
test['sentiment'].value_counts()

Neutral       285
Positive      277
Negative      266
Irrelevant    172
Name: sentiment, dtype: int64

In [16]:
# Discard every row that has a missing value in any column.
train = train.dropna()
test = test.dropna()

In [17]:
# Validation shape after dropna: it had no missing values, so no rows were removed.
test.shape

(1000, 4)

In [18]:
# Training shape after dropna: 686 rows (74682 -> 73996) had missing content and were dropped.
train.shape

(73996, 4)

In [19]:
# Reduce the task to binary sentiment by dropping Neutral and Irrelevant rows.
# `.copy()` makes the results independent frames rather than slices of the
# originals, so adding columns to them later does not raise
# SettingWithCopyWarning.
excluded_sentiments = ['Neutral', 'Irrelevant']
test = test[~test['sentiment'].isin(excluded_sentiments)].copy()
train = train[~train['sentiment'].isin(excluded_sentiments)].copy()

In [20]:
# Confirm that only Negative/Positive remain in the training set.
train['sentiment'].value_counts()

Negative    22358
Positive    20655
Name: sentiment, dtype: int64

In [21]:
# Confirm that only Negative/Positive remain in the validation set.
test['sentiment'].value_counts()

Positive    277
Negative    266
Name: sentiment, dtype: int64

# Preprocess

In [22]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [23]:
# Stack the tweet text of both splits into one Series for vocabulary building.
# NOTE(review): this includes validation text, so the vocabulary is not built
# from training data alone — confirm that is intended.
all_text = pd.concat(objs=[train['content'], test['content']])

In [24]:
# Build the word -> integer index vocabulary (`tokenizer.word_index`) from the
# combined tweet text; the same tokenizer is reused to encode both splits below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)

In [25]:
print("Number of unique words in the tokenizer dictionary =", len(tokenizer.word_index))
# The dictionary has roughly 21k entries; printing all of it floods the
# notebook output, so show only a small preview.
print("First 10 tokenizer dictionary entries =", list(tokenizer.word_index.items())[:10])

Number of unique words in the tokenizer dictionary = 20979


In [22]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2023-12-06 15:12:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-12-06 15:12:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-12-06 15:12:51--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.2’

gl

In [26]:
def embedding_for_vocab(filepath, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Every non-empty line of the file is expected to hold a token followed by
    the whitespace-separated components of its vector.

    Args:
        filepath: Path to the UTF-8 encoded vectors file.
        word_index: Mapping of word -> integer row index in the result.
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(word_index) + 1, embedding_dim)``.
        Rows whose word never appears in the file are left as zeros.

    Raises:
        ValueError: If a vocabulary word's vector has fewer than
            ``embedding_dim`` components or contains a non-numeric value.
    """
    # Adding 1 because of the reserved 0 index
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue

            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue

            if len(vector) < embedding_dim:
                raise ValueError(
                    f"{filepath}:{line_number}: vector for {word!r} has "
                    f"{len(vector)} components, expected at least {embedding_dim}"
                )

            # Only the components that are kept need to be converted.
            embedding_matrix_vocab[word_index[word]] = np.asarray(
                vector[:embedding_dim], dtype=np.float32
            )

    return embedding_matrix_vocab

In [27]:
# Width of the word vectors; matches the 50-dimensional GloVe file read below.
embedding_dim = 50
embedding_matrix_vocab = embedding_for_vocab(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

# Row 1 belongs to the word with tokenizer index 1.
print("Dense vector for first word is => ", embedding_matrix_vocab[1])


Dense vector for first word is =>  [ 4.18000013e-01  2.49679998e-01 -4.12420005e-01  1.21699996e-01
  3.45270008e-01 -4.44569997e-02 -4.96879995e-01 -1.78619996e-01
 -6.60229998e-04 -6.56599998e-01  2.78430015e-01 -1.47670001e-01
 -5.56770027e-01  1.46579996e-01 -9.50950012e-03  1.16579998e-02
  1.02040000e-01 -1.27920002e-01 -8.44299972e-01 -1.21809997e-01
 -1.68009996e-02 -3.32789987e-01 -1.55200005e-01 -2.31309995e-01
 -1.91809997e-01 -1.88230002e+00 -7.67459989e-01  9.90509987e-02
 -4.21249986e-01 -1.95260003e-01  4.00710011e+00 -1.85939997e-01
 -5.22870004e-01 -3.16810012e-01  5.92130003e-04  7.44489999e-03
  1.77780002e-01 -1.58969998e-01  1.20409997e-02 -5.42230010e-02
 -2.98709989e-01 -1.57490000e-01 -3.47579986e-01 -4.56370004e-02
 -4.42510009e-01  1.87849998e-01  2.78489990e-03 -1.84110001e-01
 -1.15139998e-01 -7.85809994e-01]


In [28]:
# Expect (vocabulary size + 1, embedding_dim).
embedding_matrix_vocab.shape

(20980, 50)

In [29]:
# embedding_dim = 50
# embedding_matrix_vocab = embedding_for_vocab('glove.6B.300d.txt', tokenizer.word_index, embedding_dim)

# print("Dense vector for first word is => ", embedding_matrix_vocab[1])


In [30]:
# Integer class ids for the two remaining sentiment labels.
sentiment_mapping = {'Negative': 0, 'Positive': 1}

# `assign` returns new frames, so this works without SettingWithCopyWarning even
# though `train`/`test` were produced by boolean filtering.
train = train.assign(sentiment_encoded=train['sentiment'].map(sentiment_mapping))
test = test.assign(sentiment_encoded=test['sentiment'].map(sentiment_mapping))

# `.map` turns any label missing from the mapping into NaN; fail here rather
# than feeding NaN targets to the models.
assert train['sentiment_encoded'].notna().all(), "unmapped sentiment label in train"
assert test['sentiment_encoded'].notna().all(), "unmapped sentiment label in test"

In [31]:
# Fixed number of token ids per tweet fed to the models.
max_sequence_length = 50
# Replace each training tweet with its list of vocabulary indices ...
sequences_train = tokenizer.texts_to_sequences(train['content'])
# ... and bring every list to exactly `max_sequence_length` entries.
x_train_padded = pad_sequences(sequences_train, maxlen=max_sequence_length)

In [32]:
# Encode the validation tweets with the same tokenizer and sequence length as the training set.
sequences_test = tokenizer.texts_to_sequences(test['content'])
x_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length)

# Models

## LSTM

In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [53]:
# Empty layer stack; the following cells add the LSTM classifier's layers one by one.
model1 = Sequential()

In [54]:
from tensorflow.keras.initializers import Constant

# Start the embedding table from the GloVe matrix built earlier; without this the
# downloaded vectors are never used and the layer is randomly initialised.
# `trainable=False` keeps the pre-trained vectors fixed during training; set it
# to True to fine-tune them on this dataset instead.
model1.add(
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=embedding_dim,
        embeddings_initializer=Constant(embedding_matrix_vocab),
        input_length=max_sequence_length,
        trainable=False,
    )
)

In [55]:
# First recurrent layer. No `input_shape` here: this is not the first layer of
# the stack, so its input shape comes from the Embedding layer before it. The
# removed value (sequence_length, 1) did not match that layer's
# (sequence_length, embedding_dim) output anyway.
model1.add(LSTM(units = 50, return_sequences = True))
model1.add(Dropout(0.2))

In [56]:
# Second recurrent layer; it still returns the full sequence for the next LSTM.
model1.add(LSTM(50, return_sequences=True))
model1.add(Dropout(rate=0.2))

In [57]:
# Last recurrent layer; it returns only its final output, one vector per tweet.
model1.add(LSTM(50))
model1.add(Dropout(rate=0.2))

In [58]:
# Single-unit output squashed to a probability of the positive class (label 1).
# The sigmoid is required because the model is compiled with
# 'binary_crossentropy', which expects probabilities rather than unbounded
# linear outputs.
model1.add(Dense(units = 1, activation = 'sigmoid'))

In [59]:
# Binary classification set-up: cross-entropy loss, Adam optimizer, accuracy metric.
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [60]:
# Train for 5 epochs; the validation file (`test`) is evaluated after every epoch.
model1.fit(
    x=x_train_padded,
    y=train['sentiment_encoded'],
    batch_size=32,
    epochs=5,
    validation_data=(x_test_padded, test['sentiment_encoded']),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ea0794ee470>

In [42]:
# Run the trained LSTM model on the padded validation tweets; `predictions`
# holds the output of the model's final layer for each tweet.
predictions = model1.predict(x_test_padded)



# **Transformer Model**

In [43]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [44]:
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the input/output feature dimension; also used as
            ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``), which is also
            what allows the layer to be rebuilt from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` is handed to both dropout layers. It defaults to ``None``
        so the layer can also be called without stating the mode explicitly.
        """
        # Self-attention: the inputs serve as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [45]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Size of each embedding vector.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``), which is also
            what allows the layer to be rebuilt from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position."""
        # Positions 0 .. seq_len-1, taken from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [46]:
# Hyperparameters of the Transformer classifier.
vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved index 0
# Reuse the padding length so the model's input width always matches
# x_train_padded / x_test_padded instead of repeating the literal 50.
maxlen = max_sequence_length
embed_dim = 50   # size of the token and position embeddings
num_heads = 2    # attention heads in the Transformer block
ff_dim = 50      # hidden width of the block's feed-forward network

In [47]:
# Functional-API graph: embeddings -> Transformer block -> average pooling -> small MLP head.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per tweet.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# One softmax unit per remaining sentiment class. Only Negative (0) and
# Positive (1) are left after filtering, so the previous hard-coded 3 units
# included a class that never occurs in the labels.
num_classes = len(sentiment_mapping)
outputs = layers.Dense(num_classes, activation="softmax")(x)

In [48]:
# Wrap the graph defined above (from `inputs` to `outputs`) into a trainable model.
model2 = keras.Model(inputs=inputs, outputs=outputs)

In [49]:
# Integer class ids are used as targets, hence the sparse categorical loss.
model2.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [51]:
# Train for 5 epochs; the validation file (`test`) is evaluated after every epoch.
history = model2.fit(
    x=x_train_padded,
    y=train['sentiment_encoded'],
    batch_size=32,
    epochs=5,
    validation_data=(x_test_padded, test['sentiment_encoded']),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
