In [1]:
import pandas as pd
import numpy as np

In [2]:
training_df = pd.read_csv(r"D:\Ashraf\NCI\Semester 2\DAPA\Dataset\twitter_training.csv")

testing_df = pd.read_csv(r"D:\Ashraf\NCI\Semester 2\DAPA\Dataset\twitter_validation.csv")

In [3]:
training_df.head(10)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
5,2402,Borderlands,Positive,So I spent a few hours making something for fu...
6,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
7,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
8,2402,Borderlands,Positive,So I spent a few hours making something for fu...
9,2402,Borderlands,Positive,2010 So I spent a few hours making something f...


In [4]:
column = ['ID','Game','Sentiment','Tweet']

training_df.columns = column
testing_df.columns = column

In [5]:
training_df.head(10)

Unnamed: 0,ID,Game,Sentiment,Tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
5,2402,Borderlands,Positive,So I spent a few hours making something for fu...
6,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
7,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
8,2402,Borderlands,Positive,So I spent a few hours making something for fu...
9,2402,Borderlands,Positive,2010 So I spent a few hours making something f...


In [6]:
training_df.shape

(74681, 4)

In [7]:
training_df.isnull().sum()

ID             0
Game           0
Sentiment      0
Tweet        686
dtype: int64

In [8]:
training_df.drop_duplicates(inplace=True)
training_df.dropna(inplace=True)

training_df.isnull().sum()

ID           0
Game         0
Sentiment    0
Tweet        0
dtype: int64

In [9]:
training_df.shape

(71655, 4)

In [10]:
testing_df

Unnamed: 0,ID,Game,Sentiment,Tweet
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [11]:
testing_df.drop_duplicates(inplace=True)
testing_df.dropna(inplace=True)

testing_df.isnull().sum()

ID           0
Game         0
Sentiment    0
Tweet        0
dtype: int64

In [12]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

training_df['Tweet'] = training_df['Tweet'].apply(preprocess_text)
testing_df['Tweet'] = testing_df['Tweet'].apply(preprocess_text)

training_df['Sentiment_label'] = pd.Categorical(training_df['Sentiment']).codes
testing_df['Sentiment_label'] = pd.Categorical(testing_df['Sentiment']).codes

In [13]:
training_df.head(10)

Unnamed: 0,ID,Game,Sentiment,Tweet,Sentiment_label
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...,3
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,3
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,3
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,3
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,3
5,2402,Borderlands,Positive,so i spent a few hours making something for fu...,3
6,2402,Borderlands,Positive,so i spent a couple of hours doing something f...,3
7,2402,Borderlands,Positive,so i spent a few hours doing something for fun...,3
8,2402,Borderlands,Positive,so i spent a few hours making something for fu...,3
9,2402,Borderlands,Positive,2010 so i spent a few hours making something f...,3


In [14]:
training_df['Sentiment_label'].unique()

array([3, 2, 1, 0], dtype=int8)

# Modelling

LSTM

In [18]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(training_df['Tweet'])

X_train_seq = tokenizer.texts_to_sequences(training_df['Tweet'])
X_test_seq = tokenizer.texts_to_sequences(testing_df['Tweet'])

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

y_train = training_df['Sentiment_label']
y_test = testing_df['Sentiment_label']


In [19]:
num_classes = len(pd.unique(training_df['Sentiment_label']))

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer=Adam(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))



Epoch 1/5
[1m2240/2240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 127ms/step - accuracy: 0.5532 - loss: 1.0410 - val_accuracy: 0.8609 - val_loss: 0.4005
Epoch 2/5
[1m2240/2240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 122ms/step - accuracy: 0.8195 - loss: 0.4812 - val_accuracy: 0.9249 - val_loss: 0.2586
Epoch 3/5
[1m2240/2240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 127ms/step - accuracy: 0.8766 - loss: 0.3283 - val_accuracy: 0.9359 - val_loss: 0.2282
Epoch 4/5
[1m2240/2240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 118ms/step - accuracy: 0.9081 - loss: 0.2433 - val_accuracy: 0.9359 - val_loss: 0.2187
Epoch 5/5
[1m2240/2240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 122ms/step - accuracy: 0.9221 - loss: 0.2045 - val_accuracy: 0.9439 - val_loss: 0.2256


In [20]:
loss, accuracy = model.evaluate(X_test_pad, y_test)

print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.9398 - loss: 0.2096
Test Loss: 0.22555747628211975
Test Accuracy: 0.9439439177513123
