In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
jp797498e_twitter_entity_sentiment_analysis_path = kagglehub.dataset_download('jp797498e/twitter-entity-sentiment-analysis')

print('Data source import complete.')


Using Colab cache for faster access to the 'twitter-entity-sentiment-analysis' dataset.
Data source import complete.


In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Embedding

from tensorflow.keras.optimizers import Adam

In [3]:
columns = ['id', 'country', 'label', 'text']
df = pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv", names=columns)

In [4]:
df.head()

Unnamed: 0,id,country,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74682 non-null  int64 
 1   country  74682 non-null  object
 2   label    74682 non-null  object
 3   text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
df = df.dropna(subset=['text'])

In [7]:
df.drop(columns=['id','country'])

Unnamed: 0,label,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [8]:
df['label'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [9]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(df[['label']])

In [10]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(df['text'])

In [11]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'of': 6,
 'is': 7,
 'in': 8,
 'for': 9,
 'this': 10,
 'it': 11,
 'you': 12,
 'on': 13,
 'my': 14,
 'that': 15,
 'com': 16,
 'with': 17,
 'game': 18,
 'so': 19,
 'be': 20,
 'me': 21,
 'have': 22,
 'just': 23,
 'but': 24,
 'not': 25,
 'are': 26,
 'all': 27,
 'at': 28,
 'was': 29,
 'like': 30,
 'out': 31,
 'from': 32,
 '2': 33,
 'your': 34,
 'pic': 35,
 'twitter': 36,
 'now': 37,
 'get': 38,
 'we': 39,
 'as': 40,
 'they': 41,
 'has': 42,
 'if': 43,
 'one': 44,
 'do': 45,
 'good': 46,
 't': 47,
 'about': 48,
 'can': 49,
 'play': 50,
 'no': 51,
 'will': 52,
 'an': 53,
 'new': 54,
 'really': 55,
 'love': 56,
 'when': 57,
 'up': 58,
 "i'm": 59,
 'unk': 60,
 'what': 61,
 'more': 62,
 'time': 63,
 'by': 64,
 'johnson': 65,
 'how': 66,
 'people': 67,
 'some': 68,
 'or': 69,
 'why': 70,
 '3': 71,
 'see': 72,
 'shit': 73,
 "it's": 74,
 'co': 75,
 'been': 76,
 'best': 77,
 'still': 78,
 'facebook': 79,
 '’': 80,
 'https': 81,
 'got': 82,
 'games': 

In [12]:
sequences = tokenizer.texts_to_sequences(df['text'])
sequences

[[307, 174, 13, 140, 4, 2, 52, 1772, 12, 27],
 [2, 120, 404, 3, 1, 6744, 4, 2, 52, 434, 12, 27],
 [307, 174, 13, 140, 4, 2, 52, 434, 12, 27],
 [307, 404, 13, 140, 4, 2, 52, 1772, 12, 27],
 [307, 174, 13, 140, 33, 4, 2, 52, 1772, 12, 21, 27],
 [307, 174, 194, 140, 4, 2, 49, 1772, 12, 27],
 [19,
  2,
  885,
  5,
  373,
  353,
  320,
  232,
  9,
  136,
  43,
  12,
  126,
  122,
  2,
  120,
  5,
  457,
  140,
  543,
  4,
  4209,
  7,
  44,
  6,
  14,
  321,
  752,
  19,
  2,
  786,
  3,
  128,
  497,
  5,
  6154,
  9,
  14,
  272,
  139,
  7,
  1,
  743,
  1899,
  5145,
  1,
  9485,
  2,
  217,
  411,
  35,
  36,
  16,
  18781],
 [19,
  2,
  885,
  5,
  1019,
  6,
  353,
  288,
  232,
  9,
  136,
  43,
  12,
  126,
  122,
  15,
  59,
  5,
  457,
  140,
  543,
  4,
  4209,
  7,
  44,
  6,
  14,
  321,
  752,
  2,
  786,
  3,
  128,
  5,
  6154,
  9,
  14,
  272,
  1084,
  1,
  743,
  1295,
  1807,
  3,
  1,
  9485,
  2,
  217,
  22,
  136,
  35,
  36,
  16,
  18781],
 [19,
  2,
  885,
  5,


In [13]:
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
max_sequence_length

166

In [14]:
words_count = len(tokenizer.word_index) + 1
words_count

33784

In [15]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences,y, test_size=0.2, random_state=42)

# **RNN**

In [16]:
model = Sequential([
    Embedding(words_count, 80, input_length=max_sequence_length),
    SimpleRNN(64),
    Dense(4, activation='softmax')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy'
, metrics=['accuracy'])



In [17]:
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 13ms/step - accuracy: 0.5047 - loss: 0.4707
Epoch 2/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 13ms/step - accuracy: 0.8828 - loss: 0.1610
Epoch 3/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 13ms/step - accuracy: 0.9389 - loss: 0.0852
Epoch 4/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 13ms/step - accuracy: 0.9563 - loss: 0.0603
Epoch 5/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 13ms/step - accuracy: 0.9590 - loss: 0.0530


<keras.src.callbacks.history.History at 0x7e2ea27a0a10>

In [18]:
print(model.evaluate(X_train, y_train))
print(model.evaluate(X_test, y_test))

[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9672 - loss: 0.0385
[0.03822153061628342, 0.968308687210083]
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8592 - loss: 0.2101
[0.20881770551204681, 0.8606081008911133]


# **LSTM**

In [19]:
lstm_model = Sequential([
    Embedding(words_count, 50, input_length=max_sequence_length),
    LSTM(64),
    Dense(4, activation='softmax')
])
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
lstm_model.fit(X_train, y_train, epochs=5)

Epoch 1/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 10ms/step - accuracy: 0.5070 - loss: 0.4641
Epoch 2/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.8625 - loss: 0.1751
Epoch 3/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 10ms/step - accuracy: 0.9093 - loss: 0.1108
Epoch 4/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.9319 - loss: 0.0813
Epoch 5/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.9426 - loss: 0.0649


<keras.src.callbacks.history.History at 0x7e2ea27963f0>

In [21]:
print(lstm_model.evaluate(X_train, y_train))
print(lstm_model.evaluate(X_test, y_test))

[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9559 - loss: 0.0517
[0.05171939358115196, 0.9562470316886902]
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8828 - loss: 0.1586
[0.1563572883605957, 0.8868243098258972]
