In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
import pickle
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Bidirectional, Dense, Input, Dropout, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import backend as K

In [3]:
df = pd.read_csv('/home/devcontainers/NLP/training.1600000.processed.noemoticon.csv',
                 header=None, 
                 encoding='ISO-8859-1')
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
df.columns = ['sentiment', 'id', 'date', 'query', 'user_name', 'tweet']
df = df.drop(['id', 'date', 'query', 'user_name'], axis=1)
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
label_to_sentiment = {0:"Negative", 4:"Positive"}

In [6]:
nltk.download('stopwords')
stopWords = stopwords.words('english')
stemmer = SnowballStemmer('english')
text_cleaning_regex = "@S+|https?:S+|http?:S|[^A-Za-z0-9]+"

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/devcontainers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def clean_texts(text, stem=False):
    text = re.sub(text_cleaning_regex, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stopWords:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

df.tweet = df.tweet.apply(lambda x: clean_texts(x))
df.head()

Unnamed: 0,sentiment,tweet
0,0,switchfoot http twitpic com 2y1zl awww bummer ...
1,0,upset update facebook texting might cry result...
2,0,kenichan dived many times ball managed save 50...
3,0,whole body feels itchy like fire
4,0,nationwideclass behaving mad see


In [8]:
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)
print(train_data.shape)
print(test_data.shape)

(1120000, 2)
(480000, 2)


In [9]:
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data.tweet)
print(len(tokenizer.word_index))
maxLen = 40
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data.tweet), maxlen=maxLen)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data.tweet), maxlen=maxLen)
encoder = LabelEncoder()
encoder.fit(train_data.sentiment.to_list())
y_train = encoder.transform(train_data.sentiment.to_list())
y_test = encoder.transform(test_data.sentiment.to_list())
print(y_train.shape)

516824
(1120000,)


In [10]:
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)
print(f"X_train shape -> {X_train.shape}")
print(f"X_test shape -> {X_test.shape}")
print(f"y_train shape -> {y_train.shape}")
print(f"y_test shape -> {y_test.shape}")

X_train shape -> (1120000, 40)
X_test shape -> (480000, 40)
y_train shape -> (1120000, 1)
y_test shape -> (480000, 1)


In [11]:
embeddings = {}
with open('/home/devcontainers/NLP/glove.6B.200d.txt', 'r') as f:
    for line in f:
        line = line.strip().split()
        word = line[0]
        emb = np.array(line[1:], dtype=np.float32)
        embeddings[word] = emb
print(len(embeddings))

400000


In [12]:
vocab_size = 20000 + 1
embedding_matrix = np.zeros((vocab_size, 200), dtype=np.float32)
for word, idx in tokenizer.word_index.items():
    if idx >= vocab_size:
        continue
    emb_vector = embeddings.get(word)
    if emb_vector is not None:
        embedding_matrix[idx] = emb_vector
        
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxLen, trainable=False)

I0000 00:00:1752500526.563391  168976 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1767 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [13]:
input = Input(shape=(maxLen,), dtype=np.int32)
embedding = embedding_layer(input)
x = SpatialDropout1D(0.2)(embedding)
x = Conv1D(64, 5, activation='relu')(x)
x = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(input, output)

In [14]:
model.summary()

In [None]:
# def f1_score(y_true, y_pred):
#     y_pred = K.round(y_pred)  # 0 or 1
#     tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
#     fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
#     fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

#     precision = tp / (tp + fp + K.epsilon())
#     recall = tp / (tp + fn + K.epsilon())

#     f1 = 2 * precision * recall / (precision + recall + K.epsilon())
#     return K.mean(f1)

In [17]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
checkpoint = ModelCheckpoint('sentiment_tweets_model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)
reduce_LR = ReduceLROnPlateau(factor=0.5, patience=2, min_lr = 1e-6, monitor = 'val_loss',verbose = 1)
history = model.fit(X_train, y_train, batch_size=512, epochs=10, validation_data=(X_test, y_test), callbacks=[reduce_LR, checkpoint])


Epoch 1/10


2025-07-14 19:12:58.994551: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 179200000 exceeds 10% of free system memory.
I0000 00:00:1752500585.514920  169560 cuda_dnn.cc:529] Loaded cuDNN version 90501


[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.7081 - loss: 0.5566

2025-07-14 20:00:36.610385: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 76800000 exceeds 10% of free system memory.



Epoch 1: val_loss improved from inf to 0.49224, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3001s[0m 1s/step - accuracy: 0.7081 - loss: 0.5566 - val_accuracy: 0.7590 - val_loss: 0.4922 - learning_rate: 0.0010
Epoch 2/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 826ms/step - accuracy: 0.7526 - loss: 0.5012
Epoch 2: val_loss improved from 0.49224 to 0.47939, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2011s[0m 919ms/step - accuracy: 0.7526 - loss: 0.5012 - val_accuracy: 0.7671 - val_loss: 0.4794 - learning_rate: 0.0010
Epoch 3/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 974ms/step - accuracy: 0.7617 - loss: 0.4883
Epoch 3: val_loss improved from 0.47939 to 0.47137, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2268s[0m 1s/step - accuracy: 0.7617 - loss: 0.4883 - val_accuracy: 0.7717 - val_loss: 0.4714 - learning_rate: 0.0010
Epoch 4/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 856ms/step - accuracy: 0.7659 - loss: 0.4822
Epoch 4: val_loss did not improve from 0.47137
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2020s[0m 923ms/step - accuracy: 0.7659 - loss: 0.4822 - val_accuracy: 0.7731 - val_loss: 0.4723 - learning_rate: 0.0010
Epoch 5/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.7694 - loss: 0.4767
Epoch 5: val_loss improved from 0.47137 to 0.46674, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3289s[0m 2s/step - accuracy: 0.7694 - loss: 0.4767 - val_accuracy: 0.7753 - val_loss: 0.4667 - learning_rate: 0.0010
Epoch 6/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 803ms/step - accuracy: 0.7718 - loss: 0.4733
Epoch 6: val_loss improved from 0.46674 to 0.46428, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1903s[0m 870ms/step - accuracy: 0.7718 - loss: 0.4733 - val_accuracy: 0.7768 - val_loss: 0.4643 - learning_rate: 0.0010
Epoch 7/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 784ms/step - accuracy: 0.7731 - loss: 0.4704
Epoch 7: val_loss improved from 0.46428 to 0.46305, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1847s[0m 844ms/step - accuracy: 0.7731 - loss: 0.4704 - val_accuracy: 0.7771 - val_loss: 0.4630 - learning_rate: 0.0010
Epoch 8/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 774ms/step - accuracy: 0.7740 - loss: 0.4690
Epoch 8: val_loss did not improve from 0.46305
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1829s[0m 836ms/step - accuracy: 0.7740 - loss: 0.4690 - val_accuracy: 0.7775 - val_loss: 0.4652 - learning_rate: 0.0010
Epoch 9/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 782ms/step - accuracy: 0.7760 - loss: 0.4660
Epoch 9: val_loss improved from 0.46305 to 0.46171, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1847s[0m 844ms/step - accuracy: 0.7760 - loss: 0.4660 - val_accuracy: 0.7783 - val_loss: 0.4617 - learning_rate: 0.0010
Epoch 10/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 795ms/step - accuracy: 0.7761 - loss: 0.4653
Epoch 10: val_loss improved from 0.46171 to 0.46077, saving model to sentiment_tweets_model.h5




[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1878s[0m 859ms/step - accuracy: 0.7761 - loss: 0.4653 - val_accuracy: 0.7790 - val_loss: 0.4608 - learning_rate: 0.0010


In [20]:
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)