dataset:https://www.kaggle.com/datasets/kazanova/sentiment140

In [1]:
import os, re, sys, gc, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2

In [2]:
data = pd.read_csv("training.1600000.processed.noemoticon.csv",encoding="latin-1",header=None,names=["target", "ids", "date", "flag", "user", "text"],on_bad_lines="skip",low_memory=False)

In [3]:
data = data[["target", "text"]].dropna()

In [4]:
data["target"] = data["target"].apply(lambda x: 1 if int(x) == 4 else 0)

In [5]:
URL_RE      = re.compile(r"https?://\S+|www\.\S+")
MENTION_RE  = re.compile(r"@\w+")
HASHTAG_RE  = re.compile(r"#(\w+)")
RT_RE       = re.compile(r"\brt\b")
NONALNUM_RE = re.compile(r"[^a-z0-9\s']")


def clean_tweet(t: str) -> str:
    t = str(t).lower()
    t = URL_RE.sub(" ", t)
    t = MENTION_RE.sub(" ", t)
    t = HASHTAG_RE.sub(r"\1", t)   
    t = RT_RE.sub(" ", t)
    t = NONALNUM_RE.sub(" ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

data["text"] = data["text"].apply(clean_tweet)
data = data[(data["text"].str.len() > 0)]

In [6]:
X = data["text"].values
y = data["target"].values.astype(np.int32)

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.10, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.10, random_state=42, stratify=y_temp)

del data, X, y, X_temp, y_temp
gc.collect()

45

In [7]:
MAX_WORDS = 200000
MAX_LEN   = 100

In [8]:
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(list(X_train))

def to_pad(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=MAX_LEN, padding="pre", truncating="pre")

X_train_pad = to_pad(X_train)
X_val_pad   = to_pad(X_val)
X_test_pad  = to_pad(X_test)

In [10]:
AUTOTUNE   = tf.data.AUTOTUNE
BATCH_SIZE = 4096
SHUFFLE_SZ = 100_000

In [11]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_pad, y_train))
train_ds = train_ds.shuffle(min(SHUFFLE_SZ, len(X_train_pad)), seed=42, reshuffle_each_iteration=True)
train_ds = train_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [12]:
val_ds = tf.data.Dataset.from_tensor_slices((X_val_pad, y_val)).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test_pad, y_test)).batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [14]:
conv_pools = []
KERNEL_SIZES = [3, 4, 5]

inp = Input(shape=(MAX_LEN,), name="input_ids")
emb = Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN, name="embedding")(inp)
emb = SpatialDropout1D(0.2, name="spatial_dropout")(emb)

for k in KERNEL_SIZES:
    c = Conv1D(
        filters=128,
        kernel_size=k,
        activation="relu",
        padding="valid",
        kernel_regularizer=l2(0.0001),
        name=f"conv_{k}"
    )(emb)
    p = GlobalMaxPooling1D(name=f"gmp_{k}")(c)
    conv_pools.append(p)

x = Concatenate(name="concat")(conv_pools)
x = Dropout(0.4, name="dropout_concat")(x)
x = Dense(128, activation="relu", kernel_regularizer=l2(0.0001), name="dense_128")(x)
x = Dropout(0.4, name="dropout_dense")(x)
out = Dense(1, activation="sigmoid", name="output")(x)

In [15]:
model = Model(inputs=inp, outputs=out)

In [16]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [17]:
model.summary()

In [18]:
history = model.fit(train_ds,validation_data=val_ds,epochs=5,verbose=1)

Epoch 1/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m889s[0m 3s/step - accuracy: 0.7864 - loss: 0.4788 - val_accuracy: 0.8166 - val_loss: 0.4201
Epoch 2/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m890s[0m 3s/step - accuracy: 0.8297 - loss: 0.3979 - val_accuracy: 0.8226 - val_loss: 0.4077
Epoch 3/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1028s[0m 3s/step - accuracy: 0.8510 - loss: 0.3588 - val_accuracy: 0.8221 - val_loss: 0.4163
Epoch 4/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m998s[0m 3s/step - accuracy: 0.8690 - loss: 0.3245 - val_accuracy: 0.8184 - val_loss: 0.4341
Epoch 5/5
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1498s[0m 5s/step - accuracy: 0.8831 - loss: 0.2954 - val_accuracy: 0.8154 - val_loss: 0.4691


In [19]:
test_loss, test_acc = model.evaluate(test_ds, verbose=1)

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 971ms/step - accuracy: 0.8172 - loss: 0.4686


In [20]:
y_pred_prob = model.predict(test_ds, verbose=1).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print(classification_report(y_test, y_pred, digits=4))

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 765ms/step
              precision    recall  f1-score   support

           0     0.8167    0.8180    0.8173     79834
           1     0.8176    0.8163    0.8170     79807

    accuracy                         0.8172    159641
   macro avg     0.8172    0.8172    0.8172    159641
weighted avg     0.8172    0.8172    0.8172    159641



# Test

In [21]:
def predict_sentiment(texts, threshold=0.5):
    if isinstance(texts, str):
        texts = [texts]
    cleaned = [clean_tweet(t) for t in texts]
    padded = pad_sequences(tokenizer.texts_to_sequences(cleaned), maxlen=MAX_LEN, padding="pre", truncating="pre")
    probs = model.predict(padded, verbose=0).ravel()
    preds = (probs >= threshold).astype(int)
    return list(zip(texts, probs.tolist(), preds.tolist()))

In [27]:
samples = [
    "Just watched an awesome movie! Totally loved it! #movie #happy",
    "This new phone is amazing, super fast and cool features! 😍",
    "Worst day ever, stuck in traffic for hours 😡 #annoying",
    "I’m so disappointed with this restaurant, food was cold!",
    "Feeling okay, just chilling at home. Nothing special.",
    "The weather is nice today, good for a walk! ☀️",
    "Lost my keys again, so frustrating! #badluck",
    "Just finished a book, it was alright, not the best.",
    "Love my new headphones, sound quality is top-notch! 🎶",
    "Can’t believe this app crashed again, so annoying! 😣"
]

In [29]:
results = predict_sentiment(samples)
for s, p, yhat in results:
    sentiment_text = "+" if yhat == 1 else "-"
    print(f"{sentiment_text} ({p:.3f}) -> {s}")

+ (0.998) -> Just watched an awesome movie! Totally loved it! #movie #happy
+ (0.987) -> This new phone is amazing, super fast and cool features! 😍
- (0.005) -> Worst day ever, stuck in traffic for hours 😡 #annoying
- (0.012) -> I’m so disappointed with this restaurant, food was cold!
+ (0.578) -> Feeling okay, just chilling at home. Nothing special.
+ (0.986) -> The weather is nice today, good for a walk! ☀️
- (0.001) -> Lost my keys again, so frustrating! #badluck
- (0.430) -> Just finished a book, it was alright, not the best.
+ (0.997) -> Love my new headphones, sound quality is top-notch! 🎶
- (0.014) -> Can’t believe this app crashed again, so annoying! 😣
