dataset:https://www.kaggle.com/datasets/kazanova/sentiment140

In [1]:
import os, re, sys, gc, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2

In [2]:
data = pd.read_csv("training.1600000.processed.noemoticon.csv",encoding="latin-1",header=None,names=["target", "ids", "date", "flag", "user", "text"],on_bad_lines="skip",low_memory=False)

In [3]:
data = data[["target", "text"]].dropna()

In [4]:
data["target"] = data["target"].apply(lambda x: 1 if int(x) == 4 else 0)

In [5]:
URL_RE      = re.compile(r"https?://\S+|www\.\S+")
MENTION_RE  = re.compile(r"@\w+")
HASHTAG_RE  = re.compile(r"#(\w+)")
RT_RE       = re.compile(r"\brt\b")
NONALNUM_RE = re.compile(r"[^a-z0-9\s']")

In [6]:
def clean_tweet(t: str) -> str:
    t = str(t).lower()
    t = URL_RE.sub(" ", t)
    t = MENTION_RE.sub(" ", t)
    t = HASHTAG_RE.sub(r"\1", t)   
    t = RT_RE.sub(" ", t)
    t = NONALNUM_RE.sub(" ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

data["text"] = data["text"].apply(clean_tweet)
data = data[(data["text"].str.len() > 0)]

In [7]:
X = data["text"].values
y = data["target"].values.astype(np.int32)

In [8]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.10, random_state=42, stratify=y)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.10, random_state=42, stratify=y_temp)

In [10]:
del data, X, y, X_temp, y_temp
gc.collect()

45

In [11]:
MAX_WORDS = 50000  
MAX_LEN   = 60

In [12]:
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(list(X_train))

In [13]:
def to_pad(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=MAX_LEN, padding="pre", truncating="pre")

In [14]:
X_train_pad = to_pad(X_train)
X_val_pad   = to_pad(X_val)
X_test_pad  = to_pad(X_test)

In [15]:
AUTOTUNE   = tf.data.AUTOTUNE
BATCH_SIZE = 1024
SHUFFLE_SZ = 1_000_000

In [16]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_pad, y_train))
train_ds = train_ds.shuffle(min(SHUFFLE_SZ, len(X_train_pad)), seed=42, reshuffle_each_iteration=True)
train_ds = train_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [17]:
val_ds = tf.data.Dataset.from_tensor_slices((X_val_pad, y_val)).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test_pad, y_test)).batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [None]:
conv_pools = []
KERNEL_SIZES = [3, 4, 5]

inp = Input(shape=(MAX_LEN,), name="input_ids")
emb = Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN, name="embedding")(inp)
emb = SpatialDropout1D(0.2, name="spatial_dropout")(emb)

for k in KERNEL_SIZES:
    c = Conv1D(
        filters=128,
        kernel_size=k,
        activation="relu",
        padding="valid",
        kernel_regularizer=l2(0.0001),
        name=f"conv_{k}"
    )(emb)
    p = GlobalMaxPooling1D(name=f"gmp_{k}")(c)
    conv_pools.append(p)

x = Concatenate(name="concat")(conv_pools)
x = Dropout(0.5, name="dropout_concat")(x)
x = Dense(128, activation="relu", kernel_regularizer=l2(0.0001), name="dense_128")(x)
x = Dropout(0.5, name="dropout_dense")(x)
out = Dense(1, activation="sigmoid", name="output")(x)



In [19]:
model = Model(inputs=inp, outputs=out)

In [20]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [21]:
model.summary()

In [22]:
history = model.fit(train_ds,validation_data=val_ds,epochs=15,verbose=1)

Epoch 1/15
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2496s[0m 2s/step - accuracy: 0.8001 - loss: 0.4577 - val_accuracy: 0.8207 - val_loss: 0.4149
Epoch 2/15
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2530s[0m 2s/step - accuracy: 0.8307 - loss: 0.4021 - val_accuracy: 0.8241 - val_loss: 0.4110
Epoch 3/15
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2484s[0m 2s/step - accuracy: 0.8436 - loss: 0.3809 - val_accuracy: 0.8225 - val_loss: 0.4164
Epoch 4/15
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2542s[0m 2s/step - accuracy: 0.8537 - loss: 0.3635 - val_accuracy: 0.8252 - val_loss: 0.4255
Epoch 5/15
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2485s[0m 2s/step - accuracy: 0.8627 - loss: 0.3472 - val_accuracy: 0.8215 - val_loss: 0.4442
Epoch 6/15
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2503s[0m 2s/step - accuracy: 0.8698 - loss: 0.3339 - val_accuracy: 0.8213 - val_loss: 0.4613
Epoc

In [23]:
test_loss, test_acc = model.evaluate(test_ds, verbose=1)

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 363ms/step - accuracy: 0.8125 - loss: 0.5729


In [24]:
y_pred_prob = model.predict(test_ds, verbose=1).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print(classification_report(y_test, y_pred, digits=4))

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 355ms/step
              precision    recall  f1-score   support

           0     0.8080    0.8200    0.8140     79834
           1     0.8173    0.8051    0.8111     79807

    accuracy                         0.8125    159641
   macro avg     0.8126    0.8125    0.8125    159641
weighted avg     0.8126    0.8125    0.8125    159641



# Test

In [25]:
def predict_sentiment(texts, threshold=0.5):
    if isinstance(texts, str):
        texts = [texts]
    cleaned = [clean_tweet(t) for t in texts]
    padded = pad_sequences(tokenizer.texts_to_sequences(cleaned), maxlen=MAX_LEN, padding="pre", truncating="pre")
    probs = model.predict(padded, verbose=0).ravel()
    preds = (probs >= threshold).astype(int)
    return list(zip(texts, probs.tolist(), preds.tolist()))

In [26]:
samples = [
    "I love this movie! Absolutely fantastic.",
    "Worst service ever. I'm so disappointed.",
    "it was okay, not great, not terrible.",
    "it was nice food!"
    ]

In [27]:
for s, p, yhat in predict_sentiment(samples):
    print(f"{yhat}  ({p:.3f})  ->  {s}")

1  (0.997)  ->  I love this movie! Absolutely fantastic.
0  (0.000)  ->  Worst service ever. I'm so disappointed.
0  (0.317)  ->  it was okay, not great, not terrible.
1  (0.960)  ->  it was nice food!
