dataset:https://www.kaggle.com/datasets/kazanova/sentiment140

In [30]:
import os, re, sys, json, math, gc, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2

In [31]:
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

In [32]:
data = pd.read_csv("training.1600000.processed.noemoticon.csv",encoding="latin-1",header=None,names=["target", "ids", "date", "flag", "user", "text"],on_bad_lines="skip",low_memory=False)

In [33]:
data = data[["target", "text"]].dropna()

In [34]:
data["target"] = data["target"].apply(lambda x: 1 if int(x) == 4 else 0)

In [35]:
URL_RE      = re.compile(r"https?://\S+|www\.\S+")
MENTION_RE  = re.compile(r"@\w+")
HASHTAG_RE  = re.compile(r"#(\w+)")
RT_RE       = re.compile(r"\brt\b")
NONALNUM_RE = re.compile(r"[^a-z0-9\s']")

In [36]:
def clean_tweet(t: str) -> str:
    t = str(t).lower()
    t = URL_RE.sub(" ", t)
    t = MENTION_RE.sub(" ", t)
    t = HASHTAG_RE.sub(r"\1", t)   
    t = RT_RE.sub(" ", t)
    t = NONALNUM_RE.sub(" ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

data["text"] = data["text"].apply(clean_tweet)
data = data[(data["text"].str.len() > 0)]

In [37]:
X = data["text"].values
y = data["target"].values.astype(np.int32)

In [38]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.10, random_state=SEED, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.10, random_state=SEED, stratify=y_temp)
del data, X, y, X_temp, y_temp
gc.collect()

3517

In [39]:
MAX_WORDS = 80000     
MAX_LEN   = 60   

In [40]:
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(list(X_train))

In [41]:
def to_pad(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=MAX_LEN, padding="pre", truncating="pre")

X_train_pad = to_pad(X_train)
X_val_pad   = to_pad(X_val)
X_test_pad  = to_pad(X_test)

In [42]:
AUTOTUNE   = tf.data.AUTOTUNE
BATCH_SIZE = 1024 
SHUFFLE_SZ = 1_000_000

In [43]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_pad, y_train))
train_ds = train_ds.shuffle(min(SHUFFLE_SZ, len(X_train_pad)), seed=SEED, reshuffle_each_iteration=True)
train_ds = train_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [44]:
val_ds = tf.data.Dataset.from_tensor_slices((X_val_pad, y_val)).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test_pad, y_test)).batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [45]:
EMBED_DIM = 128
KERNEL_SIZES = [3, 4, 5]
FILTERS = 256
L2_REG = 1e-4
DROPOUT_RATE = 0.5

In [47]:
inp = Input(shape=(MAX_LEN,), name="input_ids")
emb = Embedding(input_dim=MAX_WORDS, output_dim=EMBED_DIM, input_length=MAX_LEN, name="embedding")(inp)
emb = SpatialDropout1D(0.2, name="spatial_dropout")(emb)

In [48]:
conv_pools = []

for k in KERNEL_SIZES:
    c = Conv1D(
        filters=FILTERS,
        kernel_size=k,
        activation="relu",
        padding="valid",
        kernel_regularizer=l2(L2_REG),
        name=f"conv_{k}"
    )(emb)
    p = GlobalMaxPooling1D(name=f"gmp_{k}")(c)
    conv_pools.append(p)

x = Concatenate(name="concat")(conv_pools)
x = Dropout(DROPOUT_RATE, name="dropout_concat")(x)
x = Dense(128, activation="relu", kernel_regularizer=l2(L2_REG), name="dense_128")(x)
x = Dropout(DROPOUT_RATE, name="dropout_dense")(x)
out = Dense(1, activation="sigmoid", name="output")(x)

In [49]:
model = Model(inputs=inp, outputs=out)

In [50]:
opt = Adam(learning_rate=2e-3)
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

In [51]:
model.summary()

In [None]:
callbacks = [
    EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-6, verbose=1),
    ModelCheckpoint("textcnn_best.keras", monitor="val_loss", save_best_only=True, verbose=1)
]

In [53]:
EPOCHS = 20
history = model.fit(train_ds,validation_data=val_ds,epochs=EPOCHS,callbacks=callbacks,verbose=1)

Epoch 1/20
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 985ms/step - accuracy: 0.7759 - loss: 0.4974
Epoch 1: val_loss improved from None to 0.41996, saving model to textcnn_best.keras
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1269s[0m 1s/step - accuracy: 0.8018 - loss: 0.4572 - val_accuracy: 0.8210 - val_loss: 0.4200 - learning_rate: 0.0020
Epoch 2/20
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 721ms/step - accuracy: 0.8316 - loss: 0.4084
Epoch 2: val_loss improved from 0.41996 to 0.41914, saving model to textcnn_best.keras
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m934s[0m 739ms/step - accuracy: 0.8304 - loss: 0.4105 - val_accuracy: 0.8232 - val_loss: 0.4191 - learning_rate: 0.0020
Epoch 3/20
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 725ms/step - accuracy: 0.8461 - loss: 0.3840
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.

Epoch 3: val_loss di

In [54]:
test_loss, test_acc = model.evaluate(test_ds, verbose=1)
print(f"\n[TEST] loss={test_loss:.4f}  acc={test_acc:.4f}")


y_pred_prob = model.predict(test_ds, verbose=1).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 157ms/step - accuracy: 0.8233 - loss: 0.4184

[TEST] loss=0.4184  acc=0.8233
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 157ms/step

Classification Report:
              precision    recall  f1-score   support

           0     0.8131    0.8396    0.8261     79834
           1     0.8341    0.8069    0.8203     79807

    accuracy                         0.8233    159641
   macro avg     0.8236    0.8233    0.8232    159641
weighted avg     0.8236    0.8233    0.8232    159641



In [55]:
artifacts = {
    "MAX_WORDS": MAX_WORDS,
    "MAX_LEN": MAX_LEN,
    "EMBED_DIM": EMBED_DIM,
    "KERNEL_SIZES": KERNEL_SIZES,
    "FILTERS": FILTERS,
    "L2_REG": L2_REG,
    "DROPOUT_RATE": DROPOUT_RATE
}

with open("textcnn_tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())
with open("textcnn_config.json", "w", encoding="utf-8") as f:
    json.dump(artifacts, f, ensure_ascii=False, indent=2)

print("\nArtifacts saved: textcnn_best.keras, textcnn_tokenizer.json, textcnn_config.json")


Artifacts saved: textcnn_best.keras, textcnn_tokenizer.json, textcnn_config.json


In [56]:
def predict_sentiment(texts, threshold=0.5):
    if isinstance(texts, str):
        texts = [texts]
    cleaned = [clean_tweet(t) for t in texts]
    padded = pad_sequences(tokenizer.texts_to_sequences(cleaned), maxlen=MAX_LEN, padding="pre", truncating="pre")
    probs = model.predict(padded, verbose=0).ravel()
    preds = (probs >= threshold).astype(int)
    return list(zip(texts, probs.tolist(), preds.tolist()))


In [71]:
samples = [
    "I love this movie! Absolutely fantastic.",
    "Worst service ever. I'm so disappointed.",
    "it was okay, not great, not terrible.",
    "it was nice food!"
]
print("\nSample predictions:")
for s, p, yhat in predict_sentiment(samples):
    print(f"{yhat}  ({p:.3f})  ->  {s}")


Sample predictions:
1  (0.994)  ->  I love this movie! Absolutely fantastic.
0  (0.007)  ->  Worst service ever. I'm so disappointed.
0  (0.261)  ->  it was okay, not great, not terrible.
1  (0.900)  ->  it was nice food!
