
# Sentiment Analysis — Refined 
This notebook contains a cleaned, fixed, and robust pipeline for Twitter sentiment analysis using an LSTM-based model.

**What was fixed & added**
- Tokenizer vocabulary increased to 10,000
- Negations preserved in stopword removal (`no`, `not`, `nor`)
- Stemming and lemmatization functions corrected
- Consistent preprocessing for training and inference
- Optional GloVe Twitter embeddings loader (falls back to random embeddings if missing)
- Bidirectional LSTM model with early stopping
- Train/test split, model saving, tokenizer saving
- `ModelWrapper` class for inference with neutral band and VADER fallback


In [1]:

# Imports and environment setup
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  
import re
import string
import numpy as np
import pandas as pd
from collections import Counter
import pickle

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, Bidirectional
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import EarlyStopping

from nltk.sentiment.vader import SentimentIntensityAnalyzer

print('Imports done')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2025-10-24 08:06:54.087835: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761293214.508639      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761293214.633644      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Imports done


In [2]:
DATA_PATH = "/kaggle/input/sentiment/training.1600000.processed.noemoticon.csv"
if not os.path.exists(DATA_PATH):
    print("Warning: default dataset path not found:", DATA_PATH)
    print("Please update DATA_PATH to where you placed the CSV file.")
else:
    data = pd.read_csv(DATA_PATH, encoding="ISO-8859-1", engine="python", header=None)
    data = data.iloc[:, :6]
    data.columns = ["label", "id", "date", "query", "username", "text"]
    print("Loaded dataset with shape:", data.shape)


Loaded dataset with shape: (1600000, 6)


In [None]:
#------------------------------
# SECTION 1: Data Preprocessing
#------------------------------

In [3]:

STOPWORDS = set(stopwords.words('english')) - {"no", "not", "nor"}

tokenizer_re = RegexpTokenizer(r'\w+')
ps = PorterStemmer()
wnl = WordNetLemmatizer()

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    
    # remove URLs and emails
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)
    
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuation
    
    text = re.sub(r'\d+', ' ', text)  # remove numbers
    
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # reduce repeated chars
    
    tokens = tokenizer_re.tokenize(text)
    
    # remove stopwords but keep negations 
    tokens = [t for t in tokens if t not in STOPWORDS]
    
    # lemmatize 
    tokens = [wnl.lemmatize(t) for t in tokens]
    return " ".join(tokens)

print('Preprocessing functions ready')


Preprocessing functions ready


In [4]:

# Prepare a manageable subset (configurable)
try:
    data  
except NameError:
    raise RuntimeError("Dataset not loaded. Please set DATA_PATH correctly and run the load cell.")

# Keep only needed columns
data = data[['text', 'label']].copy()
data['label'] = data['label'].map({0:0, 4:1})

# Take a subset to fit in memory.(adjust)
POS_SAMPLES = 100000 
NEG_SAMPLES = 100000 
pos = data[data['label']==1].sample(frac=1, random_state=42)
neg = data[data['label']==0].sample(frac=1, random_state=42)

pos = pos.iloc[:POS_SAMPLES]
neg = neg.iloc[:NEG_SAMPLES]
data = pd.concat([pos, neg]).sample(frac=1, random_state=42).reset_index(drop=True)

print('Subset prepared, shape =', data.shape)

# Clean the text column 
data['clean_text'] = data['text'].apply(clean_text)
data = data[data['clean_text'].str.strip().astype(bool)].reset_index(drop=True)
print('After cleaning (removed empty), shape =', data.shape)


Subset prepared, shape = (200000, 2)
After cleaning (removed empty), shape = (199088, 3)


In [5]:

# Tokenizer and sequences
MAX_LEN = 50  
VOCAB_SIZE = 10000

tok = Tokenizer(num_words=VOCAB_SIZE, oov_token='<UNK>')
tok.fit_on_texts(data['clean_text'].values)
sequences = tok.texts_to_sequences(data['clean_text'].values)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=MAX_LEN)

# Save tokenizer for later use
with open('/kaggle/working/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Tokenizer fit. Vocab size (len tok.word_index) =', len(tok.word_index))


Tokenizer fit. Vocab size (len tok.word_index) = 67509


In [None]:
#------------------------------------------------------------
# SECTION 2: Preparation for model training and Glove
#------------------------------------------------------------

# GloVe (Global Vectors for Word Representation)  is a pre-trained word embedding model created by 
# Stanford, trained on billions of tokens (Twitter, Wikipedia, etc.).

# Each word is represented as a vector of real numbers (like a coordinate in a 100-dimensional space), 
# where semantically similar words are close together.

# vec("happy") ≈ vec("joyful")
# vec("sad") ≈ vec("unhappy")
# vec("king") - vec("man") + vec("woman") ≈ vec("queen")

# Without GloVe, semantic understanding of model is poor (model must learn from scratch)
# With GloVe, it becomes excellent as the model already “knows” word relationships. 
# It also works well even with smaller subsets.

In [6]:

# Train/test split
X = sequences_matrix
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


Train shape: (159270, 50) Test shape: (39818, 50)


In [7]:

# load GloVe Twitter embeddings 
EMBEDDING_DIM = 100
embedding_matrix = None

glove_paths = [
    '/kaggle/input/glove-twitter/glove.twitter.27B.100d.txt',
    '/kaggle/input/glove-twitter-100/glove.twitter.27B.100d.txt',
]

found = False
for gp in glove_paths:
    if os.path.exists(gp):
        print('Found GloVe file at', gp)
        embeddings_index = {}
        with open(gp, encoding='utf8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
        for word, i in tok.word_index.items():
            if i < VOCAB_SIZE:
                vec = embeddings_index.get(word)
                if vec is not None:
                    embedding_matrix[i] = vec
        found = True
        break

if not found:
    print('GloVe not found in common paths. The model will use random embeddings as fallback.')


Found GloVe file at /kaggle/input/glove-twitter/glove.twitter.27B.100d.txt


In [None]:
#------------------------------------------------------------
# SECTION 3: Model Building and Training Process
#------------------------------------------------------------

# - Embedding (GloVe if available), Bidirectional LSTM, Dense

In [8]:
def build_model(vocab_size=VOCAB_SIZE, embed_dim=EMBEDDING_DIM, max_len=MAX_LEN, embedding_matrix=None):
    inputs = Input(shape=(max_len,))
    
    if embedding_matrix is not None:
        emb = Embedding(input_dim=vocab_size, output_dim=embed_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(inputs)
    else:
        emb = Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_len)(inputs)
    
    x = Bidirectional(LSTM(128))(emb)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.4)(x)
    outputs = Dense(1, activation='sigmoid')(x)
    model = Model(inputs, outputs)
    
    return model

model = build_model(embedding_matrix=embedding_matrix)
model.compile(loss='binary_crossentropy', optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy'])
model.summary()


I0000 00:00:1761293296.139823      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1761293296.140572      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [None]:
# Train with EarlyStopping

In [9]:

es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(X_train, y_train, batch_size=128, epochs=6, validation_split=0.1, callbacks=[es])


# Save model
model_path = '/kaggle/working/sentiment_lstm_refined.h5'
model.save(model_path)
print('Model saved to', model_path)


Epoch 1/6


I0000 00:00:1761293302.005545     103 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 11ms/step - accuracy: 0.7257 - loss: 0.5366 - val_accuracy: 0.7526 - val_loss: 0.5032
Epoch 2/6
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.7688 - loss: 0.4775 - val_accuracy: 0.7677 - val_loss: 0.4779
Epoch 3/6
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.7795 - loss: 0.4624 - val_accuracy: 0.7711 - val_loss: 0.4752
Epoch 4/6
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.7871 - loss: 0.4504 - val_accuracy: 0.7800 - val_loss: 0.4638
Epoch 5/6
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - accuracy: 0.7925 - loss: 0.4403 - val_accuracy: 0.7817 - val_loss: 0.4608
Epoch 6/6
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - accuracy: 0.7965 - loss: 0.4345 - val_accuracy: 0.7797 - val_loss: 0.4678
Model saved to /kagg

In [None]:
#------------------------------------------------------------
# SECTION 4: Model Evaluation on Test Set
#------------------------------------------------------------

In [10]:

loss, acc = model.evaluate(X_test, y_test, verbose=1)
print('Test accuracy:', acc)
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)
print(classification_report(y_test, y_pred, digits=4))


[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7829 - loss: 0.4565
Test accuracy: 0.783665657043457
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step
              precision    recall  f1-score   support

           0     0.7707    0.8078    0.7888     19917
           1     0.7979    0.7595    0.7782     19901

    accuracy                         0.7837     39818
   macro avg     0.7843    0.7837    0.7835     39818
weighted avg     0.7843    0.7837    0.7835     39818



In [11]:

# ModelWrapper: preprocessing + inference + VADER fallback
class ModelWrapper:
    def __init__(self, model, tokenizer, max_len=MAX_LEN, neutral_low=0.4, neutral_high=0.6):
        self.model = model
        self.tok = tokenizer
        self.max_len = max_len
        self.neutral_low = neutral_low
        self.neutral_high = neutral_high
        self.vader = SentimentIntensityAnalyzer()
    
    def preprocess(self, text):
        return clean_text(text)
    
    def to_sequence(self, text):
        cleaned = self.preprocess(text)
        seq = self.tok.texts_to_sequences([cleaned])
        return sequence.pad_sequences(seq, maxlen=self.max_len)
    
    def predict(self, text):
        x = self.to_sequence(text)
        prob = float(self.model.predict(x)[0][0])
        if prob > self.neutral_high:
            return {'sentiment':'Positive', 'prob':prob}
        elif prob < self.neutral_low:
            return {'sentiment':'Negative', 'prob':prob}
        else:
            vader_score = self.vader.polarity_scores(text)['compound']
            fallback = 'Positive' if vader_score >= 0.05 else 'Negative'
            return {'sentiment':'Neutral_fallback', 'prob':prob, 'vader':vader_score, 'vader_sentiment':fallback}

# Save wrapper artifacts:(tokenizer, model)  already saved above.
print('ModelWrapper ready')


ModelWrapper ready


In [12]:

# Example usage
from tensorflow.keras.models import load_model
with open('/kaggle/working/tokenizer.pickle', 'rb') as handle:
    tok2 = pickle.load(handle)

model2 = load_model('/kaggle/working/sentiment_lstm_refined.h5')
wrapper = ModelWrapper(model2, tok2)

tests = ["I like candy", "I don't like this", "This is the worst movie ever", "I love it"]
for t in tests:
    print(t, "->", wrapper.predict(t))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326ms/step
I like candy -> {'sentiment': 'Positive', 'prob': 0.6377827525138855}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
I don't like this -> {'sentiment': 'Positive', 'prob': 0.629992663860321}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
This is the worst movie ever -> {'sentiment': 'Negative', 'prob': 0.0828750729560852}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
I love it -> {'sentiment': 'Positive', 'prob': 0.8720142841339111}


In [None]:
#------------------------------------------------------------
# SECTION 5: User Input Sentiment Prediction
#------------------------------------------------------------

In [13]:

import pickle
from tensorflow.keras.models import load_model

with open('/kaggle/working/tokenizer.pickle', 'rb') as handle:
    tok = pickle.load(handle)

model = load_model('/kaggle/working/sentiment_lstm_refined.h5')

wrapper = ModelWrapper(model, tok)

print("Sentiment Analysis (type 'exit' to quit)")
while True:
    user_input = input("Enter a sentence: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break
    result = wrapper.predict(user_input)
    
    print(f"\nInput: {user_input}")
    print(f"Predicted Sentiment: {result['sentiment']}")
    print(f"Confidence: {result['prob']:.2f}")
    if 'vader' in result:
        print(f"VADER Fallback → {result['vader_sentiment']} (score={result['vader']:.3f})")
    print("-" * 60)


Sentiment Analysis (type 'exit' to quit)


Enter a sentence:  theu are sweet couple


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300ms/step

Input: theu are sweet couple
Predicted Sentiment: Positive
Confidence: 0.79
------------------------------------------------------------


Enter a sentence:  That's not bad


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step

Input: That's not bad
Predicted Sentiment: Negative
Confidence: 0.24
------------------------------------------------------------


Enter a sentence:  you are briliant


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step

Input: you are briliant
Predicted Sentiment: Neutral_fallback
Confidence: 0.55
VADER Fallback → Negative (score=0.000)
------------------------------------------------------------


Enter a sentence:  you wrong


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step

Input: you wrong
Predicted Sentiment: Negative
Confidence: 0.23
------------------------------------------------------------


Enter a sentence:  you are super, I admire


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step

Input: you are super, I admire
Predicted Sentiment: Positive
Confidence: 0.99
------------------------------------------------------------


Enter a sentence:  exit


Goodbye!
