In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None)
df = df[[0, 5]]
df.columns = ['label', 'text']
df['label'] = df['label'].replace(4, 1)
df['text'] = df['text'].astype(str)

In [4]:
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r"[^a-zA-Z']", " ", text)
    text = text.lower()
    return text

In [5]:
df['text'] = df['text'].apply(clean_text)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
vocab_size = len(tokenizer.word_index) + 1
sequences =tokenizer.texts_to_sequences(df['text'])
max_length = 50
padded = pad_sequences(sequences, maxlen=max_length, padding='post')
labels =df['label'].values


In [7]:
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)


In [None]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W =self.add_weight(shape=(input_shape[-1], 1),initializer='glorot_uniform', trainable=True)
        self.b = self.add_weight(shape=(1,), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, inputs):
        score= tf.nn.tanh(tf.matmul(inputs, self.W) + self.b)
        attention_weights =tf.nn.softmax(score, axis=1)
        context_vector= attention_weights * inputs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector

In [None]:
embedding_dim = 100
input_layer= Input(shape=(max_length,))
embedding =Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(input_layer)
bilstm= Bidirectional(LSTM(64, return_sequences=True))(embedding)
attention = Attention()(bilstm)
drop =Dropout(0.5)(attention)
output_layer = Dense(1, activation='sigmoid')(drop)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()




In [10]:
history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_data=(X_test, y_test))


Epoch 1/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 25ms/step - accuracy: 0.7853 - loss: 0.4502 - val_accuracy: 0.8235 - val_loss: 0.3888
Epoch 2/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 25ms/step - accuracy: 0.8447 - loss: 0.3527 - val_accuracy: 0.8271 - val_loss: 0.3848
Epoch 3/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 25ms/step - accuracy: 0.8708 - loss: 0.3032 - val_accuracy: 0.8196 - val_loss: 0.4031
Epoch 4/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 25ms/step - accuracy: 0.8922 - loss: 0.2577 - val_accuracy: 0.8169 - val_loss: 0.4361
Epoch 5/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 25ms/step - accuracy: 0.9071 - loss: 0.2232 - val_accuracy: 0.8119 - val_loss: 0.4628


In [11]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 5ms/step - accuracy: 0.8118 - loss: 0.4635
Test Accuracy: 81.19%


In [13]:
model.save("sentiment_model.h5")
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)



In [14]:
def predict_sentiment(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded_seq = pad_sequences(seq, maxlen=max_length, padding='post')
    pred = model.predict(padded_seq)[0][0]
    sentiment = "Positive" if pred > 0.5 else "Negative"
    print(f"Input: {text}\nPrediction: {sentiment} ({pred:.4f})")


In [None]:
input1 = input("Enter a sentence: ")
predict_sentiment(input1)