In [36]:
import pandas as pd
import tensorflow as tf
import re
import nltk

from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
# nltk.download('stopwords')

df = pd.read_csv('sentiment_analysis.csv')
df = df[['text', 'sentiment']]
df.dropna(inplace=True)

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].astype(str).apply(clean_text)

texts = df['clean_text'].tolist()
labels = df['sentiment'].values

tokenizer = Tokenizer(
    num_words=10000,
    oov_token="<OOV>",
    lower=True
)

tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)

le = LabelEncoder()
X = pad_sequences(sequences, padding='post', maxlen=100)
y = le.fit_transform(df['sentiment'])

df_final = pd.DataFrame({
    'sentence': texts,
    'sentiment': labels
})

print(df_final.head())
print("Input shape:", X.shape)

                                   sentence sentiment
0                        id responded going   neutral
1                   sooo sad miss san diego  negative
2                             boss bullying  negative
3                     interview leave alone  negative
4  sons couldnt put releases already bought  negative
Input shape: (27480, 100)


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

In [50]:
X_train

array([[  33,  187, 1026, ...,    0,    0,    0],
       [  35,  130,  450, ...,    0,    0,    0],
       [  92,  126, 1828, ...,    0,    0,    0],
       ...,
       [   1, 1252, 7069, ...,    0,    0,    0],
       [2396,   63,    1, ...,    0,    0,    0],
       [6308,    9,   92, ...,    0,    0,    0]], dtype=int32)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

model = Sequential([
    Embedding(
        input_dim = 10000,
        output_dim=64,
        input_length = np.percentile([len(x) for x in X_train], 90),
    ),
    LSTM(128, return_sequences=False),
    Dropout(0.2),
    Dense(128, activation = 'tanh'),
    Dense(3, activation='softmax')
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()



In [55]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

model.fit(
    X_train,
    y_train,
    epochs = 10,
    batch_size = 32,
    validation_split = 0.2,
    verbose = 1,
    callbacks = early_stop
)

Epoch 1/10
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.4030 - loss: 1.0877 - val_accuracy: 0.4094 - val_loss: 1.0842
Epoch 2/10
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.3980 - loss: 1.0893 - val_accuracy: 0.4094 - val_loss: 1.0845
Epoch 3/10
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.4063 - loss: 1.0867 - val_accuracy: 0.4094 - val_loss: 1.0851
Epoch 4/10
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.3987 - loss: 1.0887 - val_accuracy: 0.4094 - val_loss: 1.0852
Epoch 5/10
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.4049 - loss: 1.0872 - val_accuracy: 0.4094 - val_loss: 1.0853
Epoch 6/10
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.4038 - loss: 1.0878 - val_accuracy: 0.4094 - val_loss: 1.0839
Epoch 7/10
[1m550/550[

<keras.src.callbacks.history.History at 0x7a87b78f2c30>