In [60]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [61]:
# Load dataset
df = pd.read_csv(r"D:\3rd year second semester\NLP\nlp task\sentiment.csv", encoding="ISO-8859-1")
print(df.head())

       textID                                               text sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral   
1  96d74cb729   Shanghai is also really exciting (precisely -...  positive   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...  negative   
3  01082688c6                                        happy bday!  positive   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive   

  Time of Tweet Age of User      Country  Population -2020  Land Area (Km²)  \
0       morning        0-20  Afghanistan        38928346.0         652860.0   
1          noon       21-30      Albania         2877797.0          27400.0   
2         night       31-45      Algeria        43851044.0        2381740.0   
3       morning       46-60      Andorra           77265.0            470.0   
4          noon       60-70       Angola        32866272.0        1246700.0   

   Density (P/Km²)  
0             60.0  
1            1

In [62]:
df.dropna(inplace=True)  # Remove missing values
X = df['text'].astype(str).values
y = df['sentiment'].values  # Updated column name

In [63]:
# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [64]:
# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)


In [65]:
# Padding
X_pad = pad_sequences(X_seq, maxlen=150, padding='post', truncating='post')

In [66]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [67]:
# Build Improved RNN model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),
    SimpleRNN(128, return_sequences=True),
    Dropout(0.5),
    SimpleRNN(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')  # 3 output classes
])



In [68]:
# Compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [69]:
# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 71ms/step - accuracy: 0.3544 - loss: 1.1494 - val_accuracy: 0.2956 - val_loss: 1.1120
Epoch 2/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 63ms/step - accuracy: 0.3628 - loss: 1.1645 - val_accuracy: 0.3027 - val_loss: 1.2012
Epoch 3/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - accuracy: 0.3564 - loss: 1.1322 - val_accuracy: 0.4045 - val_loss: 1.0949
Epoch 4/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 65ms/step - accuracy: 0.3498 - loss: 1.1224 - val_accuracy: 0.4045 - val_loss: 1.1023
Epoch 5/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 63ms/step - accuracy: 0.3644 - loss: 1.1168 - val_accuracy: 0.3041 - val_loss: 1.1074
Epoch 6/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 63ms/step - accuracy: 0.3598 - loss: 1.1142 - val_accuracy: 0.4045 - val_loss: 1.0891
Epoch 7/10
[1m89/89[0m [32m━━━

<keras.src.callbacks.history.History at 0x219f15d3b90>

In [70]:
# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.3880 - loss: 1.0941
Test Accuracy: 0.40
