**6.Sentiment analysis using LSTM network or GRU.**  

In [1]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect("database.sqlite")

# See what tables are inside
print(pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn))

# Load the main table (usually called 'Tweets')
df = pd.read_sql("SELECT * FROM Tweets", conn)

# Show some rows
print(df.head())


     name
0  Tweets
             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  567588278875213824           neutral                           1.0   
1  567590027375702016          negative                           1.0   
2  567591480085463040          negative                           1.0   
3  567592368451248130          negative                           1.0   
4  567594449874587648          negative                           1.0   

           negativereason negativereason_confidence    airline  \
0                                                        Delta   
1              Can't Tell                    0.6503      Delta   
2             Late Flight                     0.346     United   
3             Late Flight                         1     United   
4  Customer Service Issue                    0.3451  Southwest   

  airline_sentiment_gold         name negativereason_gold  retweet_count  \
0                         JetBlueNews                               

In [4]:
# Keep only tweet text and sentiment
# data = df[['airline_sentiment', 'text']] # No longer needed

# Encode labels as numbers
df['label'] = df['airline_sentiment'].map({'negative':0, 'neutral':1, 'positive':2})

In [5]:
#Split into Train and Test Sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])


In [6]:
#Tokenize and Pad Sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
maxlen = 50

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')


In [7]:
#Build the LSTM or GRU Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout

num_classes = 3  # negative, neutral, positive

model = Sequential([
    Embedding(vocab_size, 128, input_length=maxlen),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),  # or GRU(128,...)
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()




In [8]:
#Train The Model
history = model.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=64
)


Epoch 1/5
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 172ms/step - accuracy: 0.6240 - loss: 0.9399 - val_accuracy: 0.6665 - val_loss: 0.7659
Epoch 2/5
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 159ms/step - accuracy: 0.6679 - loss: 0.7402 - val_accuracy: 0.7010 - val_loss: 0.6597
Epoch 3/5
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 162ms/step - accuracy: 0.7361 - loss: 0.5872 - val_accuracy: 0.6739 - val_loss: 0.6764
Epoch 4/5
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 172ms/step - accuracy: 0.7523 - loss: 0.5234 - val_accuracy: 0.7092 - val_loss: 0.6668
Epoch 5/5
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 162ms/step - accuracy: 0.7667 - loss: 0.4773 - val_accuracy: 0.7282 - val_loss: 0.7749


In [9]:
#Evaluate the Model
loss, acc = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {acc:.2f}")


[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7428 - loss: 0.7303
Test Accuracy: 0.74


In [10]:
# Some new unseen tweets to test
new_tweets = [
    "The flight was delayed for 3 hours and staff were rude.",
    "Excellent service, on-time flight and friendly crew!",
    "It was okay, nothing special but nothing bad either.",
    "Lost my luggage again. So frustrating!",
    "Loved the extra legroom seats, super comfortable."
]

# Convert text to sequences using the same tokenizer used for training
seqs = tokenizer.texts_to_sequences(new_tweets)

# Pad sequences to the same maxlen as training data
pads = pad_sequences(seqs, maxlen=maxlen, padding='post')

# Predict sentiment for each tweet
preds = model.predict(pads)

# Map predicted label numbers back to sentiment names
sentiments = ['negative', 'neutral', 'positive']

for tweet, p in zip(new_tweets, preds):
    print(f"Tweet: {tweet}")
    print(f"Predicted Sentiment: {sentiments[p.argmax()]}")
    print("-" * 50)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 639ms/step
Tweet: The flight was delayed for 3 hours and staff were rude.
Predicted Sentiment: negative
--------------------------------------------------
Tweet: Excellent service, on-time flight and friendly crew!
Predicted Sentiment: positive
--------------------------------------------------
Tweet: It was okay, nothing special but nothing bad either.
Predicted Sentiment: negative
--------------------------------------------------
Tweet: Lost my luggage again. So frustrating!
Predicted Sentiment: negative
--------------------------------------------------
Tweet: Loved the extra legroom seats, super comfortable.
Predicted Sentiment: neutral
--------------------------------------------------
