### Step 1: Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense


### Step 2: Load the Dataset 

In [2]:
file_path = "C:\\Users\\Lenovo\\Downloads\\archive (13)\\Tweets.csv"
# We use a try-except block to handle the error if the file isn't found.
try:
    full_dataset = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Make sure it's in the same folder as your script.")
    exit()

# We only need two columns: the tweet's text and its sentiment.
data = full_dataset[['text', 'airline_sentiment']]
print("Dataset loaded successfully! Here are the first 5 rows:")
print(data.head())
print("\nHere's how many tweets we have for each sentiment:")
print(data['airline_sentiment'].value_counts())

Dataset loaded successfully! Here are the first 5 rows:
                                                text airline_sentiment
0                @VirginAmerica What @dhepburn said.           neutral
1  @VirginAmerica plus you've added commercials t...          positive
2  @VirginAmerica I didn't today... Must mean I n...           neutral
3  @VirginAmerica it's really aggressive to blast...          negative
4  @VirginAmerica and it's a really big bad thing...          negative

Here's how many tweets we have for each sentiment:
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


### Step 3: Clean the Tweet Text 

In [3]:
# Tweets are messy! They have @mentions, links, and symbols.
# We'll create a function to clean this up.
def clean_tweet_text(text):
    # Remove usernames (e.g., "@virginamerica")
    text = re.sub(r'@\w+', '', text)
    # Remove website links
    text = re.sub(r'http\S+', '', text)
    # Remove characters that aren't letters or spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Make all text lowercase
    text = text.lower()
    return text

In [4]:
# Apply our cleaning function to every tweet in the 'text' column.
data['cleaned_text'] = data['text'].apply(clean_tweet_text)
print("Text cleaning complete! Here's the same 5 rows after cleaning:")
print(data.head())

Text cleaning complete! Here's the same 5 rows after cleaning:
                                                text airline_sentiment  \
0                @VirginAmerica What @dhepburn said.           neutral   
1  @VirginAmerica plus you've added commercials t...          positive   
2  @VirginAmerica I didn't today... Must mean I n...           neutral   
3  @VirginAmerica it's really aggressive to blast...          negative   
4  @VirginAmerica and it's a really big bad thing...          negative   

                                        cleaned_text  
0                                         what  said  
1   plus youve added commercials to the experienc...  
2   i didnt today must mean i need to take anothe...  
3   its really aggressive to blast obnoxious ente...  
4            and its a really big bad thing about it  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cleaned_text'] = data['text'].apply(clean_tweet_text)


### Step 4: Convert Text to Numbers (Tokenization & Padding)

In [5]:
# A neural network can only understand numbers, not words.
# So, we need to convert our cleaned text into numerical sequences.

# We'll set some limits for our model.
max_words_in_vocab = 5000  # The model will only learn the top 5,000 most common words.
max_tweet_length = 100     # Each tweet will be treated as if it has 100 words (we'll add padding).

# This Tokenizer object learns the vocabulary from our tweets.
tokenizer = Tokenizer(num_words=max_words_in_vocab)
tokenizer.fit_on_texts(data['cleaned_text'])

In [6]:
# Now, we convert the text into sequences of numbers.
sequences = tokenizer.texts_to_sequences(data['cleaned_text'])

# Since all tweets have different lengths, we need to "pad" them with zeros
# so they are all the same length (100 in this case).
padded_sequences = pad_sequences(sequences, maxlen=max_tweet_length)
X = padded_sequences # Our features (the tweet text

In [7]:
# We do the same for the labels (the sentiment).
# 'get_dummies' converts the text labels ('positive', 'negative', 'neutral')
# into a "one-hot encoded" format, like [0, 0, 1] for positive.
labels = pd.get_dummies(data['airline_sentiment'])
Y = labels.values # Our labels
label_names = labels.columns.tolist() # Store the names for later

print("Data is now ready for the model!")
print("Shape of our features (X):", X.shape)
print("Shape of our labels (Y):", Y.shape)

Data is now ready for the model!
Shape of our features (X): (14640, 100)
Shape of our labels (Y): (14640, 3)


### Step 5: Split the Data for Training and Testing 

In [8]:
# We'll use 80% of the data to train our model and 20% to test how well it learned.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("Data split complete.")


Data split complete.


### Step 6: Build the Neural Network (RNN Model) 

In [9]:
# We'll create a simple, sequential model, where each layer feeds into the next.
model = Sequential()
# Layer 1: Embedding Layer. This layer learns meaningful vector representations for each word.
model.add(Embedding(input_dim=max_words_in_vocab, output_dim=32, input_length=max_tweet_length))
# Layer 2: SimpleRNN Layer. This layer processes the sequence of word vectors.
model.add(SimpleRNN(32))
# Layer 3: Output Layer. This layer gives us the final prediction.
# It has 3 neurons (one for each sentiment) and 'softmax' activation to give a probability for each.
model.add(Dense(3, activation='softmax'))

# "Compile" the model with settings for how it should learn.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print a summary of our model's architecture.
model.summary()




### Step 7: Train the Model 

In [10]:
# We "fit" the model to our training data.
# An "epoch" is one full pass through the entire training dataset.
model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_data=(X_test, Y_test))
print("Model training complete!")

Epoch 1/5
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.6337 - loss: 0.8446 - val_accuracy: 0.7462 - val_loss: 0.6029
Epoch 2/5
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.7972 - loss: 0.5281 - val_accuracy: 0.7613 - val_loss: 0.5795
Epoch 3/5
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.8752 - loss: 0.3707 - val_accuracy: 0.7640 - val_loss: 0.6130
Epoch 4/5
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 0.9353 - loss: 0.2209 - val_accuracy: 0.7579 - val_loss: 0.7091
Epoch 5/5
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.9583 - loss: 0.1389 - val_accuracy: 0.7305 - val_loss: 0.8280
Model training complete!


### Step 8: Create a Prediction Function and Test in Real-Time 

In [None]:

def predict_tweet_sentiment(tweet_text):
    # First, clean the input text just like we did for the training data.
    cleaned_text = clean_tweet_text(tweet_text)
    # Convert the cleaned text to a numerical sequence.
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    # Pad the sequence so it's the correct length.
    padded_sequence = pad_sequences(sequence, maxlen=max_tweet_length)
    
    # Get the model's prediction.
    prediction = model.predict(padded_sequence)
    
    # The prediction is an array of probabilities, e.g., [0.1, 0.2, 0.7].
    # We find the index of the highest probability.
    predicted_index = np.argmax(prediction)
    # Use the index to get the sentiment name.
    predicted_sentiment = label_names[predicted_index]
    # Also get the confidence score (the highest probability).
    confidence = prediction[0][predicted_index]
    
    return predicted_sentiment, confidence

# A loop to let the user enter tweets continuously.
while True:
    user_input = input("\nEnter a tweet to analyze (or type 'quit' to exit): ")
    if user_input.lower() == 'quit':
        break
        
    sentiment, confidence = predict_tweet_sentiment(user_input)
    print(f"\nPrediction: {sentiment.upper()}")
    print(f"Confidence: {confidence:.2%}")
    print("-" * 40)


Enter a tweet to analyze (or type 'quit' to exit): Splendid
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step

Prediction: NEUTRAL
Confidence: 51.73%
----------------------------------------

Enter a tweet to analyze (or type 'quit' to exit): It's extraordinary
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step

Prediction: POSITIVE
Confidence: 54.26%
----------------------------------------

Enter a tweet to analyze (or type 'quit' to exit): The flight crashed
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step

Prediction: NEGATIVE
Confidence: 40.82%
----------------------------------------

Enter a tweet to analyze (or type 'quit' to exit): Quick service but bad washrooms
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step

Prediction: POSITIVE
Confidence: 79.16%
----------------------------------------
