In [30]:
##importing libraries
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [31]:
# Load CSV with correct encoding
df = pd.read_csv("/content/sample_data/judge-1377884607_tweet_product_company.csv", encoding="ISO-8859-1")
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [32]:
df.describe()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
count,9092,3291,9093
unique,9065,9,4
top,RT @mention Marissa Mayer: Google Will Connect...,iPad,No emotion toward brand or product
freq,5,946,5389


In [34]:
# Renaming columns
df.columns = ["tweet_text", "target_entity", "sentiment"]

In [35]:
df

Unnamed: 0,tweet_text,target_entity,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [36]:
print(df.isnull().sum())

tweet_text          1
target_entity    5802
sentiment           0
dtype: int64


In [37]:
df = df.dropna(subset=["tweet_text", "sentiment"])

In [38]:
print(df.isnull().sum())

tweet_text          0
target_entity    5801
sentiment           0
dtype: int64


In [39]:
label_map = {
    "Positive emotion": "positive",
    "Negative emotion": "negative",
    "No emotion": "neutral",
    "I can't tell": "no_idea"
}
df = df[df["sentiment"].isin(label_map.keys())].copy()
df["sentiment"] = df["sentiment"].map(label_map)

In [40]:
# Define cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)         # remove URLs
    text = re.sub(r"@\w+", "", text)            # remove mentions
    text = re.sub(r"#", "", text)               # remove hashtag symbols but keep words
    text = re.sub(r"[^a-z0-9\s!?.,]", "", text) # allow basic punctuation
    return text.strip()

# Apply cleaning and create the column
df["cleaned_text"] = df["tweet_text"].apply(clean_text)

# Drop rows where cleaned text is empty
df = df[df["cleaned_text"].str.strip() != ""].copy()

# Check if it exists now
print("'cleaned_text' in df.columns →", 'cleaned_text' in df.columns)
print("Sample count after cleaning:", len(df))
print("Sample cleaned tweets:\n", df["cleaned_text"].head())

'cleaned_text' in df.columns → True
Sample count after cleaning: 3704
Sample cleaned tweets:
 0    . i have a 3g iphone. after 3 hrs tweeting at ...
1    know about  ? awesome ipadiphone app that youl...
2    can not wait for ipad 2 also. they should sale...
3    i hope this years festival isnt as crashy as t...
4    great stuff on fri sxsw marissa mayer google, ...
Name: cleaned_text, dtype: object


In [41]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["cleaned_text"])
sequences = tokenizer.texts_to_sequences(df["cleaned_text"])

In [42]:
# Padding sequences
max_len = 30  # or use: max_len = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_len, padding="post")


In [43]:
# Label encoding
le = LabelEncoder()
y = le.fit_transform(df["sentiment"])

In [44]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [45]:
print("Training samples:", len(X_train))
print("Test samples:", len(X_test))
print("Classes:", le.classes_)

Training samples: 2963
Test samples: 741
Classes: ['negative' 'no_idea' 'positive']


In [47]:
# Build the SimpleRNN model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64))  # 10k words, 64-dim embeddings
model.add(SimpleRNN(64, return_sequences=False))  # Can tune units
model.add(Dense(32, activation='relu'))
model.add(Dense(4, activation='softmax'))  # 4 sentiment classes

In [48]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [49]:
# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32
)

Epoch 1/5
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.8014 - loss: 0.7218 - val_accuracy: 0.8043 - val_loss: 0.5989
Epoch 2/5
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7974 - loss: 0.6084 - val_accuracy: 0.8043 - val_loss: 0.6082
Epoch 3/5
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8092 - loss: 0.5953 - val_accuracy: 0.8043 - val_loss: 0.5974
Epoch 4/5
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8066 - loss: 0.5955 - val_accuracy: 0.8043 - val_loss: 0.6023
Epoch 5/5
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8138 - loss: 0.4995 - val_accuracy: 0.7571 - val_loss: 0.6501


In [50]:
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"✅ Test Accuracy: {acc:.2f}")

✅ Test Accuracy: 0.76
