In [1]:
import pandas as pd

**Load Dataset**

In [2]:
df=pd.read_csv('/content/judge-1377884607_tweet_product_company.csv', encoding='latin1')

In [3]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [5]:
df.drop('emotion_in_tweet_is_directed_at', axis=1, inplace=True)

**Text cleaning**

In [6]:
import re
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"[^0-9A-Za-z\s!?]", " ", text)
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text

In [7]:
df["clean_text"] = df["tweet_text"].apply(clean_text)

In [8]:
df.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,clean_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,i have a 3g iphone after 3 hrs tweeting at ris...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,know about ? awesome ipad iphone app that you ...
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,can not wait for ipad 2 also they should sale ...
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,i hope this year s festival isn t as crashy as...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,great stuff on fri sxsw marissa mayer google t...


In [9]:
df['is_there_an_emotion_directed_at_a_brand_or_product'].unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [10]:
df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Unnamed: 0_level_0,count
is_there_an_emotion_directed_at_a_brand_or_product,Unnamed: 1_level_1
No emotion toward brand or product,5389
Positive emotion,2978
Negative emotion,570
I can't tell,156


**Map sentiment labels**

In [11]:

mapping = {
    "Negative emotion": "negative",
    "Positive emotion": "positive",
    "No emotion toward brand or product": "neutral",
    "I can't tell": "no_idea"
}
df["label"] = df["is_there_an_emotion_directed_at_a_brand_or_product"].map(mapping)
df = df.dropna(subset=["label"])


**Encode labels**

In [12]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le = LabelEncoder()
y = le.fit_transform(df["label"])
num_classes = len(le.classes_)



In [13]:
df["label"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
neutral,0.592654
positive,0.327505
negative,0.062686
no_idea,0.017156


**Compute class weights to fix imbalance**


In [14]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y),
    y=y
)
class_weights = dict(enumerate(class_weights))
print("Computed class weights:", class_weights)

Computed class weights: {0: np.float64(3.988157894736842), 1: np.float64(0.42183150862868807), 2: np.float64(14.572115384615385), 3: np.float64(0.7633478844862324)}


**Tokenization and padding**

In [15]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_WORDS = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df["clean_text"])
sequences = tokenizer.texts_to_sequences(df["clean_text"])
padded = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")



**Train-test split**

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    padded, y, test_size=0.2, random_state=42, stratify=y
)


**Build LSTM model**

In [17]:
#Did with simple RNN, simple LSTM and bidurectional LSTM. Got higher accuracy with bidirectional LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras import regularizers


model = Sequential([
    Embedding(MAX_WORDS, 200, input_length=MAX_LEN),
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])

from tensorflow.keras.optimizers import Adam
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=2e-4),
    metrics=["accuracy"]
)




**Train Model with Class Weights**

In [18]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=64,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/15
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 1s/step - accuracy: 0.2113 - loss: 1.4119 - val_accuracy: 0.1938 - val_loss: 1.3864
Epoch 2/15
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 1s/step - accuracy: 0.1668 - loss: 1.4142 - val_accuracy: 0.0509 - val_loss: 1.4229
Epoch 3/15
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 1s/step - accuracy: 0.1800 - loss: 1.4308 - val_accuracy: 0.2893 - val_loss: 1.2622
Epoch 4/15
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 1s/step - accuracy: 0.3929 - loss: 1.2381 - val_accuracy: 0.4426 - val_loss: 1.3031
Epoch 5/15
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 1s/step - accuracy: 0.4641 - loss: 1.0697 - val_accuracy: 0.3993 - val_loss: 1.1120
Epoch 6/15
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 1s/step - accuracy: 0.4921 - loss: 0.9194 - val_accuracy: 0.4330 - val_loss: 1.1308
Epoch 7/15
[1m91/91[0m [32m━━━━

**Evaluate model**

In [19]:
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {acc:.4f}")


Test Accuracy: 0.5272


**Predict on Unseen Data**

In [20]:
def predict_sentiment(new_texts):
    clean_texts = [clean_text(t) for t in new_texts]
    seqs = tokenizer.texts_to_sequences(clean_texts)
    pads = pad_sequences(seqs, maxlen=MAX_LEN, padding="post", truncating="post")
    preds = model.predict(pads)
    classes = np.argmax(preds, axis=1)
    labels = le.inverse_transform(classes)
    for t, l in zip(new_texts, labels):
        print(f"Tweet: {t}\n→ Predicted Sentiment: {l}\n")

In [21]:
def predict_sentiment(new_tweets):
    # Preprocess new text data
    clean_texts = [clean_text(t) for t in new_tweets]
    seqs = tokenizer.texts_to_sequences(clean_texts)
    pads = pad_sequences(seqs, maxlen=MAX_LEN, padding="post", truncating="post")

    # Predict sentiment
    preds = model.predict(pads)
    classes = np.argmax(preds, axis=1)

    # Convert numeric predictions to label names
    labels = le.inverse_transform(classes)

    # Print all 10 results clearly
    for t, l in zip(new_tweets, labels):
        print(f"Tweet: {t}\n→ Predicted Sentiment: {l}\n")

# Example tweets (10 samples)
new_tweets = [
    "I love the new iPhone design! Apple nailed it.",
    "The latest Google update ruined my phone battery.",
    "I have no idea what this new Android feature does.",
    "Apple’s camera quality just keeps getting better every year!",
    "Not impressed with the new MacBook’s performance.",
    "The new Pixel looks fine, nothing special though.",
    "I can’t decide if I should upgrade to the new iPhone or not.",
    "Google Assistant is actually pretty useful for daily tasks.",
    "After the update, my iPhone started heating up a lot.",
    "Both Apple and Google are doing great with their latest products."
]

# Run predictions
predict_sentiment(new_tweets)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Tweet: I love the new iPhone design! Apple nailed it.
→ Predicted Sentiment: neutral

Tweet: The latest Google update ruined my phone battery.
→ Predicted Sentiment: no_idea

Tweet: I have no idea what this new Android feature does.
→ Predicted Sentiment: neutral

Tweet: Apple’s camera quality just keeps getting better every year!
→ Predicted Sentiment: no_idea

Tweet: Not impressed with the new MacBook’s performance.
→ Predicted Sentiment: neutral

Tweet: The new Pixel looks fine, nothing special though.
→ Predicted Sentiment: neutral

Tweet: I can’t decide if I should upgrade to the new iPhone or not.
→ Predicted Sentiment: no_idea

Tweet: Google Assistant is actually pretty useful for daily tasks.
→ Predicted Sentiment: no_idea

Tweet: After the update, my iPhone started heating up a lot.
→ Predicted Sentiment: no_idea

Tweet: Both Apple and Google are doing great with their latest products.
→ Predicted Sentiment: