# Import Libraries

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense


# Load The Dataset

* `pd.read_csv` raises a `UnicodeDecodeError` when the parser encounters a byte sequence (in this case, byte 0x89 at
position 5604) that cannot be decoded using the UTF-8 encoding scheme. This means the file is likely encoded using a different encoding, such as 'latin-1' (also known as 'ISO-8859-1').

In [20]:
# Read the tweet sentiment CSV. The file is not valid UTF-8, so it is decoded as latin-1.
data = pd.read_csv(
    '/content/judge-1377884607_tweet_product_company.csv',
    encoding='latin-1',
)



* If 'latin-1' produces wrong characters, try 'cp1252' instead ('ISO-8859-1' is just another name for 'latin-1', so it will give the same result).




# Data Preprocessing

In [22]:
# Remove the column naming the targeted brand/product; the model only uses the
# tweet text and the sentiment label.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(
    columns=['emotion_in_tweet_is_directed_at'],
    errors='ignore',
)

In [23]:
# Missing value handling: discard every row that has a missing value in any
# remaining column.
data = data.dropna()

In [24]:
# Train/test split of the cleaned DataFrame, used here only to preview a sample row.
# The array-level split of X and y is done further down, after tokenization and
# label encoding. X and y do not exist yet at this point in a fresh kernel, so
# they must not be split here.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Show the first training row: column 0 holds the tweet text, column 1 its label.
example = train_data.iloc[0]
print('Text:\n', example.iloc[0])
print('\nlabel:', example.iloc[1])

Text:
 In my next life I'm coming back as an iPad 2. Women can't keep their hands off this thing. #SXSW

label: Positive emotion


In [25]:
# Tokenization: build the word index from every tweet, then map each tweet to a
# list of integer token ids. num_words caps which ids texts_to_sequences emits
# (only the most frequent words are kept; rarer words are dropped).
# NOTE(review): the vocabulary is fitted on all rows, including those later held
# out for validation/testing.
tweet_texts = data['tweet_text']
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(tweet_texts)
sequences = tokenizer.texts_to_sequences(tweet_texts)

In [26]:
# Target column: the raw sentiment label attached to each tweet.
y = data['is_there_an_emotion_directed_at_a_brand_or_product']

# Pad sequences: force every token sequence to exactly max_length entries.
# padding/truncating are spelled out with their Keras defaults ('pre'), i.e.
# zeros are added, or surplus tokens removed, at the front of the sequence.
max_length = 100
X = pad_sequences(sequences, maxlen=max_length, padding='pre', truncating='pre')

In [27]:
# Encode the string sentiment labels as integer class ids.
# The encoder is fitted on the DataFrame column rather than on `y` itself so the
# cell is idempotent: re-running it never re-encodes already-encoded integers,
# and label_encoder.classes_ always holds the original label strings.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['is_there_an_emotion_directed_at_a_brand_or_product'])

In [28]:
# Train/validation split (after encoding): hold out 20% of the padded sequences
# and encoded labels. The fixed seed makes the partition repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Building The Model

In [29]:
# Embedding -> LSTM -> softmax classifier over the sentiment classes.
# Sizes are derived from the fitted preprocessing objects instead of repeated literals:
#   - input_dim follows the tokenizer's vocabulary cap (emitted token ids are < num_words)
#   - the output layer has one unit per label class seen by the encoder
# `input_length` is no longer passed: Keras 3 deprecates that argument, and the
# sequence length is inferred from the data on the first call.
vocab_size = tokenizer.num_words
num_classes = len(label_encoder.classes_)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(128, return_sequences=False),
    Dense(num_classes, activation='softmax'),
])
# Sparse loss because the labels are integer class ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [34]:
# Train/test split (for evaluation of model). The arguments and seed are the same
# as in the earlier split of X and y, so the held-out 20% (X_test / y_test)
# contains the same rows as X_val / y_val.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train The Model

In [30]:
# Fit for 5 epochs in mini-batches of 32; 20% of X_train is set aside by Keras
# for the per-epoch validation metrics.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/5
182/182 ━━━━━━━━━━━━━━━━━━━━ 45s 234ms/step - accuracy: 0.5759 - loss: 0.9819 - val_accuracy: 0.6151 - val_loss: 0.8894
Epoch 2/5
182/182 ━━━━━━━━━━━━━━━━━━━━ 44s 239ms/step - accuracy: 0.6789 - loss: 0.7782 - val_accuracy: 0.6564 - val_loss: 0.8338
Epoch 3/5
182/182 ━━━━━━━━━━━━━━━━━━━━ 78s 218ms/step - accuracy: 0.8035 - loss: 0.5427 - val_accuracy: 0.6667 - val_loss: 0.8822
Epoch 4/5
182/182 ━━━━━━━━━━━━━━━━━━━━ 38s 208ms/step - accuracy: 0.8607 - loss: 0.3708 - val_accuracy: 0.6550 - val_loss: 1.0007
Epoch 5/5
182/182 ━━━━━━━━━━━━━━━━━━━━ 39s 215ms/step - accuracy: 0.8921 - loss: 0.2980 - val_accuracy: 0.6550 - val_loss: 1.0714


# Evaluate The Model

In [33]:
# Score the trained model on the held-out test split; evaluate() returns the loss
# followed by the compiled metric (accuracy), reported here to two decimals.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")

57/57 ━━━━━━━━━━━━━━━━━━━━ 3s 55ms/step - accuracy: 0.6543 - loss: 1.0733
Test Accuracy: 0.65
