# 1. Read the dataset

In [2]:
import pandas as pd

In [3]:
data=pd.read_csv('/content/drive/MyDrive/datasets/judge-1377884607_tweet_product_company.csv',encoding='unicode_escape')

In [4]:
# display first 3 rows of the dataset
data.head(3)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion


In [5]:
# lets drop column emotion_in_tweet_is_directed_at

data=data.drop('emotion_in_tweet_is_directed_at',axis=1)

In [6]:
# lets check the categories in the target column
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

is_there_an_emotion_directed_at_a_brand_or_product
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: count, dtype: int64

Note: There are 4 classes in the target column- Its a multi class text classification problem.

In [7]:
# lets check the size of the dataset
data.shape

(9093, 2)

It contains 9093 rows & 2 columns.

In [8]:
# info of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(2)
memory usage: 142.2+ KB


Note: As per the info , there is a null value in the dataset.

In [9]:
data.isna().sum() # check for the null value

tweet_text                                            1
is_there_an_emotion_directed_at_a_brand_or_product    0
dtype: int64

In [10]:
data[data['tweet_text'].isna()] # one of the tweet is Nan value, lets drop that row

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
6,,No emotion toward brand or product


In [11]:
data.dropna(inplace=True)

In [12]:
data.shape

(9092, 2)

Note: So our dataset contains 9092 rows.

# 2. LSTM Based Classifier

# Steps:
1. Prepared the text data for processing
2. Tokenise the text and generate get teh word index
3. Convert the text into sequences
4. Convert the sequences into same length
5. Encode the target column
6. Split the data into train & test
7. Define LSTM model
8. Fit the model
9. evaluate the model
10. Test it on unseen data

#### Prepared the text data for processing

In [13]:
# for preprocessing task , we ar using nltk library

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
import re #import regular expression
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
# create a copy
data_copy=data.copy()

In [16]:
# create a function to preprocess the text data

def preprocess_text(text):
  text=text.lower() # convert it into lower case
  text = re.sub(r'<.*?>', '', text) # remove html tags, substitute with null
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)# Remove URLs
  text = re.sub(r'[^\w\s]', '', text) # Remove punctuation and special characters  @#;substitute with null
  text = re.sub(r'[Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT]+', '', text)
  text = re.sub(r'\d+', '', text)# remove numbers
  words=word_tokenize(text)
  words=[i for i in words if i not in stopwords.words('english')]# remove stopwords
  lemmatise_obj=WordNetLemmatizer()
  words=[lemmatise_obj.lemmatize(i) for i in words]# lemmatise words, convert it to its stem form
  text= " ".join(words)
  text = re.sub(r'\s+', ' ', text).strip()#remove extra spaces
  return text


In [17]:
data['cleaned_text']=data['tweet_text'].apply(lambda x: preprocess_text(x)) # prepare the cleaned test

In [18]:
data.head(3)

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,cleaned_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,wesley g iphone hr tweeting riseaustin dead ne...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,jessedee know fludapp awesome ipadiphone app y...
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,swonderlin wait ipad also sale sxsw


In [19]:
pd.set_option('display.max_colwidth',None)

#### Tokenise the text and generate the word index

In [20]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
max_words = 1000# Max number of words to tokenize
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")# Instantiate the obj, OOv stands out of vocabulary
tokenizer.fit_on_texts(data['cleaned_text']) # fit the text data
word_index = tokenizer.word_index # get teh word index

In [22]:
word_index

{'<OOV>': 1,
 'sxsw': 2,
 'mention': 3,
 'link': 4,
 'rt': 5,
 'ipad': 6,
 'google': 7,
 'apple': 8,
 'iphone': 9,
 'store': 10,
 'new': 11,
 'austin': 12,
 'app': 13,
 'amp': 14,
 'launch': 15,
 'social': 16,
 'circle': 17,
 'popup': 18,
 'today': 19,
 'android': 20,
 'network': 21,
 'get': 22,
 'line': 23,
 'via': 24,
 'free': 25,
 'party': 26,
 'called': 27,
 'sxswi': 28,
 'mobile': 29,
 'one': 30,
 'major': 31,
 'time': 32,
 'like': 33,
 'day': 34,
 'map': 35,
 'temporary': 36,
 'u': 37,
 'opening': 38,
 'im': 39,
 'open': 40,
 'possibly': 41,
 'win': 42,
 'need': 43,
 'go': 44,
 'come': 45,
 'apps': 46,
 'people': 47,
 'see': 48,
 'downtown': 49,
 'great': 50,
 'going': 51,
 'mayer': 52,
 'check': 53,
 'w': 54,
 'dont': 55,
 'know': 56,
 'marissa': 57,
 'got': 58,
 'want': 59,
 'make': 60,
 'good': 61,
 'first': 62,
 'say': 63,
 'year': 64,
 'ûïmention': 65,
 'set': 66,
 'pop': 67,
 'product': 68,
 'user': 69,
 'game': 70,
 'panel': 71,
 'news': 72,
 'guy': 73,
 'search': 74,
 'th

#### convert the text into sequences

In [23]:
# convert the text into sequences of word index
sequences=tokenizer.texts_to_sequences(data['cleaned_text'])
# find the max length
max_length=max([len(seq) for seq in sequences])

In [24]:
print(max_length) # max sequence length is 26

25


#### Convert the sequences into same length using pad_sequence


In [25]:
# make the text into same length using pad_sequence method
padded_sequences=pad_sequences(sequences,maxlen=max_length,padding='post')
print(padded_sequences)

[[  1 209   9 ...   0   0   0]
 [  1  56   1 ...   0   0   0]
 [  1 164   6 ...   0   0   0]
 ...
 [  7   1   1 ...   0   0   0]
 [563   9 564 ...   0   0   0]
 [  1   3   7 ...   0   0   0]]


#### Encode the target column

In [26]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [27]:
X=padded_sequences
encoder_obj=LabelEncoder()
y_numeric=encoder_obj.fit_transform(data['is_there_an_emotion_directed_at_a_brand_or_product'])
y = to_categorical(y_numeric)# to_categorical expects numerical input

#### Split the data into train & test

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x_train,x_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=42) # 80% data will be used for training & the rest for testing

#### Define LSTM model

In [30]:
# LSTM is a deep learning model
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size (+1 for OOV token)

In [31]:
#import necessary libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D,Dropout # ID dropout layer
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.optimizers import Adam

In [32]:
model=Sequential()
model.add(Embedding(input_length=max_length,input_dim=vocab_size,output_dim=100)) # gives vectors of dim 100
model.add(SpatialDropout1D(0.3)) # ID dropout layer
model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)))# LSTM layer
model.add(Bidirectional(LSTM(150, dropout=0.3, recurrent_dropout=0.3)))# another LSTM layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(4, activation='softmax')) # Fully connected layer
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy']) # compile the model



In [33]:
# print model summary
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 100)           963800    
                                                                 
 spatial_dropout1d (Spatial  (None, 25, 100)           0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  (None, 25, 200)           160800    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 300)               421200    
 onal)                                                           
                                                                 
 dense (Dense)               (None, 128)               38528     
                                                        

Note: a total of 1592844 trainable parameters are there.

#### Train the model

In [34]:
from tensorflow.keras.callbacks import EarlyStopping

epochs = 20 # how many time training will happen
batch_size = 32 # no of batches in each training

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True) # Stop training when validation loss stops improving
history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, y_test), verbose=2, callbacks=[early_stopping])

Epoch 1/20
228/228 - 104s - loss: 0.9342 - accuracy: 0.5919 - val_loss: 0.8801 - val_accuracy: 0.6091 - 104s/epoch - 455ms/step
Epoch 2/20
228/228 - 80s - loss: 0.8340 - accuracy: 0.6431 - val_loss: 0.8159 - val_accuracy: 0.6416 - 80s/epoch - 352ms/step
Epoch 3/20
228/228 - 88s - loss: 0.7625 - accuracy: 0.6774 - val_loss: 0.8192 - val_accuracy: 0.6427 - 88s/epoch - 386ms/step
Epoch 4/20
228/228 - 77s - loss: 0.7211 - accuracy: 0.7034 - val_loss: 0.8035 - val_accuracy: 0.6520 - 77s/epoch - 336ms/step
Epoch 5/20
228/228 - 75s - loss: 0.6889 - accuracy: 0.7166 - val_loss: 0.8008 - val_accuracy: 0.6614 - 75s/epoch - 331ms/step
Epoch 6/20
228/228 - 77s - loss: 0.6653 - accuracy: 0.7238 - val_loss: 0.8251 - val_accuracy: 0.6427 - 77s/epoch - 339ms/step
Epoch 7/20
228/228 - 74s - loss: 0.6422 - accuracy: 0.7416 - val_loss: 0.8668 - val_accuracy: 0.6267 - 74s/epoch - 326ms/step
Epoch 8/20
228/228 - 76s - loss: 0.6241 - accuracy: 0.7459 - val_loss: 0.8690 - val_accuracy: 0.6553 - 76s/epoch - 3

Note: training accuracy is 0.74 and validation accuracy is 0.65. So model is little bit overfitting.

#### Evaluate the model

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
y_pred = (model.predict(x_test) > 0.5).astype('int32')
accuracy_score(y_test,y_pred)



0.5909840571742716

Note: We got the accuracy score as 0.59


#### Test on a new data

In [40]:
x_sample='Just got my hands on the new iPhone 11! The camera is amazing and the battery life is a game-changer. Loving the new colors too. #iPhone11 #Apple'
x_sample=preprocess_text(x_sample)
x_sample=tokenizer.texts_to_sequences([x_sample])
x_sample=pad_sequences(x_sample,maxlen=max_length,padding='post')
y_pred = (model.predict(x_sample) > 0.5).astype('int32')
y_pred



array([[0, 0, 0, 1]], dtype=int32)

In [41]:
# Get the index of the predicted class, to get the class no
predicted_index = y_pred.argmax(axis=-1)[0]
# Inverse transform to get the class name
predicted_class = encoder_obj.inverse_transform([predicted_index])[0]
print(f"Predicted class: {predicted_class}")

Predicted class: Positive emotion
