RNN stands for Recurrent Neural Network.
It's a type of neural network that's designed for sequence data, like time series, text, or speech.
It remembers information from previous steps (it has a "memory"), which is important when what happened before matters (like understanding a sentence word by word).
LSTM stands for Long Short-Term Memory.
It's a special type of RNN that solves a big problem normal RNNs have: they forget things too quickly (this is called the "vanishing gradient problem").
RNN = good for sequences, but forgets too easily.
LSTM = smarter RNN that remembers better over long sequences.

In [17]:
#Step1 -import required libraries and Load the CSV file as a TensorFlow dataset

import tensorflow as tf
import pandas as pd
# Try ISO-8859-1 encoding

dataset = tf.data.experimental.make_csv_dataset(
    "/content/sample_data/Copy of Sentiment.csv",
    encoding='ISO-8859-1',
    batch_size=1,  # Use batch_size=1 to treat each row as one example
    num_epochs=1,
    ignore_errors=True)





In [48]:
#Step2-Convert the dataset to a list of dictionaries
data = []
for batch in dataset:
    row = {key: value.numpy()[0]for key, value in batch.items()}
    data.append(row)

In [27]:
#Step3-Remove UTF charcter
import pandas as pd

# Specify the encoding manually
df = pd.read_csv( "/content/sample_data/Copy of Sentiment.csv", encoding='ISO-8859-1')
df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


In [28]:
pip install tensorflow



In [30]:
df['text']

Unnamed: 0,text
0,"I`d have responded, if I were going"
1,Sooo SAD I will miss you here in San Diego!!!
2,my boss is bullying me...
3,what interview! leave me alone
4,"Sons of ****, why couldn`t they put them on t..."
...,...
27476,wish we could come see u on Denver husband l...
27477,I`ve wondered about rake to. The client has ...
27478,Yay good for both of you. Enjoy the break - y...
27479,But it was worth it ****.


In [36]:
#Step4:Text Preprocessing: Preprocess the text data by tokenizing it into words, padding sequences to a fixed length, and converting labels to numerical format.
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fill NaNs with empty strings and convert all entries to string type
df['text'] = df['text'].fillna('').astype(str)
# 1: Tokenize text into words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
#2.Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df['text'])
#3.Pad sequences to a fixed length
max_sequence_length = 10  # You can choose the length based on your needs
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
X=padded_sequences
#4.Convert labels to numerical format
label_encoder = LabelEncoder()
numerical_labels = label_encoder.fit_transform(df['sentiment'])
y=numerical_labels
print("Word Index:", tokenizer.word_index)
print("Sequences:", sequences)
print("Padded Sequences:\n", padded_sequences)
print("Numerical Labels:", numerical_labels)


Sequences: [[1, 165, 19, 10456, 73, 1, 150, 48], [423, 119, 1, 62, 94, 7, 91, 10, 1469, 2304], [5, 1367, 9, 10457, 17], [52, 1171, 353, 17, 497], [3945, 13, 117, 472, 14, 72, 331, 132, 16, 3, 7113, 50, 215, 577], [41, 395, 10458, 63, 10459, 71, 10460, 5551, 11, 3, 164, 5552, 2654, 16, 1321], [3141, 10461, 11, 3, 297, 38, 105, 84, 92, 9, 29, 2305, 8, 10462], [578, 579], [419, 13, 7], [2306, 249, 56, 22, 2456, 3484, 538, 9, 15, 1274], [82, 90, 82, 1, 46, 2, 25, 3142, 1, 3485, 3, 3946, 38, 7114, 229, 1, 21, 177, 129, 36, 5, 703, 8, 321], [1, 60, 60, 39, 3, 291, 46, 782, 125, 1141, 2307], [5, 10463, 9, 539, 10464, 916, 16, 5553], [1, 78, 2, 40, 2, 319, 120, 20, 1, 316, 5, 1058], [612, 612, 55, 3, 4566, 10465], [1470, 87, 1, 67, 2865], [12, 175, 224, 2, 4567, 7115, 82, 50, 1597, 593], [1, 123, 104, 199, 11, 3, 661, 326, 154, 8, 3486, 5, 332, 242, 7116, 73, 1, 410, 19, 4, 1666, 16, 6, 112, 200, 41, 298, 63, 10466], [9, 61, 76, 30, 129, 94, 426, 54], [1142, 22, 27, 15, 243, 7], [87, 10467, 1,

In [41]:
#Step5:Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

array([[ 1218,   173,    97, ...,     0,     0,     0],
       [    7,   142,     0, ...,     0,     0,     0],
       [   87,   157,     1, ...,     2,  1355,   120],
       ...,
       [   11,     3,    32, ...,   590,    13,   212],
       [  260,     3,   294, ...,     0,     0,     0],
       [24448,  6593,   136, ...,   262,   312,  1250]], dtype=int32)

In [45]:
#Step6:Build LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because indexing starts from 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64, input_shape=(max_sequence_length,)))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(np.unique(y)), activation='softmax'))


model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [None]:
#Step7:Fit the model
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))


Epoch 1/10
[1m1374/1374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 24ms/step - accuracy: 0.5202 - loss: 0.9449 - val_accuracy: 0.6454 - val_loss: 0.7924
Epoch 2/10
[1m1374/1374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 25ms/step - accuracy: 0.7417 - loss: 0.6280 - val_accuracy: 0.6484 - val_loss: 0.7948
Epoch 3/10
[1m1374/1374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 25ms/step - accuracy: 0.8252 - loss: 0.4516 - val_accuracy: 0.6165 - val_loss: 0.8917
Epoch 4/10
[1m1374/1374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 23ms/step - accuracy: 0.8733 - loss: 0.3338 - val_accuracy: 0.6307 - val_loss: 1.0473
Epoch 5/10
[1m1374/1374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 25ms/step - accuracy: 0.9078 - loss: 0.2459 - val_accuracy: 0.6285 - val_loss: 1.1531
Epoch 6/10
[1m 121/1374[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m26s[0m 21ms/step - accuracy: 0.9400 - loss: 0.1718

In [47]:
#Step8:Evaluate performance of LSTM
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5913 - loss: 2.1564
Test Accuracy: 0.59
