<a href="https://colab.research.google.com/github/SriRamK345/Sentiment-Analysis-using-LSTM/blob/main/Sentiment_Analysis_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! unzip twitter_training.csv.zip

# Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.regularizers import l2
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import warnings
warnings.filterwarnings("ignore")

# Reading The Data

In [None]:
# The raw CSVs appear to ship without a header row (column names are assigned
# in the "Adding headers" step), so header=None keeps the first record as data
# instead of silently consuming it as column labels.
df_train = pd.read_csv("/content/twitter_training.csv", header=None)
df_test = pd.read_csv("/content/twitter_validation.csv", header=None)

# Explore The Data

In [None]:
# Peek at the first rows of the training frame as loaded
df_train.head()

In [None]:
# Peek at the first rows of the test frame as loaded
df_test.head()

# Adding headers

In [None]:
# Both frames share the same four-column layout, so label them identically
column_names = ['Header1', 'company', 'labels', 'text']
df_test.columns = column_names
df_train.columns = column_names

In [None]:
# Check the training frame again now that the columns are named
df_train.head()

In [None]:
# Check the test frame again now that the columns are named
df_test.head()

In [None]:
# (rows, columns) of the training and test frames
df_train.shape, df_test.shape

In [None]:
# Summarise dtypes and non-null counts for BOTH frames
# (the second call previously repeated df_train instead of showing df_test).
df_train.info()
print("\n")
df_test.info()

In [None]:
# Number of fully duplicated rows in the training frame
df_train.duplicated().sum()

In [None]:
# Number of fully duplicated rows in the test frame
df_test.duplicated().sum()

In [None]:
# Missing values per column in the training frame
df_train.isnull().sum()

In [None]:
# Missing values per column in the test frame
df_test.isnull().sum()

# Cleaning the Data

In [None]:
# Remove every training row that has a missing value in any column
# (modifies df_train in place; df_test is not touched by this cell)
df_train.dropna(inplace=True)

In [None]:
# Re-count missing values to confirm the training frame is now complete
df_train.isnull().sum()

In [None]:
# Keep only the first occurrence of each fully duplicated training row
# (modifies df_train in place)
df_train.drop_duplicates(inplace=True)

In [None]:
# Re-count duplicates to confirm none remain in the training frame
df_train.duplicated().sum()

# **Drop Unneeded Columns**

In [None]:
# Only the label and the text are used from here on; discard the other two columns
df_train = df_train.drop(columns=['Header1', 'company'])

In [None]:
# Mirror the training frame: keep only the label and text columns
df_test = df_test.drop(columns=['Header1', 'company'])

In [None]:
# Display the training frame after the column drop
df_train

In [None]:
# Display the test frame after the column drop
df_test

# Preprocessing the Data

In [None]:
# Fetch the NLTK resources needed by the stop-word list and the WordNet lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
number_pattern = re.compile(r"\d+")
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
punctuation_table = str.maketrans("", "", string.punctuation)

def preprocess_test(text):
  """Normalise one raw text into a cleaned, space-separated token string.

  Steps, in order: strip digits, strip ASCII punctuation, lowercase, split on
  whitespace, drop English stop words, lemmatize the remaining tokens.

  Args:
    text: The raw text. Non-string values (for example NaN for a missing
      entry) are treated as empty text instead of raising.

  Returns:
    The cleaned tokens joined by single spaces ("" when nothing remains).
  """
  # Missing entries surface as float NaN in pandas; the regex/translate calls
  # below would raise on them, so map them to an empty document.
  if not isinstance(text, str):
    return ""
  text = number_pattern.sub("", text)  # Remove numbers
  text = text.translate(punctuation_table).lower()  # Remove punctuation and lowercase
  tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]  # Lemmatization & stop word removal
  return " ".join(tokens)

In [None]:
# Clean every text element-wise and store the result in a new column beside the raw text
df_train['train_text'] = df_train['text'].map(preprocess_test)
df_test['test_text'] = df_test['text'].map(preprocess_test)

## Split the Data

In [None]:
# Training set: cleaned texts are the features, the 'labels' column is the target
train_texts, train_labels = df_train['train_text'].values, df_train['labels'].values

# Test set: same pairing of cleaned texts and labels
test_texts, test_labels = df_test['test_text'].values, df_test['labels'].values

In [None]:
# Learn the label -> integer mapping from the training labels only
label_encoder = LabelEncoder().fit(train_labels)

# Encode both sets with that single fitted mapping
train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

In [None]:
# Initialize the tokenizer (pass num_words=... to cap the vocabulary size).
# The OOV token gives words that were not seen during fitting a dedicated
# index instead of silently dropping them from the sequence.
tokenizer = Tokenizer(oov_token="<OOV>")

# Fit the tokenizer on the training texts ONLY. Fitting on the test texts as
# well leaks test vocabulary into the model and can assign test-only words
# indices larger than the embedding size, which is derived from training data.
tokenizer.fit_on_texts(train_texts)

In [None]:
# Replace every text by its list of integer word indices
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_texts, test_texts)
)

In [None]:
# Length of the longest tokenised training text; used as the padded width
maxlen = max(map(len, train_sequences))
print("Maximum sequence length (maxlen):", maxlen)

In [None]:
# Pad at the end of each sequence so both sets become matrices of width maxlen
pad_options = {"maxlen": maxlen, "padding": "post"}
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Embedding Input Size: one row per entry in the tokenizer vocabulary plus
# index 0, which is reserved for padding. Taking it from the vocabulary rather
# than from the largest index seen in the training matrix guarantees that
# every index the tokenizer can emit is a valid embedding row.
input_size = len(tokenizer.word_index) + 1
input_size

# Building the Model

In [None]:
# Define the model: embedding -> two stacked bidirectional LSTMs -> dense head
model = Sequential()

# Size the output layer from the fitted label encoder so it always matches the
# number of distinct classes in the training labels instead of a hard-coded count
num_classes = len(label_encoder.classes_)

# Add an embedding layer that maps each word index to a 100-dimensional vector
# NOTE(review): the "+2" on input_dim looks like spare headroom — confirm it is still needed
model.add(Embedding(input_dim=input_size+2, output_dim=100, input_shape=(maxlen,)))

# Add a bidirectional LSTM layer with 128 units; return_sequences=True hands
# the full output sequence to the next recurrent layer
model.add(Bidirectional(LSTM(128, kernel_regularizer=l2(0.1), return_sequences=True, recurrent_regularizer=l2(0.1))))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Add another bidirectional LSTM layer (64 units) that returns only its final output
model.add(Bidirectional(LSTM(64, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01))))
# Add batch normalization layer
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Add a dense layer with 64 units and ReLU activation
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))

# Add dropout regularization
model.add(Dropout(0.5))

# Add the output layer: one softmax unit per label class
model.add(Dense(num_classes, activation='softmax'))

In [None]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

In [None]:
# Labels are integer-encoded, hence the sparse variant of categorical
# cross-entropy; optimise with Adam and report accuracy during training
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Halt training once validation loss has gone 10 epochs without improving,
# then roll the model back to the best weights observed
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

In [None]:
# Train for at most 50 epochs in mini-batches of 32; the early-stopping
# callback may end the run sooner.
# NOTE(review): the test arrays serve as validation data here (driving early
# stopping) and are also used for the final evaluation — consider a separate
# hold-out split so the reported test score stays unbiased.
fit_options = dict(
    epochs=50,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=(test_padded, test_labels_encoded),
)
history = model.fit(train_padded, train_labels_encoded, **fit_options)

# Evaluate the model

In [None]:
# Score the trained model on the test arrays, then report loss and accuracy
test_metrics = model.evaluate(test_padded, test_labels_encoded)
test_loss, test_accuracy = test_metrics
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

# Visualize training history

In [None]:
# Two side-by-side panels: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# (axes, training-metric key, validation-metric key, panel title, y-axis label)
panels = (
    (accuracy_ax, 'accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'val_loss', 'Model loss', 'Loss'),
)

for ax, train_key, val_key, title, ylabel in panels:
    # One curve per epoch series: training first, validation second
    ax.plot(history.history[train_key])
    ax.plot(history.history[val_key])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper left')

plt.show()