In [9]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import nltk

# Download necessary NLTK data files
nltk.download("stopwords")
nltk.download("punkt")

# Step 1: Data Collection
# Load the dataset
df = pd.read_csv("sentiment_analysis.csv", encoding="latin1")

# Step 2: Text Preprocessing
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def preprocess_text(text):
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\d+", "", text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)


# Apply the preprocess_text function
df["selected_text"] = df["selected_text"].fillna("")
df["clean_text"] = df["selected_text"].apply(preprocess_text)

# Step 3: Feature Extraction
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["clean_text"])
X = tokenizer.texts_to_sequences(df["clean_text"])
X = pad_sequences(X, maxlen=100)
y = pd.get_dummies(df["sentiment"]).values

# Step 4: Model Selection and Training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=2)

# Step 6: Model Evaluation
y_pred = model.predict(X_test, batch_size=32, verbose=2)
y_pred = y_pred.argmax(axis=1)
y_test = y_test.argmax(axis=1)

print("LSTM Accuracy:", accuracy_score(y_test, y_pred))
print("LSTM Classification Report:")
print(classification_report(y_test, y_pred))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apspk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apspk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/10
619/619 - 56s - 90ms/step - accuracy: 0.7080 - loss: 0.6808 - val_accuracy: 0.8104 - val_loss: 0.5103
Epoch 2/10
619/619 - 64s - 103ms/step - accuracy: 0.8327 - loss: 0.4450 - val_accuracy: 0.8154 - val_loss: 0.4950
Epoch 3/10
619/619 - 61s - 99ms/step - accuracy: 0.8606 - loss: 0.3816 - val_accuracy: 0.8204 - val_loss: 0.5125
Epoch 4/10
619/619 - 58s - 93ms/step - accuracy: 0.8713 - loss: 0.3444 - val_accuracy: 0.8122 - val_loss: 0.5244
Epoch 5/10
619/619 - 50s - 80ms/step - accuracy: 0.8860 - loss: 0.3168 - val_accuracy: 0.8131 - val_loss: 0.5431
Epoch 6/10
619/619 - 52s - 85ms/step - accuracy: 0.8949 - loss: 0.2934 - val_accuracy: 0.8049 - val_loss: 0.5830
Epoch 7/10
619/619 - 53s - 85ms/step - accuracy: 0.9015 - loss: 0.2732 - val_accuracy: 0.8040 - val_loss: 0.6123
Epoch 8/10
619/619 - 53s - 86ms/step - accuracy: 0.9088 - loss: 0.2522 - val_accuracy: 0.8045 - val_loss: 0.6455
Epoch 9/10
619/619 - 58s - 94ms/step - accuracy: 0.9126 - loss: 0.2357 - val_accuracy: 0.8049 -

In [11]:
# Save the trained model
model.save("sentiment_analysis_model.h5")

