In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [3]:
# Function to clean text
def clean_text(text):
    # Remove non-alphanumeric characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into sentence
    text = ' '.join(tokens)
    return text

In [4]:
# Step 1: Data Loading and Preprocessing
data = pd.read_csv('Sentiment.csv')
data = data[['candidate', 'sentiment', 'text']]

In [5]:
# Drop rows with missing values in 'candidate' column
data = data.dropna(subset=['candidate'])

In [6]:
# Preprocessing
data['clean_text'] = data['text'].apply(clean_text)

In [7]:
# Step 2: Text Vectorization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['clean_text'])
X = tokenizer.texts_to_sequences(data['clean_text'])
X = pad_sequences(X, maxlen=100)  # Assuming a maximum sequence length of 100

In [8]:
# Encoding the sentiment labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['sentiment'])

In [9]:
# Encoding the candidate labels
candidate_encoder = LabelEncoder()
data['candidate_encoded'] = candidate_encoder.fit_transform(data['candidate'])

In [10]:
# Step 3: Model Development
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
# Step 4: Model Training
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=2)

Epoch 1/10
173/173 - 24s - 137ms/step - accuracy: 0.6258 - loss: 0.8724 - val_accuracy: 0.6548 - val_loss: 0.7500
Epoch 2/10
173/173 - 17s - 98ms/step - accuracy: 0.7380 - loss: 0.6433 - val_accuracy: 0.6802 - val_loss: 0.7214
Epoch 3/10
173/173 - 17s - 101ms/step - accuracy: 0.8123 - loss: 0.4761 - val_accuracy: 0.6773 - val_loss: 0.7826
Epoch 4/10
173/173 - 17s - 100ms/step - accuracy: 0.8683 - loss: 0.3510 - val_accuracy: 0.6570 - val_loss: 0.8619
Epoch 5/10
173/173 - 17s - 100ms/step - accuracy: 0.8984 - loss: 0.2763 - val_accuracy: 0.6603 - val_loss: 0.9397
Epoch 6/10
173/173 - 17s - 98ms/step - accuracy: 0.9096 - loss: 0.2363 - val_accuracy: 0.6537 - val_loss: 1.0337
Epoch 7/10
173/173 - 17s - 100ms/step - accuracy: 0.9201 - loss: 0.2081 - val_accuracy: 0.6563 - val_loss: 1.1594
Epoch 8/10
173/173 - 17s - 97ms/step - accuracy: 0.9268 - loss: 0.1889 - val_accuracy: 0.6555 - val_loss: 1.1663
Epoch 9/10
173/173 - 17s - 96ms/step - accuracy: 0.9315 - loss: 0.1745 - val_accuracy: 0.65

In [12]:
# Print model summary
print(model.summary())

None


In [13]:
# Step 5: Model Evaluation
print("Evaluating the model...")
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

Evaluating the model...
Validation Loss: 1.2993, Validation Accuracy: 0.6653


# These metrics give an indication of how well the model is performing on unseen data. 
