In [16]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score

# Load the data
file_path = 'bert train data.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Basic text cleaning function
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    return text

# Apply text cleaning to the 'Text' column
df['Text'] = df['Text'].apply(clean_text)

# Encode the sentiment labels
label_encoder = LabelEncoder()
df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Sentiment'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Convert sparse tensor to dense
X_train_tfidf_dense = X_train_tfidf.toarray()
X_test_tfidf_dense = X_test_tfidf.toarray()

# Build the TF-IDF + Neural Network model
model_tfidf = Sequential()
model_tfidf.add(Dense(512, input_dim=X_train_tfidf_dense.shape[1], activation='relu'))
model_tfidf.add(Dropout(0.5))
model_tfidf.add(Dense(256, activation='relu'))
model_tfidf.add(Dropout(0.5))
model_tfidf.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model_tfidf.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model_tfidf.fit(X_train_tfidf_dense, y_train, epochs=10, batch_size=32, validation_data=(X_test_tfidf_dense, y_test))

# Evaluate the model
y_pred_tfidf = model_tfidf.predict(X_test_tfidf_dense)
y_pred_tfidf_classes = np.argmax(y_pred_tfidf, axis=1)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf_classes)
f1_tfidf = f1_score(y_test, y_pred_tfidf_classes, average='macro')

print(f"TF-IDF Model Accuracy: {accuracy_tfidf}")
print(f"TF-IDF Model F1 Score: {f1_tfidf}")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.4408 - loss: 1.0950 - val_accuracy: 0.5581 - val_loss: 1.0554
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4623 - loss: 1.0584 - val_accuracy: 0.6047 - val_loss: 1.0177
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5226 - loss: 1.0396 - val_accuracy: 0.6279 - val_loss: 0.9871
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5656 - loss: 0.9948 - val_accuracy: 0.6047 - val_loss: 0.9571
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5940 - loss: 0.9460 - val_accuracy: 0.5116 - val_loss: 0.9253
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6293 - loss: 0.8866 - val_accuracy: 0.5116 - val_loss: 0.8911
Epoch 7/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [18]:
# Tokenize the text for word embeddings
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Build the Word Embeddings + LSTM model
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=10000, output_dim=128, input_length=maxlen))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model_lstm.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluate the model
y_pred_lstm = model_lstm.predict(X_test_pad)
y_pred_lstm_classes = np.argmax(y_pred_lstm, axis=1)
accuracy_lstm = accuracy_score(y_test, y_pred_lstm_classes)
f1_lstm = f1_score(y_test, y_pred_lstm_classes, average='macro')

print(f"LSTM Model Accuracy: {accuracy_lstm}")
print(f"LSTM Model F1 Score: {f1_lstm}")


Epoch 1/10




[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 219ms/step - accuracy: 0.3861 - loss: 1.0962 - val_accuracy: 0.5116 - val_loss: 1.0489
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 155ms/step - accuracy: 0.3989 - loss: 1.0653 - val_accuracy: 0.5116 - val_loss: 0.9817
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 154ms/step - accuracy: 0.4786 - loss: 1.0216 - val_accuracy: 0.4651 - val_loss: 0.9745
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 269ms/step - accuracy: 0.6444 - loss: 1.0062 - val_accuracy: 0.6279 - val_loss: 0.9614
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 292ms/step - accuracy: 0.6366 - loss: 0.9691 - val_accuracy: 0.3721 - val_loss: 0.9395
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154ms/step - accuracy: 0.5005 - loss: 0.8962 - val_accuracy: 0.5116 - val_loss: 0.8660
Epoch 7/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [19]:
# Results summary
results = {
    "Model": ["TF-IDF + Neural Network", "Word Embeddings + LSTM"],
    "Accuracy": [accuracy_tfidf, accuracy_lstm],
    "F1 Score": [f1_tfidf, f1_lstm]
}

results_df = pd.DataFrame(results)
print(results_df)


                     Model  Accuracy  F1 Score
0  TF-IDF + Neural Network  0.627907  0.716153
1   Word Embeddings + LSTM  0.697674  0.747317


In [20]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Hyperparameters
input_dim = X_train_tfidf_dense.shape[1]
learning_rate = 0.001
dropout_rate = 0.5
batch_size = 32
epochs = 20

# Build the TF-IDF + Neural Network model with fine-tuning
model_tfidf = Sequential()
model_tfidf.add(Dense(512, input_dim=input_dim, activation='relu'))
model_tfidf.add(Dropout(dropout_rate))
model_tfidf.add(Dense(256, activation='relu'))
model_tfidf.add(Dropout(dropout_rate))
model_tfidf.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
optimizer = Adam(learning_rate=learning_rate)
model_tfidf.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model_tfidf.fit(X_train_tfidf_dense, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test_tfidf_dense, y_test), callbacks=[early_stopping])

# Evaluate the model
y_pred_tfidf = model_tfidf.predict(X_test_tfidf_dense)
y_pred_tfidf_classes = np.argmax(y_pred_tfidf, axis=1)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf_classes)
f1_tfidf = f1_score(y_test, y_pred_tfidf_classes, average='macro')

print(f"TF-IDF Model Accuracy: {accuracy_tfidf}")
print(f"TF-IDF Model F1 Score: {f1_tfidf}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.4038 - loss: 1.0992 - val_accuracy: 0.4651 - val_loss: 1.0645
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5092 - loss: 1.0679 - val_accuracy: 0.5581 - val_loss: 1.0225
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5703 - loss: 1.0337 - val_accuracy: 0.5581 - val_loss: 0.9897
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5293 - loss: 0.9953 - val_accuracy: 0.5581 - val_loss: 0.9598
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6090 - loss: 0.9461 - val_accuracy: 0.4884 - val_loss: 0.9339
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6141 - loss: 0.9133 - val_accuracy: 0.4419 - val_loss: 0.9101
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━

In [21]:
# Hyperparameters
max_features = 10000
embedding_dim = 128
maxlen = 100
learning_rate = 0.001
dropout_rate = 0.2
recurrent_dropout_rate = 0.2
batch_size = 32
epochs = 20

# Build the Word Embeddings + LSTM model with fine-tuning
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=maxlen))
model_lstm.add(LSTM(128, dropout=dropout_rate, recurrent_dropout=recurrent_dropout_rate))
model_lstm.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
optimizer = Adam(learning_rate=learning_rate)
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model_lstm.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

# Evaluate the model
y_pred_lstm = model_lstm.predict(X_test_pad)
y_pred_lstm_classes = np.argmax(y_pred_lstm, axis=1)
accuracy_lstm = accuracy_score(y_test, y_pred_lstm_classes)
f1_lstm = f1_score(y_test, y_pred_lstm_classes, average='macro')

print(f"LSTM Model Accuracy: {accuracy_lstm}")
print(f"LSTM Model F1 Score: {f1_lstm}")

train_data = pd.DataFrame({
    'Text': X_train,
    'Actual Sentiment': y_train,
    'Predicted Sentiment': y_pred_lstm_train_classes
})

test_data = pd.DataFrame({
    'Text': X_test,
    'Actual Sentiment': y_test,
    'Predicted Sentiment': y_pred_lstm_classes
})

# Combine train and test data
combined_data = pd.concat([train_data, test_data])

# Map integer labels back to original labels
combined_data['Actual Sentiment'] = label_encoder.inverse_transform(combined_data['Actual Sentiment'])
combined_data['Predicted Sentiment'] = label_encoder.inverse_transform(combined_data['Predicted Sentiment'])

# Save the output dataframe to a CSV file
output_file_path = 'output_predictions_lstm.csv'  # Update with your desired file path
combined_data.to_csv(output_file_path, index=False)
print(f"Predictions saved to {output_file_path}")


Epoch 1/20




[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 218ms/step - accuracy: 0.4188 - loss: 1.0952 - val_accuracy: 0.4186 - val_loss: 1.0634
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 169ms/step - accuracy: 0.5669 - loss: 1.0685 - val_accuracy: 0.5814 - val_loss: 1.0072
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 173ms/step - accuracy: 0.5341 - loss: 1.0462 - val_accuracy: 0.6047 - val_loss: 0.9816
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 152ms/step - accuracy: 0.6115 - loss: 1.0078 - val_accuracy: 0.5814 - val_loss: 0.9855
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 153ms/step - accuracy: 0.6487 - loss: 0.9780 - val_accuracy: 0.5814 - val_loss: 0.9406
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 151ms/step - accuracy: 0.6862 - loss: 0.9000 - val_accuracy: 0.6512 - val_loss: 0.8692
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [22]:
# Results summary
results = {
    "Model": ["TF-IDF + Neural Network", "Word Embeddings + LSTM"],
    "Accuracy": [accuracy_tfidf, accuracy_lstm],
    "F1 Score": [f1_tfidf, f1_lstm]
}

results_df = pd.DataFrame(results)
print(results_df)


                     Model  Accuracy  F1 Score
0  TF-IDF + Neural Network  0.627907  0.693506
1   Word Embeddings + LSTM  0.720930  0.784091
