In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

!pip install keras-tuner
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from kerastuner import HyperModel, BayesianOptimization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK data packages this notebook relies on: the English
# stop-word list, the Punkt tokenizer models and the WordNet corpus
# (used via stopwords, word_tokenize and WordNetLemmatizer further down).
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab'
# and 'omw-1.4' -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Load the raw tweet dataset and preview its first rows.
# NOTE(review): the absolute /content/ path looks like a Colab upload
# location -- adjust it when running anywhere else.
tweets = pd.read_csv("/content/twitterNews.csv")
tweets.head()

In [None]:
# Pick model inputs and targets by column position: column 10 supplies the
# text that is later passed to preprocess_text, column 1 the labels that are
# later label-encoded.
# NOTE(review): positional indices break silently if the CSV layout changes;
# presumably these are the tweet-text and sentiment columns -- confirm and
# consider selecting by column name.
features = tweets.iloc[:, 10].values
labels = tweets.iloc[:, 1].values

In [None]:
def preprocess_text(features):
    """Clean raw tweet texts and return one normalised string per input.

    For every element of ``features`` the text is stripped of HTML tags,
    URLs and @mentions, reduced to lowercase letters/underscores, cleared of
    isolated single letters, tokenised, filtered against the stop-word list
    and lemmatised.

    Relies on three notebook-level names that must exist before the call:
    ``word_tokenize`` (tokeniser), ``stop_words`` (container of words to
    drop) and ``lemmatizer`` (object exposing ``lemmatize(word)``).

    Args:
        features: iterable of raw texts. Elements are converted with
            ``str()``; ``None`` and float NaN are treated as empty text.

    Returns:
        list[str]: cleaned texts, same length and order as ``features``.
    """
    # Compile once instead of re-resolving each pattern on every tweet.
    markup_re = re.compile(r'<[^>]+>|http\S+|@\w+')
    non_letter_re = re.compile(r'\W|\d+')
    # The second alternative is anchored with ^ so that a single letter at
    # the very start of the text is removed too. (It used to be written
    # r'\^...', which looks for a literal caret that can never be present
    # once non-word characters have been replaced by spaces.)
    single_char_re = re.compile(r'\s+[a-zA-Z]\s+|^[a-zA-Z]\s+')
    whitespace_re = re.compile(r'\s+')
    bytes_prefix_re = re.compile(r'^b\s+')

    processed_features_list = []

    for sentence in features:
        # Missing values would otherwise be stringified into the bogus
        # tokens "none" / "nan". NaN is the only float unequal to itself.
        # An empty string is still appended so the output stays aligned
        # with the input (and therefore with the labels).
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            processed_features_list.append('')
            continue

        # Remove HTML tags, URLs, and mentions starting with '@'
        text = markup_re.sub('', str(sentence))

        # Replace special characters, punctuation and numbers with spaces
        text = non_letter_re.sub(' ', text)

        # Convert to lowercase
        text = text.lower()

        # Remove isolated single letters (in the middle or at the start)
        text = single_char_re.sub(' ', text)

        # Collapse runs of whitespace and trim both ends
        text = whitespace_re.sub(' ', text).strip()

        # Drop a leading 'b' left over from stringified bytes (b'...')
        text = bytes_prefix_re.sub('', text)

        # Tokenize, drop stop words, lemmatize and re-join
        words = word_tokenize(text)
        processed_text = ' '.join(
            lemmatizer.lemmatize(word) for word in words if word not in stop_words
        )

        processed_features_list.append(processed_text)

    return processed_features_list


In [None]:
# Notebook-level resources that preprocess_text reads as globals; both must
# be defined before the function is called.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Clean every raw text; the result holds one string per element of `features`.
processed_features = preprocess_text(features)


In [None]:
# Tokenization and padding: turn each cleaned text into a sequence of integer
# word ids and pad the sequences into one 2-D array.
# NOTE(review): the tokenizer is fitted on all rows before the train/test
# split further down, so test-set vocabulary influences the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(processed_features)
# Full word -> id mapping learned from the corpus.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(processed_features)
# padding='post' puts the padding after the tokens; no maxlen is given, so
# presumably rows are padded to the longest sequence -- confirm.
padded_sequences = pad_sequences(sequences,padding='post')

In [None]:
# Encoder that maps the textual labels to integer class ids
label_encoder = LabelEncoder()

# Learn the label set and convert every label to its integer code;
# label_encoder.classes_ keeps the original label for each code.
encoded_labels = label_encoder.fit_transform(labels)

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle.
# NOTE(review): the split is not stratified even though class weights are
# computed afterwards for an imbalanced dataset -- consider passing
# stratify=encoded_labels.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, encoded_labels, test_size=0.2, random_state=0)

In [None]:
# Inverse-frequency class weights for the imbalanced dataset:
# weight(class) = number of training samples / samples of that class.
class_counts = np.bincount(y_train)
print("class_counts : ",class_counts)
total_samples = class_counts.sum()
print("total_samples : ",total_samples)
# Vectorised division, then pair every weight with its class id.
class_weights = dict(enumerate(total_samples / class_counts))
print("class_weights : ",class_weights)


In [None]:
# Length shared by every padded sequence (second axis of the padded matrix).
max_len = padded_sequences.shape[1]

# Vocabulary size for the Embedding layer: every word id plus the padding
# id 0. When the tokenizer is capped with num_words, texts_to_sequences only
# emits ids below that cap, so sizing the embedding for the full vocabulary
# would allocate rows that can never be looked up.
max_words = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    max_words = min(max_words, tokenizer.num_words)

In [None]:

class SentimentHyperModel(HyperModel):
    """Keras Tuner search space for the tweet sentiment classifier.

    Architecture: Embedding -> Bidirectional LSTM (per-timestep outputs) ->
    global max pooling over time -> Dense/ReLU -> Dropout -> softmax.

    Reads the notebook globals ``max_words`` (embedding vocabulary size) and
    ``max_len`` (padded sequence length) at build time.

    Args:
        *args, **kwargs: forwarded unchanged to ``HyperModel.__init__``.
        num_classes: keyword-only width of the softmax output layer.
            Defaults to 3, the value that was previously hard-coded.
    """

    def __init__(self, *args, num_classes=3, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_classes = num_classes

    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: hyperparameter container supplied by the tuner; the tuned
                values are ``embedding_dim``, ``lstm_units``, ``dense_units``,
                ``dense_dropout`` and ``learning_rate``.

        Returns:
            A compiled ``Sequential`` model trained with sparse categorical
            cross-entropy (integer labels) and tracking accuracy.
        """
        model = Sequential()
        model.add(Embedding(input_dim=max_words, output_dim=hp.Int('embedding_dim', min_value=64, max_value=256, step=32), input_length=max_len))
        model.add(Bidirectional(LSTM(units=hp.Int('lstm_units', min_value=64, max_value=128, step=32), return_sequences=True)))
        # The LSTM returns one vector per timestep (return_sequences=True);
        # pooling over the time axis collapses them into a single vector for
        # the dense layers, so this layer is required.
        model.add(GlobalMaxPooling1D())
        model.add(Dense(units=hp.Int('dense_units', min_value=64, max_value=256, step=32), activation='relu'))
        model.add(Dropout(rate=hp.Float('dense_dropout', min_value=0.2, max_value=0.6, step=0.1)))
        model.add(Dense(units=self.num_classes, activation='softmax'))
        # Upper bound lowered from 1.0 to 1e-2: with log sampling the old
        # range spent about half of the trials on rates far too large for
        # Adam to converge.
        optimizer = Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log'))
        loss = SparseCategoricalCrossentropy()
        model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
        return model


In [None]:
# Instantiate the search space that is handed to the tuner.
hypermodel = SentimentHyperModel()

In [None]:
# Mini-batch size later passed to tuner_bayesian.search()
batch_size = 32

# Bayesian-optimization tuner: runs at most 10 trials over the hypermodel's
# search space and ranks them by validation accuracy.
# NOTE(review): trial state is stored under directory/project_name; a re-run
# presumably resumes from what is already there -- confirm, or clear the
# directory when a fresh search is wanted.
tuner_bayesian = BayesianOptimization(
    hypermodel,
    objective='val_accuracy',
    max_trials=10,
    directory='./bayesian_batch2',
    project_name='sentiment_analysis_bayesian'
)


In [None]:
# Training callbacks used for every trial. Neither sets `monitor`, so both
# watch the Keras default metric (val_loss).
callbacks = [
    # Patience must be smaller than the number of epochs or the callback can
    # never fire: it was 10 with epochs=10, which made it a no-op. 5 still
    # leaves room for ReduceLROnPlateau (patience=2) to lower the learning
    # rate and take effect before training is stopped.
    EarlyStopping(patience=5, restore_best_weights=True),
    # Multiply the learning rate by 0.2 after 2 epochs without improvement.
    ReduceLROnPlateau(factor=0.2, patience=2)
]

# Run the hyperparameter search. 20% of the training data is held out for
# validation in each trial and the class weights counter the class imbalance.
tuner_bayesian.search(
    X_train,
    y_train,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=callbacks
)

In [None]:
# Take the top-ranked model from the finished search and save it to disk.
best_model = tuner_bayesian.get_best_models(num_models=1)[0]
best_model.save("model.h5")
# Evaluate the best model on the held-out test split; evaluate() yields the
# loss followed by the accuracy metric the model was compiled with.
test_loss_bayesian, test_accuracy_bayesian = best_model.evaluate(X_test, y_test)
print("\nBayesian Optimization:")
print("Test Loss:", test_loss_bayesian)
print("Test Accuracy:", test_accuracy_bayesian)

In [None]:
# Confusion matrix of the best model on the held-out test set

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Per-class probabilities for every test sample
y_pred = best_model.predict(X_test)

# Select the class with the highest probability as the predicted class
y_pred_classes = np.argmax(y_pred, axis=-1)

# Original label names in encoded order, so the axes show readable classes
# instead of bare integer codes.
class_names = [str(name) for name in label_encoder.classes_]

# Passing labels= keeps one row/column per known class, in the same order as
# class_names, even if a class is absent from the test split or predictions.
cm = confusion_matrix(y_test, y_pred_classes, labels=np.arange(len(class_names)))

# Display confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
