In [5]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.regularizers import l2
from bayes_opt import BayesianOptimization

In [3]:
!pip install bayesian-optimization
from tensorflow.keras.preprocessing.sequence import pad_sequences

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.4.3




In [7]:
class TweetClassificationModel:
    def __init__(self, data_file):
        # Load the dataset
        self.df = pd.read_csv(data_file)
        self.df = self.df.drop_duplicates('Tweet', keep='first')
        self.df["Cleaned_Tweet"] = [re.sub(r"(@[A-Za-z0-9_]+)|[^\w\s]|#|http\S+", "", v.lower()) for v in self.df["Tweet"]]
        self.df.drop('Tweet', axis=1, inplace=True)
        vals_to_replace = {'Relevant': 1, 'Not Relevant': 0}
        self.df['Category'] = self.df['Category'].map(vals_to_replace)
        self.df.dropna(inplace=True)
        self.df['Category'].unique()

        # Train-test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.df['Cleaned_Tweet'], self.df['Category'], test_size=0.2, random_state=42)

        # Initialize Tokenizer
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.X_train)
        self.max_sequence_length = 100

    def preprocess_data(self):
        # Tokenize and pad the sequences
        X_train_sequences = self.tokenizer.texts_to_sequences(self.X_train)
        X_test_sequences = self.tokenizer.texts_to_sequences(self.X_test)
        self.X_train_padded = pad_sequences(X_train_sequences, maxlen=self.max_sequence_length)
        self.X_test_padded = pad_sequences(X_test_sequences, maxlen=self.max_sequence_length)

    def build_model(self, optimizer, dropout_rate, lstm_units):
        # Build the LSTM model
        self.model = Sequential()
        self.model.add(Embedding(len(self.tokenizer.word_index) + 1, 32, input_length=self.max_sequence_length))
        self.model.add(LSTM(lstm_units, return_sequences=True))
        self.model.add(LSTM(lstm_units))
        self.model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))  # L2 regularization
        self.model.add(Dropout(dropout_rate))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    def train_model(self, epochs=5, batch_size=32):
        # Train the LSTM model
        history = self.model.fit(self.X_train_padded, self.y_train, validation_data=(self.X_test_padded, self.y_test), epochs=epochs, batch_size=batch_size)
        return history

    def evaluate_model(self):
        # Evaluate the model on the test set
        y_pred = self.model.predict(self.X_test_padded)
        auc = roc_auc_score(self.y_test, y_pred)
        print("AUC:", auc)
        return y_pred

    def plot_roc_curve(self, y_pred):
        # Plot the ROC curve
        auc = roc_auc_score(self.y_test, y_pred)
        fpr, tpr, thresholds = roc_curve(self.y_test, y_pred)
        plt.figure()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.legend(loc="lower right")
        plt.show()

    def plot_learning_curves(self, history):
        # Plot the learning curves
        plt.figure()
        plt.plot(history.history['accuracy'], label='Training Accuracy')
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.show()

        plt.figure()
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

    def bayesian_optimization(self):
        # Define the hyperparameter search space
        pbounds = {
            'learning_rate': (1e-5, 1e-2),
            'dropout_rate': (0.1, 0.5),
            'lstm_units': (32, 128)
        }

        def evaluate_model(learning_rate, dropout_rate, lstm_units):
            # Build the model with the given hyperparameters
            optimizer = Adam(learning_rate=learning_rate)
            self.build_model(optimizer, dropout_rate, int(lstm_units))
            # Train the model with the current hyperparameters
            history = self.train_model(epochs=5, batch_size=32)
            # Return the validation accuracy as the objective to maximize
            return history.history['val_accuracy'][-1]

        # Initialize Bayesian Optimization
        optimizer = BayesianOptimization(
            f=evaluate_model,
            pbounds=pbounds,
            verbose=2,
            random_state=42
        )
        # Perform optimization
        optimizer.maximize(init_points=5, n_iter=10)

        # Get the best hyperparameters
        best_hyperparams = optimizer.max['params']
        print("Best Hyperparameters:")
        print(best_hyperparams)

        # Train the model with the best hyperparameters and more epochs
        optimizer = Adam(learning_rate=best_hyperparams['learning_rate'])
        self.build_model(optimizer, best_hyperparams['dropout_rate'], int(best_hyperparams['lstm_units']))
        history = self.train_model(epochs=100, batch_size=32)

        # Evaluate and plot ROC curve with the best model
        y_pred = self.evaluate_model()
        self.plot_roc_curve(y_pred)

        # Plot learning curves of the best model
        self.plot_learning_curves(history)


In [None]:
data_file = "C:/Users/yaswa/Downloads/Group3-news-text-sanitized.csv"
model = TweetClassificationModel(data_file)
model.preprocess_data()
model.bayesian_optimization()


|   iter    |  target   | dropou... | learni... | lstm_u... |
-------------------------------------------------------------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
| [0m1        [0m | [0m0.7807   [0m | [0m0.2498   [0m | [0m0.009508 [0m | [0m102.3    [0m |
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
| [0m2        [0m | [0m0.7773   [0m | [0m0.3395   [0m | [0m0.001569 [0m | [0m46.98    [0m |
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
| [0m3        [0m | [0m0.7778   [0m | [0m0.1232   [0m | [0m0.008663 [0m | [0m89.71    [0m |
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
| [95m4        [0m | [95m0.7822   [0m | [95m0.3832   [0m | [95m0.0002156[0m | [95m125.1    [0m |
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
| [0m5        [0m | [0m0.7714   [0m | [0m0.433    [0m | [0m0.002131 [0m | [0m49.46    [0m |
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
| [0m6        [0m | [0m0.7793   [0m | [0m0.4817   [0m |