Importing all the necessary libraries
----

In [None]:
# Essential Libraries
import os
import shutil
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import array, asarray, zeros

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, accuracy_score

# Deep Learning Libraries
from keras.models import Sequential
from keras.layers import (
    Activation, Dropout, Dense, Flatten, GlobalMaxPooling1D, Embedding, 
    Conv1D, LSTM, SimpleRNN, Bidirectional, GlobalAveragePooling1D
)
from tensorflow.keras.optimizers import Adam

# TensorFlow Libraries
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# TensorFlow Configuration
tf.get_logger().setLevel('ERROR')

# Visualization
import matplotlib.pyplot as plt

Standard Models
----

In [None]:
# Read the embeddings table; its label column is named 'target'.
data = pd.read_csv('embedding_target.csv')

# Keep the label series and the remaining feature columns under separate names.
target = data['target']
embedding = data.drop(columns=['target'])

In [None]:
# Build the feature matrix and the label vector from the `data` frame loaded above.
# Columns are selected by NAME rather than by position: the previous
# `iloc[:, :-1]` / `iloc[:, -1]` silently assumed 'target' is the last column,
# which would leak the label into the features if the CSV were ever reordered.
X = data.drop(columns=['target'])  # every column except the label
y = data['target']                 # the label column

# Perform train-test split with stratification to maintain label balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the sizes of the splits to verify
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


In [None]:
# Baseline logistic regression. fit() returns the estimator itself, so
# construction and training are chained. Increase max_iter if needed.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out split and print the per-class metrics
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
# Hyper-parameter grid for logistic regression.
#
# * The unpenalised model is requested with `penalty=None`; the string 'none'
#   was deprecated in scikit-learn 1.2 and removed in 1.4, where every such fit
#   fails parameter validation. (On scikit-learn < 1.2 use 'none' instead.)
# * `C` has no effect without a penalty, so it is only swept for 'l2'. This
#   avoids fitting the same unpenalised model once per C value.
lr_solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
lr_param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1],
        'solver': lr_solvers,
    },
    {
        'penalty': [None],
        'solver': lr_solvers,
    },
]

# 5-fold shuffled cross-validation with a fixed seed, scored on accuracy
cv = KFold(n_splits=5, shuffle=True, random_state=42)
lr = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(lr, lr_param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

In [None]:
# Refit logistic regression on the training split using the settings selected
# by the most recent grid search (`grid_search` must still be the LR search).
lr = LogisticRegression(max_iter=1000, **grid_search.best_params_).fit(X_train, y_train)

# Re-evaluate on the held-out split
y_pred = lr.predict(X_test)

# Header and report on consecutive lines, followed by the overall accuracy
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Baseline Gaussian Naive Bayes, reusing X_train, X_test, y_train, y_test
# from the split performed earlier in the notebook.
gnb = GaussianNB().fit(X_train, y_train)

# Predict the held-out split
y_pred = gnb.predict(X_test)

# Compute both metrics first, then print them
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{report}")


In [None]:
# Search space: 20 log-spaced var_smoothing values from 1e0 down to 1e-9
gnb_param_grid = {'var_smoothing': np.logspace(0, -9, num=20)}

# 5-fold shuffled cross-validation with a fixed seed, scored on accuracy
cv = KFold(n_splits=5, shuffle=True, random_state=42)
gnb = GaussianNB()
grid_search = GridSearchCV(estimator=gnb, param_grid=gnb_param_grid, scoring='accuracy', cv=cv)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

In [None]:
# Refit Gaussian Naive Bayes with the settings selected by the most recent
# grid search (`grid_search` must still be the GNB search).
gnb = GaussianNB(**grid_search.best_params_).fit(X_train, y_train)

# Re-evaluate on the held-out split
y_pred = gnb.predict(X_test)

# Header and report on consecutive lines, followed by the overall accuracy
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default settings; fit() returns the
# estimator, so construction and training are chained.
svc_model = SVC().fit(X_train, y_train)

# Predict the held-out split
y_pred = svc_model.predict(X_test)

# Header and report on consecutive lines, followed by the overall accuracy
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
# Candidate values for the SVC's C and gamma parameters
svc_param_grid = {'C': [0.01, 1, 10, 20], 'gamma': [0.01, 0.1]}

# 5-fold shuffled cross-validation with a fixed seed, scored on accuracy
cv = KFold(n_splits=5, shuffle=True, random_state=42)
svc_model = SVC()
grid_search = GridSearchCV(estimator=svc_model, param_grid=svc_param_grid, scoring='accuracy', cv=cv)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


# Output recorded from an earlier run of this cell:
# Best hyperparameters: {'C': 20, 'gamma': 0.01}
# Best cross-validation score: 0.7310000000000001

In [None]:
# Refit the SVC with the settings selected by the most recent grid search
# (`grid_search` must still be the SVC search).
svc_model = SVC(**grid_search.best_params_).fit(X_train, y_train)

# Re-evaluate on the held-out split
y_pred = svc_model.predict(X_test)

# Header and report on consecutive lines, followed by the overall accuracy
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline k-nearest-neighbours classifier with default settings; fit()
# returns the estimator, so construction and training are chained.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Predict the held-out split
y_pred = knn.predict(X_test)

# Header and report on consecutive lines, followed by the overall accuracy
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
# Search space: odd neighbourhood sizes 3..17, both weighting schemes,
# and p in {1, 2}
knn_param_grid = {
    'n_neighbors': list(range(3, 18, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# 5-fold shuffled cross-validation with a fixed seed, scored on accuracy
cv = KFold(n_splits=5, shuffle=True, random_state=42)
knn = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn, param_grid=knn_param_grid, scoring='accuracy', cv=cv)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

In [None]:
# Refit k-NN with the settings selected by the most recent grid search
# (`grid_search` must still be the k-NN search).
knn = KNeighborsClassifier(**grid_search.best_params_).fit(X_train, y_train)

# Re-evaluate on the held-out split
y_pred = knn.predict(X_test)

# Header and report on consecutive lines, followed by the overall accuracy
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred))

BERT Modeling and Hypertuning
----

In [None]:
# Read the header-less raw CSV, keep only the label and text columns, drop rows
# with missing values and renumber the index — expressed as one method chain.
# NOTE: this rebinds `data`, which earlier held the embeddings table.
data = (
    pd.read_csv(
        '../data.csv',
        encoding='utf-8',
        header=None,
        names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    )
    .drop(columns=['ids', 'date', 'flag', 'user'])
    .dropna()
    .reset_index(drop=True)
)

# Shuffle reproducibly and renumber the index once more
data = shuffle(data, random_state=42).reset_index(drop=True)

# `df` is an alias for the same frame; the BERT cells below use this name
df = data

In [None]:
# Convert 'target' to binary sentiment labels: 0 stays 0, every other value
# becomes 1. A vectorised comparison replaces the per-row Python lambda.
df['target'] = (df['target'] != 0).astype('int64')

# Split data into training and testing sets, stratified on the label so both
# splits keep the same class balance (consistent with the embeddings split
# used for the standard models earlier in the notebook).
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['target']
)

# Convert data to TensorFlow datasets of (text, label) pairs
train_data = tf.data.Dataset.from_tensor_slices((train_df['text'].values, train_df['target'].values))
test_data = tf.data.Dataset.from_tensor_slices((test_df['text'].values, test_df['target'].values))


In [None]:
def build_model():
    """Build the BERT-based binary sentiment classifier (uncompiled).

    A scalar string input named ``text`` is passed through ``bert_preprocessor``
    and ``bert_encoder``. The encoder's ``pooled_output`` then goes through a
    dropout of 0.1, three ReLU dense layers (128, 32 and 16 units, each followed
    by a dropout of 0.2) and a single sigmoid unit named ``classifier``.

    NOTE(review): ``bert_preprocessor`` and ``bert_encoder`` are read from the
    notebook's global scope but are not defined in any visible cell —
    presumably ``hub.KerasLayer`` objects; confirm they are created before this
    cell runs, otherwise it raises NameError on a fresh kernel.

    Returns:
        tf.keras.Model: model mapping a batch of raw strings to one sigmoid
        output per example.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessed_text = bert_preprocessor(text_input)
    outputs = bert_encoder(preprocessed_text)

    # Extract the pooled output from the BERT encoder
    net = outputs['pooled_output']

    # Add dropout for regularization
    net = tf.keras.layers.Dropout(0.1)(net)

    # Dense head: each ReLU layer is followed by its own dropout layer
    for units in (128, 32, 16):
        net = tf.keras.layers.Dense(units, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)

    # Final output layer with sigmoid activation for binary classification
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)

    return tf.keras.Model(inputs=[text_input], outputs=[net])

# Initialize and build the model
model = build_model()

In [None]:
# Configure training for binary classification: binary cross-entropy loss,
# accuracy as the reported metric, Adam with a learning rate of 0.01.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
)


In [None]:
# Original shape
original_shape = df.shape
print("Original DataFrame shape:", original_shape)

# Calculate the new size (1/100 of the original)
new_size = original_shape[0] // 100  # integer division to get the whole number
print("New size for training data:", new_size)

# Randomly sample the training data
smaller_train_df = train_df.sample(n=new_size, random_state=42)

# Check the shape of the new training dataset
print("Smaller Training DataFrame shape:", smaller_train_df.shape)

smaller_train_data = tf.data.Dataset.from_tensor_slices((smaller_train_df['text'].values, smaller_train_df['target'].values))

smaller_test_df = test_df.sample(n=new_size, random_state=42)

# Check the shape of the new training dataset
print("Smaller Training DataFrame shape:", smaller_test_df.shape)

smaller_test_data = tf.data.Dataset.from_tensor_slices((smaller_test_df['text'].values, smaller_test_df['target'].values))

In [None]:
BATCH_SIZE = 32

# Batch and prefetch the subsampled datasets. These rebind `train_data` and
# `test_data`, replacing the full-size datasets created earlier.
train_data = smaller_train_data.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_data = smaller_test_data.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Train for 5 epochs with ops placed on the first GPU
with tf.device('/GPU:0'):
    model.fit(x=train_data, epochs=5)

# Evaluate on the subsampled test set
loss, accuracy = model.evaluate(x=test_data)
print(f"Test Accuracy: {accuracy:.2f}")

# Save without optimizer state, then reload. hub.KerasLayer is supplied as a
# custom object so the TF-Hub layers can be deserialised.
model.save(filepath='Test_79', include_optimizer=False)
# Load the model
loaded_model = tf.keras.models.load_model(
    filepath='Test_79',
    custom_objects={'KerasLayer': hub.KerasLayer},
)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import keras_tuner as kt

# Hypermodel definition
def build_hypermodel(hp):
    """Build and compile a BERT classifier whose dense head is tunable.

    Hyperparameters registered on ``hp``:
      * ``num_layers``    – number of dense layers in the head (1 to 3)
      * ``units_{i}``     – width of dense layer ``i`` (32, 64 or 128)
      * ``dropout_{i}``   – dropout after dense layer ``i`` (0.1 to 0.5, step 0.1)
      * ``learning_rate`` – Adam learning rate (1e-5, 3e-5 or 1e-4)

    NOTE(review): ``bert_preprocessor`` and ``bert_encoder`` are read from the
    notebook's global scope but are not defined in any visible cell — confirm
    they are created before the tuner calls this function.

    Args:
        hp: the hyperparameter container supplied by the KerasTuner tuner.

    Returns:
        tf.keras.Model: compiled model (binary cross-entropy, accuracy metric).
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessed_text = bert_preprocessor(text_input)
    outputs = bert_encoder(preprocessed_text)

    # BERT pooled output (the debug print that ran once per trial was removed)
    net = outputs['pooled_output']

    # Add dense layers with hyperparameter tuning
    for i in range(hp.Int("num_layers", 1, 3)):  # Tune 1 to 3 additional layers
        net = tf.keras.layers.Dense(
            units=hp.Choice(f"units_{i}", [32, 64, 128]),  # Tune size per layer
            activation='relu'
        )(net)
        net = tf.keras.layers.Dropout(hp.Float(f"dropout_{i}", 0.1, 0.5, step=0.1))(net)

    # Output layer
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)

    # Compile the model
    model = tf.keras.Model(inputs=[text_input], outputs=[net])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Choice("learning_rate", [1e-5, 3e-5, 1e-4])
        ),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Hyperparameter tuning setup
#
# The tuner previously optimised training 'accuracy' with no validation data,
# which rewards whichever configuration memorises the training set best.
# Roughly 20% of the training batches are now held out and the objective is
# 'val_accuracy'. `train_data` is built without shuffling, so take()/skip()
# give the same disjoint partition on every epoch. The test set is deliberately
# not used here, so it stays untouched for the final evaluation.
num_train_batches = int(train_data.cardinality().numpy())
if num_train_batches < 2:
    # Also catches unknown/infinite cardinality, which is reported as a negative value
    raise ValueError("train_data needs at least 2 batches to carve out a validation split")
num_val_batches = max(1, num_train_batches // 5)
tuner_val_data = train_data.take(num_val_batches)
tuner_train_data = train_data.skip(num_val_batches)

# NOTE(review): results already stored under 'hyperband_dir' were produced with
# the old objective — presumably they should be removed (or `overwrite=True`
# passed) before re-running; confirm.
tuner = kt.Hyperband(
    build_hypermodel,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='hyperband_dir',
    project_name='text_sentiment_analysis'
)

# Run the search
# NOTE(review): Hyperband schedules the epochs of each trial itself, so
# `epochs=5` is presumably overridden — kept as in the original; confirm.
tuner.search(tuner_train_data, validation_data=tuner_val_data, epochs=5)

# Retrieve best model
best_model = tuner.get_best_models(num_models=1)[0]

# Summary of the best model
best_model.summary()


# Trial Information

## Trial ID
- **ID:** 0020

## Hyperparameters
- **Hyperparameter Space:**
  - `num_layers`: 
    - Type: Integer
    - Range: [1, 3]
  - `units_0`: 
    - Type: Choice
    - Options: [32, 64, 128]
    - Default: 32
  - `dropout_0`: 
    - Type: Float
    - Range: [0.1, 0.5]
    - Default: 0.1
  - `learning_rate`: 
    - Type: Choice
    - Options: [1e-05, 3e-05, 0.0001]
    - Default: 1e-05
  - `units_1`: 
    - Type: Choice
    - Options: [32, 64, 128]
    - Default: 32
  - `dropout_1`: 
    - Type: Float
    - Range: [0.1, 0.5]
    - Default: 0.1
  - `units_2`: 
    - Type: Choice
    - Options: [32, 64, 128]
    - Default: 32
  - `dropout_2`: 
    - Type: Float
    - Range: [0.1, 0.5]
    - Default: 0.1

- **Selected Values:**
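  - `num_layers`: 1 (so only `units_0` and `dropout_0` are used by this model; the `units_1`, `dropout_1`, `units_2` and `dropout_2` values below are recorded by the tuner but inactive)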
  - `units_0`: 128
  - `dropout_0`: 0.2
  - `learning_rate`: 0.0001
  - `units_1`: 128
  - `dropout_1`: 0.2
  - `units_2`: 64
  - `dropout_2`: 0.4

## Tuner Information
- **Epochs:** 10
- **Initial Epoch:** 4
- **Bracket:** 1
- **Round:** 1
- **Parent Trial ID (`tuner/trial_id`):** 0017 (the earlier Hyperband trial that this trial, 0020, resumed from)

## Metrics
- **Loss:**
  - Direction: Minimize
  - Observations: 
    - Value: 0.3865 at Step 5
- **Accuracy:**
  - Direction: Maximize
  - Observations: 
    - Value: 0.8277 at Step 5

## Score
- **Best Score:** 0.8277
- **Best Step:** 5
- **Status:** Completed
- **Message:** None


LSTM Modeling and Hypertuning
----

In [None]:
# Load the saved embedding matrix from CSV.
loaded_embedding_matrix = np.loadtxt('embedding_matrix_lstm.csv', delimiter=',')

# The LSTM cells below pass `embedding_matrix` to their Embedding layers, but no
# visible cell defines that name. Alias the loaded matrix so that a fresh
# Restart & Run All does not fail with a NameError.
# NOTE(review): assumes this CSV holds the matrix those cells expect — confirm.
embedding_matrix = loaded_embedding_matrix

Without Hypertuning

In [None]:
# Bidirectional-LSTM sentiment classifier with self-attention.
#
# Built with the functional API. In the previous Sequential version the
# attention tensor was computed from `lstm_model.output` but never added to the
# model, so the trained network had no attention layer at all. `Attention` was
# also not imported until a later cell; it is referenced through `tf.keras`
# here so this cell runs on a fresh kernel.
# NOTE(review): `vocab_length`, `embedding_matrix` and `maxlen` are read from
# the notebook's global scope and are not defined in this cell — confirm they
# exist before running.

# Integer token ids, `maxlen` per example
sequence_input = tf.keras.Input(shape=(maxlen,), dtype='int32')

# Embedding layer initialised from the pre-trained matrix and kept frozen
# (trainable=False, i.e. NOT fine-tuned during training)
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
net = embedding_layer(sequence_input)

# Bidirectional LSTM Layer (captures context from both ends of the sentence);
# return_sequences=True keeps one vector per time step for the attention layer
net = Bidirectional(LSTM(128, return_sequences=True))(net)

# Attention Layer (self-attention): the same tensor is passed as query and value
attention_output = tf.keras.layers.Attention()([net, net])

# Global Average Pooling to reduce the output dimensionality
net = GlobalAveragePooling1D()(attention_output)

# Dropout Layer (regularization to prevent overfitting)
net = Dropout(0.5)(net)

# Dense head and final sigmoid classification layer
net = Dense(64, activation='relu')(net)
net = Dropout(0.5)(net)
net = Dense(1, activation='sigmoid')(net)

lstm_model = tf.keras.Model(inputs=sequence_input, outputs=net)
lstm_model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
lstm_model.summary()

In [None]:
# Train for 6 epochs in batches of 128; the last 20% of the training arrays
# is held out by Keras as the validation split.
# NOTE(review): `X_train` / `y_train` here must be the inputs the LSTM expects,
# not the embedding features used by the standard models — no visible cell
# creates them; confirm.
lstm_model_history = lstm_model.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Plot training vs. validation curves for each tracked metric.
# The 'val_*' series come from the `validation_split` passed to fit(), not from
# the held-out test set, so they are labelled 'validation' instead of 'test'.
for metric in ('accuracy', 'loss'):
    plt.plot(lstm_model_history.history[metric])
    plt.plot(lstm_model_history.history[f'val_{metric}'])

    plt.title(f'model {metric}')
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

Hypertuning

In [None]:
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Attention, GlobalAveragePooling1D, Dropout, Dense
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    """Build and compile a tunable BiLSTM + self-attention classifier.

    Uses the functional API so the attention layer is actually part of the
    model. The previous Sequential version computed the attention tensor but
    never added it to the network.

    Hyperparameters registered on ``hp`` (in this order):
      * ``lstm_units``     – LSTM units per direction (128 to 240, step 32)
      * ``dropout_rate``   – dropout after pooling (0.2 to 0.7, step 0.1)
      * ``dense_units``    – width of the dense layer (64 to 180, step 32)
      * ``dropout_rate_2`` – dropout after the dense layer (0.2 to 0.7, step 0.1)
      * ``learning_rate``  – Adam learning rate, log-sampled in [1e-5, 1e-2]

    NOTE(review): ``vocab_length``, ``embedding_matrix`` and ``maxlen`` are read
    from the notebook's global scope — confirm they are defined before tuning.
    NOTE(review): this rebinds the name ``build_model`` used by the BERT section
    earlier in the notebook.

    Args:
        hp: the hyperparameter container supplied by the KerasTuner tuner.

    Returns:
        A compiled Keras model (binary cross-entropy loss, accuracy metric).
    """
    # Integer token ids, `maxlen` per example
    sequence_input = tf.keras.Input(shape=(maxlen,), dtype='int32')

    # Embedding layer initialised from the pre-trained matrix and kept frozen
    embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
    net = embedding_layer(sequence_input)

    # Bidirectional LSTM Layer (captures context from both ends of the sentence)
    net = Bidirectional(LSTM(
        units=hp.Int('lstm_units', min_value=128, max_value=240, step=32),
        return_sequences=True))(net)

    # Attention Layer (self-attention): same tensor as query and value
    net = Attention()([net, net])

    # Global Average Pooling to reduce the output dimensionality
    net = GlobalAveragePooling1D()(net)

    # Dropout Layer (regularization to prevent overfitting)
    net = Dropout(rate=hp.Float('dropout_rate', min_value=0.2, max_value=0.7, step=0.1))(net)

    # Dense head and final sigmoid classification layer
    net = Dense(
        units=hp.Int('dense_units', min_value=64, max_value=180, step=32),
        activation='relu')(net)
    net = Dropout(rate=hp.Float('dropout_rate_2', min_value=0.2, max_value=0.7, step=0.1))(net)
    net = Dense(1, activation='sigmoid')(net)

    lstm_model = tf.keras.Model(inputs=sequence_input, outputs=net)

    # Compile the model with an optimizer
    lstm_model.compile(
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='LOG')),
        loss='binary_crossentropy',  # Use binary crossentropy for binary classification tasks
        metrics=['accuracy']
    )

    return lstm_model

# Define the tuner
# The objective is validation accuracy. Optimising training 'accuracy' (as
# before) selects whichever configuration fits the training data best, not the
# one that generalises best.
# NOTE(review): results already stored under 'my_dir_2' were produced with the
# old objective — presumably they should be removed (or `overwrite=True`
# passed) before re-running; confirm.
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of trials to run
    executions_per_trial=1,  # Run each trial once
    directory='my_dir_2',  # Save results here
    project_name='lstm_bayesian_tuning_2'
)

# Perform the hyperparameter search. `validation_split` is forwarded to
# model.fit() and holds out 20% of the training arrays, matching the split used
# when training the untuned LSTM.
# NOTE(review): assumes X_train / y_train are in-memory arrays — confirm.
tuner.search(X_train, y_train, epochs=2, validation_split=0.2)

# Get the best hyperparameters
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters: {best_hp.values}")


In [None]:
# Take the top-ranked model from the tuner. This rebinds `best_model`,
# which earlier referred to the best BERT model.
best_model = tuner.get_best_models(num_models=1)[0]

# Persist it to a single HDF5 file
best_model.save(filepath='best_lstm_model_new.h5')

Best Hyperparameters: {'lstm_units': 192, 'dropout_rate': 0.5, 'dense_units': 128, 'dropout_rate_2': 0.30000000000000004, 'learning_rate': 0.0010872353209015178}