In [None]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
import tensorflow as tf
import numpy as np

# Report how many GPU devices TensorFlow can see in this environment
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Load the dataset. The CSV has no header row, so column names are assigned here.
SAMPLES_PER_CLASS = 66666  # number of rows drawn from each sentiment class

raw_df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
raw_df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Keep only the tweet text and its label. The explicit .copy() gives an
# independent frame, so the column assignment below cannot trigger pandas'
# SettingWithCopyWarning.
df = raw_df[['text', 'target']].copy()

# Map target to two classes: 0 -> 0 (negative), 4 -> 1 (positive).
# Any other raw label maps to NaN and is dropped by the class filters below.
df['target'] = df['target'].map({0: 0, 4: 1})

# Sample 66,666 instances from each class: 2 x 66,666 = 133,332 rows in total.
# .sample raises ValueError if a class has fewer rows than requested.
neg_df = df[df['target'] == 0].sample(n=SAMPLES_PER_CLASS, random_state=42)
pos_df = df[df['target'] == 1].sample(n=SAMPLES_PER_CLASS, random_state=42)

# Combine the two class samples, then shuffle so the classes are interleaved
df_sampled = pd.concat([neg_df, pos_df])
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

In [None]:
# Clean the text data
def clean_text(text):
    """Normalize a raw tweet into lower-case, space-separated word characters.

    Steps, in order: strip @mentions, drop '#' symbols, drop standalone
    retweet markers ("RT "), strip http(s) links, replace every remaining
    non-word character with a space, and lower-case the result.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as an empty tweet.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    # Missing values (None / NaN) become an empty tweet rather than the
    # literal tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Coerce up front so every re.sub below receives a str.
    text = str(text)
    # \w covers underscores, which are legal in handles; the narrower
    # [A-Za-z0-9] class left "_name" fragments behind.
    text = re.sub(r'@\w+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    # \b keeps "RT" from matching inside words ("ALERT now" stays intact).
    text = re.sub(r'\bRT\s+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyper link
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    return text.lower()  # Convert to lower case

df['text'] = df['text'].apply(clean_text)

# Raw inputs and labels. The labels are used as-is (0/1 integers for the
# sigmoid output); no one-hot encoding is applied.
texts = df['text'].values
Y = df['target'].values

# Train-test split on the raw texts, done BEFORE fitting the tokenizer so the
# vocabulary is learned from training rows only (no test-set leakage).
texts_train, texts_test, Y_train, Y_test = train_test_split(
    texts, Y, test_size=0.2, random_state=42
)

# Tokenize: keep the 5000 most frequent training words
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(texts_train)

# Convert texts to integer sequences and pad them to a fixed length of 100
X_train = pad_sequences(tokenizer.texts_to_sequences(texts_train), maxlen=100)
X_test = pad_sequences(tokenizer.texts_to_sequences(texts_test), maxlen=100)

# Full encoded matrix, row-aligned with Y, kept for cells that index into X / Y
X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=100)

In [None]:
# Using pre-trained embeddings (commented, assuming embeddings are not provided)
# embedding_matrix = ...  # Load pre-trained embeddings
# embedding_layer = Embedding(input_dim=5000, output_dim=128, weights=[embedding_matrix], trainable=False)

# Model definition: embedding -> spatial dropout -> two stacked bidirectional
# LSTMs -> single sigmoid unit for binary classification.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))  # Use embedding_layer if using pre-trained embeddings
model.add(SpatialDropout1D(0.5))
# NOTE(review): per the Keras LSTM docs, activation='relu' and
# recurrent_dropout > 0 both rule out the cuDNN kernel, so these layers run on
# the generic implementation even on GPU; relu in a recurrent cell can also be
# numerically unstable. Left unchanged here -- confirm this is intended.
model.add(Bidirectional(LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(Bidirectional(LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=l2(0.01))))
model.add(Dense(1, activation='sigmoid'))

# Compile the model.
# The `decay` keyword is rejected by the current Keras optimizers (ValueError
# on TF 2.11-2.15, ignored with a warning on Keras 3). InverseTimeDecay with
# decay_steps=1 yields the same schedule: lr = 0.0001 / (1 + 1e-6 * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.0001,
    decay_steps=1,
    decay_rate=1e-6,
)
optimizer = Adam(learning_rate=lr_schedule)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping: stop when val_loss has not improved for 5 epochs and roll
# back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Model summary. The Sequential model has no declared input shape, so build it
# first (sequences are padded to length 100); otherwise summary() has no
# weights to report.
model.build(input_shape=(None, 100))
model.summary()

In [None]:
# Cross-validation (example)
# Folds are drawn from the training partition only, so the held-out test rows
# stay unseen. Each fold trains a freshly initialised copy of `model`, so no
# weights carry over between folds and `model` itself is left untrained.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_val_losses = []
for fold, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
    X_train_k, X_val_k = X_train[train_index], X_train[val_index]
    Y_train_k, Y_val_k = Y_train[train_index], Y_train[val_index]

    # clone_model rebuilds the architecture with newly initialised weights;
    # the optimizer is re-created from the original's config because an
    # optimizer instance must not be shared between models.
    fold_model = tf.keras.models.clone_model(model)
    fold_optimizer = model.optimizer.__class__.from_config(model.optimizer.get_config())
    fold_model.compile(loss='binary_crossentropy', optimizer=fold_optimizer, metrics=['accuracy'])

    history = fold_model.fit(X_train_k, Y_train_k, epochs=8, batch_size=10, validation_data=(X_val_k, Y_val_k), callbacks=[early_stopping], verbose=2)
    cv_val_losses.append(min(history.history['val_loss']))
    print(f"Fold {fold}: best val_loss = {cv_val_losses[-1]:.4f}")

print(f"Mean best val_loss over {len(cv_val_losses)} folds: {np.mean(cv_val_losses):.4f}")


In [None]:

# Final training on the training partition.
# Early stopping monitors a validation slice carved out of the training data
# (the last 10% of X_train) instead of the test set, so the test set plays no
# part in choosing the stopping epoch or the restored weights.
history = model.fit(X_train, Y_train, epochs=8, batch_size=10, validation_split=0.1, callbacks=[early_stopping], verbose=2)

# Evaluate once on the held-out test set; evaluate() returns [loss, accuracy]
# because the model was compiled with metrics=['accuracy'].
test_loss, test_accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - test accuracy: {test_accuracy:.4f}")

# Feature engineering (commented, as an example)
# additional_features = ...
# X_combined = np.concatenate([X, additional_features], axis=1)