In [2]:

# Step 3: Load the File
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Verify TensorFlow is using the GPU
print("Num GPUs Available: ", tf.config.list_physical_devices('GPU'))


Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Load the dataset
df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]

# Map target to three classes: negative, neutral, positive
df['target'] = df['target'].map({0: 0, 4: 1})

In [4]:
# Sample 66,666 instances from each class to get a total of 200,000 samples
neg_df = df[df['target'] == 0].sample(n=66666, random_state=42)
neu_df = df[df['target'] == 1].sample(n=66666, random_state=42)
# Combine the sampled data
df_sampled = pd.concat([neg_df, neu_df])

# Shuffle the combined DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

In [5]:
# Clean the text data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyper link
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

df['text'] = df['text'].apply(clean_text)

In [6]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=100)

# One-hot encode the target
Y = df['target'].values

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [7]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=250))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(100, activation='relu', dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(2, activation='sigmoid'))  # 1 unit for binary classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Set batch size and epochs for training
batch_size = 64
epochs = 5



In [56]:

model.summary()

In [59]:
"""
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))  # 3 units for 3 classes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Set batch size and epochs for training
batch_size = 64
epochs = 5
"""
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), verbose=2)

Epoch 1/5




KeyboardInterrupt: 

In [58]:
"""
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(2, activation='softmax'))  # 3 units for 3 classes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Set batch size and epochs for training
batch_size = 64
epochs = 5
"""
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), verbose=2)

Epoch 1/5
1667/1667 - 349s - 210ms/step - accuracy: 0.8426 - loss: 0.3474 - val_accuracy: 0.7959 - val_loss: 0.4628
Epoch 2/5
1667/1667 - 372s - 223ms/step - accuracy: 0.8511 - loss: 0.3313 - val_accuracy: 0.7908 - val_loss: 0.4895
Epoch 3/5
1667/1667 - 364s - 218ms/step - accuracy: 0.8609 - loss: 0.3141 - val_accuracy: 0.7875 - val_loss: 0.5037
Epoch 4/5
1667/1667 - 366s - 220ms/step - accuracy: 0.8675 - loss: 0.2984 - val_accuracy: 0.7857 - val_loss: 0.5233
Epoch 5/5
1667/1667 - 371s - 222ms/step - accuracy: 0.8748 - loss: 0.2840 - val_accuracy: 0.7847 - val_loss: 0.5436


In [None]:
# Save the entire model
model.save('model.h5')

# Evaluate the model
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print("Test score:", score)
print("Test accuracy:", acc)

# Load the model back
model = tf.keras.models.load_model('model.h5')

In [60]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(100, activation='relu', dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(2, activation='sigmoid'))  # 1 unit for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Set batch size and epochs for training
batch_size = 64
epochs = 5
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), verbose=2)

Epoch 1/5
1667/1667 - 390s - 234ms/step - accuracy: 0.5156 - loss: nan - val_accuracy: 0.4987 - val_loss: nan
Epoch 2/5
1667/1667 - 370s - 222ms/step - accuracy: 0.5003 - loss: nan - val_accuracy: 0.4987 - val_loss: nan
Epoch 3/5


KeyboardInterrupt: 

In [63]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.5))
model.add(Bidirectional(LSTM(100, activation='relu', dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(Bidirectional(LSTM(100, activation='relu', dropout=0.5, recurrent_dropout=0.5)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Model summary
model.summary()

# Train the model
model.fit(X_train, Y_train, epochs=128, batch_size=10, validation_split=0.2, callbacks=[early_stopping])

NameError: name 'Bidirectional' is not defined

In [64]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(100, activation='relu', dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid'))  # 1 unit for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Set batch size and epochs for training
batch_size = 10
epochs = 128
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), verbose=2)

Epoch 1/128
10667/10667 - 278s - 26ms/step - accuracy: 0.6754 - loss: 972390080.0000 - val_accuracy: 0.7430 - val_loss: 0.5665
Epoch 2/128
10667/10667 - 270s - 25ms/step - accuracy: 0.7364 - loss: 33044895744.0000 - val_accuracy: 0.7582 - val_loss: 0.5281
Epoch 3/128
10667/10667 - 282s - 26ms/step - accuracy: 0.7227 - loss: 8763860992.0000 - val_accuracy: 0.7519 - val_loss: 0.5272
Epoch 4/128
10667/10667 - 266s - 25ms/step - accuracy: 0.6828 - loss: 6480850432.0000 - val_accuracy: 0.5013 - val_loss: 11201.3389
Epoch 5/128
10667/10667 - 266s - 25ms/step - accuracy: 0.5219 - loss: nan - val_accuracy: 0.4987 - val_loss: nan
Epoch 6/128
10667/10667 - 271s - 25ms/step - accuracy: 0.5003 - loss: nan - val_accuracy: 0.4987 - val_loss: nan
Epoch 7/128
10667/10667 - 268s - 25ms/step - accuracy: 0.5003 - loss: nan - val_accuracy: 0.4987 - val_loss: nan
Epoch 8/128
10667/10667 - 269s - 25ms/step - accuracy: 0.5003 - loss: nan - val_accuracy: 0.4987 - val_loss: nan
Epoch 9/128
10667/10667 - 366s -

KeyboardInterrupt: 

In [8]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(100, activation='relu', dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid'))  # 1 unit for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Set batch size and epochs for training
batch_size = 10
epochs = 128
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), verbose=2)

Epoch 1/128


KeyboardInterrupt: 