In [1]:
import os

# TensorFlow's native logger picks up TF_CPP_MIN_LOG_LEVEL while the package
# is being imported, so the variable has to be in the environment before the
# import below; setting it afterwards does not silence anything.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

# Hide every GPU from TensorFlow so the rest of the notebook runs on the CPU.
tf.config.set_visible_devices([], 'GPU')

In [2]:
import os
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

In [3]:
# Read the pre-cleaned tweet corpus into a DataFrame and preview the first rows.
DATASET_PATH = '../datasets/cleaned_datasets/cleaned_twitter_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,text,target
0,"['upset', 'cant', 'update', 'facebook', 'texti...",0
1,"['kenichan', 'dived', 'many', 'time', 'ball', ...",0
2,"['whole', 'body', 'feel', 'itchy', 'like', 'fi...",0
3,"['nationwideclass', 'behaving', 'im', 'mad', '...",0
4,"['kwesidei', 'whole', 'crew']",0


In [4]:
# Keep only the rows where both the text and its label are present,
# then report the resulting (rows, columns) shape.
required_columns = ['text', 'target']
df = df.dropna(axis=0, how='any', subset=required_columns)
df.shape

(1599999, 2)

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
df_train, df_test = train_test_split(df, **split_options)

In [9]:
# Tokenize the text data.
#
# Each `text` value is a stringified Python list, e.g. "['upset', 'cant', ...]".
# Keras' default `filters` remove the brackets and commas but intentionally
# keep the apostrophe, so every vocabulary entry would be stored wrapped in
# quotes ('upset' rather than upset). Plain sentences passed to
# `texts_to_sequences` later on could then never match the vocabulary.
# Adding the apostrophe to the filtered characters yields bare-word tokens.
TOKEN_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + "'"

# Only the 200000 most frequent words are kept when texts are converted to
# sequences; the tokenizer is fitted on the training split only.
tokenizer = Tokenizer(num_words=200000, filters=TOKEN_FILTERS, split=' ')
tokenizer.fit_on_texts(df_train['text'].values)

In [10]:
# The common sequence length is the largest number of whitespace-separated
# pieces found in any training text.
max_len = max(len(text.split()) for text in df_train['text'].values)


def _to_padded(texts):
    """Map texts to word-index sequences and pad them to `max_len`."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)


# Both splits go through the same tokenizer and the same target length.
x_train = _to_padded(df_train['text'].values)
x_test = _to_padded(df_test['text'].values)

In [11]:
# Labels for each split, taken from the `target` column of the matching frame.
y_train, y_test = df_train['target'], df_test['target']

In [12]:
# Model: word embedding -> single LSTM layer -> one sigmoid output unit.
# `input_dim` matches the tokenizer's `num_words` (200000) and
# `input_length` matches the padded sequence length.
model = Sequential([
    Embedding(input_dim=200000, output_dim=128, input_length=max_len),
    LSTM(256, dropout=0.3, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy on the sigmoid output, optimised with Adam;
# accuracy is reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Train the model.
# NOTE(review): TensorFlow has already been imported by the time these
# variables are set - confirm they still take effect at this point.
os.environ.update({
    'TF_METAL_DEVICE_PLACEMENT': 'metal:0',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

# Each epoch is capped at 3600 batches of 32 samples (115,200 samples)
# instead of a full pass over the training data.
batch_size = 32
steps_per_epoch = 3600
num_epochs = 15

# 10% of the training arrays are held out for validation.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    steps_per_epoch=steps_per_epoch,
    validation_split=0.1,
)

Epoch 1/15
 334/3600 [=>............................] - ETA: 14:45 - loss: 0.5940 - accuracy: 0.6749

KeyboardInterrupt: 

In [1]:
# Persist the trained model (architecture + weights) as an HDF5 file.
# Requires `model` to exist in the current kernel session.
# NOTE(review): the file name says "e10" while the training cell uses
# 15 epochs - confirm the intended name.
MODEL_SAVE_PATH = 'final_models/lstm_e10_b32_u256.h5'
model.save(MODEL_SAVE_PATH)

NameError: name 'model' is not defined

In [None]:
# Score the model on the held-out test split without progress output.
eval_results = model.evaluate(x_test, y_test, verbose=0)
loss, accuracy = eval_results

# Both numbers are printed multiplied by 100.
# NOTE(review): the loss is scaled like a percentage as well - confirm intended.
print('Accuracy: %f' % (100 * accuracy))
print('Loss: %f' % (100 * loss))

In [None]:
# Restore a previously saved model from disk, replacing `model`.
# NOTE(review): this path differs from the one written by the save cell
# (lstm_e10_b32_u256.h5) - confirm which checkpoint is intended.
from keras.models import load_model

SAVED_MODEL_PATH = 'final_models/lstm_e10_b128_p20.h5'
model = load_model(SAVED_MODEL_PATH)

# Run the restored model over the padded test sequences.
y_pred = model.predict(x_test)

In [None]:
import matplotlib.pyplot as plt

# Compare predictions with the true labels.
#
# Line-plotting the raw series is unreadable here: `y_test` is a pandas Series
# that keeps its shuffled row index (so it is drawn against that index), while
# `y_pred` is an (n, 1) array drawn against 0..n-1. Instead, show how the
# predicted probabilities are distributed for each actual class.
actual = y_test.to_numpy()
predicted = y_pred.ravel()

fig, ax = plt.subplots()
for label in sorted(y_test.unique()):
    ax.hist(
        predicted[actual == label],
        bins=50,
        range=(0, 1),
        alpha=0.5,
        label=f'actual = {label}',
    )
ax.set(
    title='Predicted probability by actual class',
    xlabel='predicted probability',
    ylabel='number of test samples',
)
ax.legend()
plt.show()

In [None]:
# Make predictions on raw, unseen sentences.
# NOTE(review): the training texts were pre-cleaned before tokenization;
# these sentences presumably need the same cleaning - TODO confirm.
new_sentences = ['I am feeling sad today', 'Today is a great day']
new_sequences = tokenizer.texts_to_sequences(new_sentences)

# Pad to the sequence length the model was built with instead of to the
# longest sentence in this small batch, so inference inputs have the same
# width as the training inputs. If the model does not fix a length
# (`input_shape[1]` is None), `pad_sequences` falls back to the longest
# sequence, as before.
new_sequences = pad_sequences(new_sequences, maxlen=model.input_shape[1])
predictions = model.predict(new_sequences)