In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the preprocessed data from the CSV file
data = pd.read_csv('/content/drive/MyDrive/SWM/preprocessed_dataset.csv')

# Sample a smaller subset of the data for training
train_data = data.sample(n=100000, random_state=42)

# Split the data into testing dataset
test_data = data[100000:]


In [4]:
# Convert the "text" column to a string type
train_data['text'] = train_data['text'].astype(str)

# Drop any rows with missing values
train_data.dropna(inplace=True)

# Convert the "text" column to a string type
test_data['text'] = test_data['text'].astype(str)

# Drop any rows with missing values
test_data.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['text'] = test_data['text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.dropna(inplace=True)


In [5]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['text'])

# Convert the text data into sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])


In [6]:
# Pad the sequences to make them of equal length
max_len = 100
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

In [7]:
# Define the RNN model
model = Sequential([
    Embedding(input_dim=5000, output_dim=32, input_length=max_len),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid')
])

In [8]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
# Fit the model to the training data
history = model.fit(train_padded_sequences, train_data['target'], epochs=2, batch_size=32, validation_split=0.1)

Epoch 1/2
Epoch 2/2


In [10]:
# Evaluate the model on the testing data
loss, accuracy = model.evaluate(test_padded_sequences, test_data['target'])
print(f'Test loss: {loss:.3f}, test accuracy: {accuracy:.3f}')

Test loss: -385.364, test accuracy: 0.000


In [19]:
# Define a sample test Twitter text
test_text = "I am really happy today!"

# Tokenize the test Twitter text
test_sequence = tokenizer.texts_to_sequences([test_text])

# Pad the sequence to make it of equal length
test_padded_sequence = pad_sequences(test_sequence, maxlen=max_len, padding='post', truncating='post')


In [20]:
# Use the trained model to predict the sentiment score of the test Twitter text
score = model.predict(test_padded_sequence)

# Print the predicted sentiment score
print(f'Predicted sentiment score: {score[0][0]:.3f}')


Predicted sentiment score: 1.000


In [21]:
import pickle

pickle.dump(model, open('RNN.pkl', 'wb'))