In [1]:
import nltk
nltk.download('twitter_samples')

from nltk.corpus import twitter_samples

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [2]:
# Load the positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

print("Number of positive tweets:", len(positive_tweets))
print("Number of negative tweets:", len(negative_tweets))


print(positive_tweets[0])

Number of positive tweets: 5000
Number of negative tweets: 5000
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


In [3]:
import numpy as np
import nltk
import tensorflow
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Tokenization, Stopword Removal, and Stemming
def preprocess_tweet(tweet):
    # Tokenization
    tokens = word_tokenize(tweet)

    # Remove noise (non-alphabetic characters)
    tokens = [token for token in tokens if token.isalpha()]

    # Remove stopwords (retaining negation words)
    stop_words = set(stopwords.words('english'))
    negation_words = set(["not", "no", "never"])  # Negation words to retain
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words or token.lower() in negation_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    return stemmed_tokens

# Combine positive and negative tweets into a single list
all_tweets = positive_tweets + negative_tweets

# Preprocess tweets
preprocessed_tweets = [preprocess_tweet(tweet) for tweet in all_tweets]

# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_tweets)

# Convert text data to sequences of indices
sequences = tokenizer.texts_to_sequences(preprocessed_tweets)

# Pad sequences to a fixed length

# Find the maximum length of sequences
max_length = max([len(seq) for seq in sequences])
print("Maximum sequence length:", max_length)

padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Create labels for positive and negative tweets (1 for positive, 0 for negative)
labels = np.concatenate((np.ones(len(positive_tweets)), np.zeros(len(negative_tweets))))

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Maximum sequence length: 28


In [8]:
print(padded_sequences[0])

[ 375  256 1048  410  266   50    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [6]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report

# Define the RNN model
def create_rnn_model(input_dim, output_dim, embedding_dim=128, rnn_units=64):
    model = Sequential()
    model.add(Embedding(input_dim, embedding_dim, input_length=max_length))
    model.add(SimpleRNN(rnn_units))
    model.add(Dense(1, activation='sigmoid'))
    return model

# Create the RNN model
model = create_rnn_model(input_dim=len(tokenizer.word_index) + 1, output_dim=1)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.721
Classification Report:
              precision    recall  f1-score   support

         0.0       0.72      0.72      0.72       988
         1.0       0.73      0.72      0.72      1012

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000

