<a href="https://colab.research.google.com/github/Soham0410/Sentimental-Analysis/blob/main/Recurrent_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow
!pip install kaggle



In [None]:
# Configuring the path of Kaggle.json file
# kaggle.json must already exist in the current working directory; this cell only
# copies it into ~/.kaggle (created if missing) — it does not generate the token.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Downloading the dataset
# Fetches the kazanova/sentiment140 dataset with the Kaggle CLI; the archive
# (sentiment140.zip) is what the extraction cell below opens.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 95% 77.0M/80.9M [00:00<00:00, 94.1MB/s]
100% 80.9M/80.9M [00:00<00:00, 92.6MB/s]


In [None]:
from zipfile import ZipFile

# Archive downloaded by the Kaggle CLI into the working directory.
dataset = 'sentiment140.zip'

# The archive handle is deliberately NOT named `zip`: notebook cells share one
# global namespace, so binding `zip` here would shadow the builtin for every
# later cell (and leave it pointing at a closed ZipFile).
with ZipFile(dataset, 'r') as archive:
    # Extract every member into the current working directory.
    archive.extractall()
    print('The dataset is extracted')

The dataset is extracted


In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [None]:
# Download stopwords
# Fetches the NLTK 'stopwords' corpus; stemming() later reads the English list
# from it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Data processing
# The CSV ships without a header row, so header=None is required here; without
# it pandas promotes the first tweet to column names and silently drops it from
# the data. Columns are therefore positional (0..5) in this frame.
socialmedia_data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='ISO-8859-1',
)

In [None]:
# Naming columns
# Labels for the six fields of each record, in file order.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

# Re-read the file with explicit column labels (replaces the earlier frame).
socialmedia_data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=column_names,
)

In [None]:
# Replace target values
# Relabel 4 as 1 in the 'target' column; every other value is left unchanged.
socialmedia_data['target'] = socialmedia_data['target'].replace(4, 1)

In [None]:
# Initialize PorterStemmer
# Single shared stemmer instance; stemming() calls port_stem.stem() on each kept word.
port_stem = PorterStemmer()

In [None]:
# Stemming function

# Built once up front. The previous version called stopwords.words('english')
# for every single word and tested membership against the returned list, which
# dominates the runtime when applied to the 1,600,000-tweet dataset. A frozenset
# gives constant-time lookups with identical membership semantics.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Anything that is not an ASCII letter is treated as a separator.
NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise a piece of text into a space-joined string of Porter stems.

    Steps, in order: replace every non-ASCII-letter character with a space,
    lowercase, split on whitespace, drop English stopwords, stem each
    remaining word with the shared ``port_stem`` instance.

    Args:
        content: The text to normalise (a ``str``).

    Returns:
        A single string of stemmed words separated by one space; the empty
        string when no words survive the filtering.
    """
    words = NON_LETTER_PATTERN.sub(' ', content).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [None]:
# Apply stemming
# Run stemming() over every raw tweet and keep the result in a new column
# next to the untouched 'text' column.
socialmedia_data['stemmed_content'] = socialmedia_data['text'].map(stemming)

In [None]:
# Separating the data and label
# X stays a pandas Series (keeps the row index); Y is a plain NumPy array.
X = socialmedia_data.loc[:, 'stemmed_content']
Y = socialmedia_data.loc[:, 'target'].to_numpy()

In [None]:
# Train-test split
# 80/20 split, stratified on the label so both parts keep the same class
# balance; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [None]:
# Tokenization and padding
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding
# layer's input dimension.
max_words = 10000
# Fixed sequence length: passed to pad_sequences(maxlen=...) for training,
# test and single-tweet prediction inputs.
max_len = 100

In [None]:
# Build the word index from the training split only; X_test is not passed to
# fit_on_texts, so test tweets do not influence the vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [None]:
import pickle

In [None]:
# Save the tokenizer
# Persist the fitted tokenizer next to the notebook so the same word index can
# be reused later without refitting.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Convert each split's texts to integer sequences with the fitted tokenizer
# (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [None]:
# Bring every sequence to exactly max_len entries (train first, then test).
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
)

In [None]:
# Building the RNN model
# Stack: embedding -> LSTM(128, full sequence out) -> dropout -> LSTM(64) ->
# dropout -> single sigmoid unit.
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    # return_sequences=True so the second LSTM receives one vector per timestep.
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the model
# validation_split holds back 20% of the training arrays for per-epoch
# validation; the returned History object is kept in `history`.
history = model.fit(
    X_train_pad,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.2,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the model on train data
from sklearn.metrics import recall_score

# Predicted probabilities, then hard 0/1 labels using a 0.5 cut-off.
Y_train_pred_prob = model.predict(X_train_pad)
Y_train_pred = (Y_train_pred_prob > 0.5).astype("int32")

train_accuracy = accuracy_score(Y_train, Y_train_pred)
train_precision = precision_score(Y_train, Y_train_pred)
train_recall = recall_score(Y_train, Y_train_pred)
train_f1 = f1_score(Y_train, Y_train_pred)

# Report the four scores in a fixed order, one per line.
for metric_name, metric_value in (
    ('Accuracy', train_accuracy),
    ('Precision', train_precision),
    ('Recall', train_recall),
    ('F1', train_f1),
):
    print(f'{metric_name} Score on the training data: {metric_value}')



In [None]:
# Evaluate the model on  test data
# Predicted probabilities for the held-out split, then hard 0/1 labels using
# the same 0.5 cut-off as for the training data.
Y_pred_prob = model.predict(X_test_pad)
Y_pred = (Y_pred_prob > 0.5).astype(np.int32)



In [None]:
# Score the thresholded test predictions against the true labels.
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Report the four scores in a fixed order, one per line.
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1', f1),
):
    print(f'{metric_name} Score on the test data: {metric_value}')

Accuracy Score on the test data: 0.784121875
Precision Score on the test data: 0.7664449198487824
Recall Score on the test data: 0.81729375
F1 Score on the test data: 0.7910530375209835


In [None]:
# Saving the trained model
# Written to the working directory under the file name that the load cell
# below reads back ('trained_rnn_model.h5').
model.save('trained_rnn_model.h5')

  saving_api.save_model(


In [None]:
# Load and use the trained model
# Reloads the file written by model.save('trained_rnn_model.h5');
# predict_sentiment() runs its predictions through this loaded copy.
loaded_model = tf.keras.models.load_model('trained_rnn_model.h5')

In [None]:
# Make predictions
def predict_sentiment(text):
    """Classify one piece of text as a positive or negative tweet.

    The text goes through the same pipeline as the training data: stemming(),
    the fitted tokenizer, padding to max_len, then the reloaded model.

    Args:
        text: The text to classify (a ``str``).

    Returns:
        'Positive Tweet' when the predicted probability is above 0.5,
        otherwise 'Negative Tweet'.
    """
    cleaned = stemming(text)
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=max_len)
    # predict() returns one row per input with a single sigmoid output.
    probability = loaded_model.predict(padded)[0][0]
    if probability > 0.5:
        return 'Positive Tweet'
    return 'Negative Tweet'

In [None]:
# Example predictions
# X_test holds text that has ALREADY been through stemming(), and
# predict_sentiment() stems its input again, so passing X_test rows would
# classify doubly-processed text. Look up the raw tweets for the same test
# rows instead (the split keeps the original row index of socialmedia_data).
raw_test_tweets = socialmedia_data.loc[X_test.index, 'text']
print(predict_sentiment(raw_test_tweets.iloc[1000]))
print(predict_sentiment(raw_test_tweets.iloc[3000]))

Negative Tweet
Negative Tweet
