<a href="https://colab.research.google.com/github/Soham0410/Sentimental-Analysis/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook relies on
# (TensorFlow, scikit-learn, matplotlib).
# NOTE(review): no versions are pinned, so a fresh run may pick up newer releases.
!pip install tensorflow scikit-learn matplotlib



In [None]:
# Download the Sentiment140 archive from Kaggle.
# Expects a `kaggle.json` API token file in the current working directory.
!pip install Kaggle
# Copy the token into ~/.kaggle/ (presumably where the Kaggle CLI looks for it)
# and restrict the file to owner read/write.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 95% 77.0M/80.9M [00:00<00:00, 222MB/s]
100% 80.9M/80.9M [00:00<00:00, 199MB/s]


In [None]:
from zipfile import ZipFile

# Unpack the downloaded archive into the current working directory.
dataset = '/content/sentiment140.zip'
# The handle is called `archive`, not `zip`: a cell-level name `zip` would
# shadow the built-in zip() for every cell executed afterwards.
with ZipFile(dataset, 'r') as archive:
    archive.extractall()
print('The dataset is extracted')

The dataset is extracted


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_curve, roc_auc_score
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import nltk
# Fetch the NLTK stop-word corpus; `stopwords.words('english')` needs it on disk.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the extracted CSV into a DataFrame.
# Column names are supplied explicitly via `names=`, and the file is decoded
# as ISO-8859-1.
column_names = ['target','id','date','flag','user','text']
df = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)

In [None]:
# Data preprocessing
# Map the label value 4 to 1 in the `target` column; other values are untouched.
# (Presumably 4 marks positive tweets in the raw file -- confirm against the dataset docs.)
df['target'] = df['target'].replace({4: 1})

# Shared Porter stemmer instance used by the text-cleaning function.
port_stem = PorterStemmer()

In [None]:
# Matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')

# Lazily-filled cache of the English stop words (see _english_stop_words).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def stemming(content):
    """Clean one piece of text and return its stemmed, stop-word-free form.

    Steps: replace every non-letter character with a space, lower-case the
    result, split on whitespace, drop English stop words, stem the remaining
    tokens with the shared ``port_stem`` stemmer, and re-join with single
    spaces.

    Args:
        content: The raw text to process (a ``str``).

    Returns:
        A ``str`` of space-separated stemmed tokens; empty if nothing is left.
    """
    # Fetch the stop words once per call, not once per token: calling
    # stopwords.words() inside the filter rebuilt the list for every token
    # and scanned it linearly, which dominates run time on a large corpus.
    stop_words = _english_stop_words()
    tokens = _NON_LETTER_RE.sub(' ', content).lower().split()
    return ' '.join(port_stem.stem(token) for token in tokens if token not in stop_words)

In [None]:
# Run `stemming` on every value of the `text` column and store the result in a
# new `stemmed_content` column; the `text` column itself is left as is.
df['stemmed_content'] = df['text'].apply(stemming)

In [None]:
# Separating the data and labels:
# X is the `stemmed_content` column (a pandas Series),
# Y is the `target` column unwrapped to its underlying array via `.values`.
X = df['stemmed_content']
Y = df['target'].values

In [None]:
# Tokenization and padding
# max_words is passed to Tokenizer(num_words=...) and Embedding(input_dim=...).
max_words = 5000
# max_len is passed to pad_sequences(maxlen=...) and Embedding(input_length=...).
max_len = 100

In [None]:
# Fit the tokenizer on the stemmed text, then turn every text into a sequence
# of integer indices padded/truncated to `max_len`.
# NOTE: X is rebound here from text to the padded integer matrix, so this cell
# cannot simply be re-run without first re-running the cell that defines X.
# NOTE(review): the tokenizer is fitted before the train/test split, so its
# vocabulary is built from test-set text as well.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=max_len)

In [None]:
# Splitting the dataset: 80% train / 20% test, stratified on the labels,
# with a fixed random_state so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Building the RNN model: embedding -> one LSTM layer -> single sigmoid unit.
# NOTE(review): a non-zero `recurrent_dropout` is documented as preventing the
# fast cuDNN LSTM kernel, which may explain long epoch times -- confirm for the
# installed TensorFlow/Keras version before changing it.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training the model for 5 epochs with mini-batches of 64.
# NOTE(review): `validation_data` is the test split, so the per-epoch
# validation metrics are computed on the same data used for the final
# evaluation further down.
history = model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, Y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
 4461/20000 [=====>........................] - ETA: 1:28:49 - loss: 0.4403 - accuracy: 0.7922

In [None]:
# Evaluating the model on training data
# predict() yields the sigmoid outputs; values above 0.5 become class 1, the rest class 0.
train_predictions_prob = model.predict(X_train)
train_predictions = (train_predictions_prob > 0.5).astype("int32")
train_accuracy = accuracy_score(Y_train, train_predictions)
# target_names supplies the display names used for the two classes in the report.
train_report = classification_report(Y_train, train_predictions, target_names=['Negative', 'Positive'])

In [None]:
# Evaluating the model on test data
# predict() yields the sigmoid outputs; values above 0.5 become class 1, the rest class 0.
# test_predictions_prob is kept because the ROC cell needs the raw scores.
test_predictions_prob = model.predict(X_test)
test_predictions = (test_predictions_prob > 0.5).astype("int32")
test_accuracy = accuracy_score(Y_test, test_predictions)
# target_names supplies the display names used for the two classes in the report.
test_report = classification_report(Y_test, test_predictions, target_names=['Negative', 'Positive'])

In [None]:
# Show accuracy and the per-class report for the training split...
print(f'Accuracy on training data: {train_accuracy}')
print('Training data classification report:')
print(train_report)

# ...followed by the same two items for the test split.
print(f'Accuracy on test data: {test_accuracy}')
print('Test data classification report:')
print(test_report)

In [None]:
# Plotting the ROC curve for test data, computed from the raw predicted
# probabilities rather than the thresholded labels.
fpr, tpr, _ = roc_curve(Y_test, test_predictions_prob)
roc_auc = roc_auc_score(Y_test, test_predictions_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Saving the trained model to a file in the current working directory.
# NOTE(review): the `.h5` suffix presumably selects the HDF5 format -- confirm
# it is still supported by the installed Keras version.
model.save('sentiment_analysis_rnn.h5')

In [None]:
# Loading the model for prediction: read back the file written by model.save()
# so that `loaded_model` comes from disk rather than from the in-memory `model`.
loaded_model = tf.keras.models.load_model('sentiment_analysis_rnn.h5')

In [None]:
# Predicting on a single held-out example (one row of X_test) with the reloaded model
sample_index = 200
# Despite the name, this is the padded integer sequence for the sample, not raw text.
sample_text = X_test[sample_index]
# Wrap the single sequence in an outer array so predict() receives a batch of one.
prediction_prob = loaded_model.predict(np.array([sample_text]))
prediction = (prediction_prob > 0.5).astype("int32")
# Pull out the one predicted label as a plain int, so the comparison below does
# not depend on the truthiness of a one-element array.
predicted_label = int(prediction.ravel()[0])
print(f'Actual sentiment: {"Positive" if Y_test[sample_index] == 1 else "Negative"}')
print(f'Predicted sentiment: {"Positive" if predicted_label == 1 else "Negative"}')