In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!





In [2]:
# Read the Data from the Given excel file.

data=pd.read_excel(r'twitter_data.xlsx')
data

Unnamed: 0,clean_text,category
0,when modi promised â€œminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [3]:
# Change our dependent variable to categorical

data['category'] = data['category'].map({0: "Neutral", -1: "Negative", 1: "Positive"})

In [4]:
# Do Missing value analysis and drop all null/missing values

data.dropna(inplace=True)

In [5]:
data.isnull().sum()

clean_text    0
category      0
dtype: int64

In [6]:
print("Summary statistics of numerical features : \n", data.describe())

print("\nTotal number of reviews: ",len(data))



Summary statistics of numerical features : 
         clean_text  category
count       162969    162969
unique      162968         3
top           2019  Positive
freq             2     72249

Total number of reviews:  162969


In [7]:
# do Text cleaning

def clean_text(text):
    text = re.sub(r'[^\w\s]','',text) # remove punctuation
    text = text.lower() # transform to lowercase
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words] # remove stopwords
    text = " ".join(words)
    return text

In [8]:
data['clean_text'] = data['clean_text'].apply(lambda x: clean_text(str(x)))

In [9]:
# Split data into dependent(X) and independent(y) dataframe

X = data['clean_text']
y = data['category']

In [10]:
# Perform Operations on Text Data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)

In [12]:
# Build and Train an LSTM model

vocab_size = len(tokenizer.word_index)+1
input_length = X.shape[1]

In [14]:
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=input_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [16]:
#  dummy variable creation for the dependent variable

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = pd.get_dummies(y).values

In [17]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Train the new model

model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x242a7cebfa0>

In [21]:
# Normalizing the prediction as same as the original data
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)




In [22]:
# Measure Performance Metrices and Accuracy

print("Classification Report:")
print(classification_report(np.argmax(y_test, axis=1), y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      7152
           1       0.92      0.87      0.90     11067
           2       0.88      0.89      0.89     14375

    accuracy                           0.87     32594
   macro avg       0.86      0.86      0.86     32594
weighted avg       0.87      0.87      0.87     32594

