Import Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

Read the data from the given CSV file

In [5]:
# Load the tweet dataset from its CSV file into a DataFrame.
# NOTE(review): the absolute "/content/..." location looks Colab-specific — confirm before running elsewhere.
csv_path = "/content/Twitter_Data.csv"
data = pd.read_csv(csv_path)

In [16]:
# Preview the first three rows.
# NOTE(review): this cell's execution count (16) is later than that of the
# cleaning cell (15), so the saved output already shows the mapped `category`
# labels and the derived `length` column rather than the raw file contents.
data.head(n=3)

Unnamed: 0,clean_text,category,length
0,modi promised minimum government maximum gover...,Negative,21
1,talk nonsense continue drama vote modi,Neutral,6
2,say vote modi welcome bjp told rahul main camp...,Positive,13


Change our dependent variable to categorical (0 to “Neutral”, -1 to “Negative”, 1 to “Positive”).

In [6]:
# Changing the dependent variable to categorical.
# Series.map converts every value that is not a key of the mapping into NaN, so
# running this cell a second time (when the column already holds the string
# labels) would blank the whole column and the later dropna() would then remove
# every row. Map only while the column still holds the numeric codes.
category_labels = {0: "Neutral", -1: "Negative", 1: "Positive"}
already_labelled = data['category'].dropna().isin(list(category_labels.values())).all()
if not already_labelled:
    data['category'] = data['category'].map(category_labels)

Do missing value analysis and drop all null/missing values

In [7]:
# Discard every row that has a null/missing value in any of its columns.
data = data.dropna(axis=0, how='any')

Do text cleaning (remove every symbol except alphanumeric characters, transform all words to lower case, and remove punctuation and stopwords).

In [8]:
# Text cleaning
# Matches everything that is not a letter, a digit or whitespace. `\w` also
# matches the underscore, so it is listed explicitly to keep the cleaned text
# purely alphanumeric.
_NON_ALNUM_RE = re.compile(r'[^\w\s]|_')

# Filled on first use so the stop-word list is loaded once instead of once per row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop words as a frozenset, built once and then reused."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text):
    """Normalise one piece of text before tokenisation.

    Steps, in order: remove every character that is not alphanumeric or
    whitespace (underscores included), lower-case the text, tokenise it with
    ``word_tokenize`` and drop English stop words.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; an empty string when no
        token survives.
    """
    text = _NON_ALNUM_RE.sub('', text)  # remove punctuation, symbols and underscores
    text = text.lower()  # transform to lowercase
    stop_words = _english_stop_words()
    words = [word for word in word_tokenize(text) if word not in stop_words]  # remove stopwords
    return " ".join(words)

Create a new column and find the length of each sentence (how many words they contain)

In [15]:
# Run the cleaning function over every tweet.
data['clean_text'] = data['clean_text'].apply(clean_text)

# Creating a new column for the length of each sentence (number of words).
# split() without an argument splits on runs of whitespace and returns [] for
# an empty string, whereas split(" ") returns [''] and would count one word
# for a tweet that is empty after cleaning.
data['length'] = data['clean_text'].apply(lambda x: len(str(x).split()))

Split data into independent (X) and dependent (y) variables and perform operations on the text data

In [17]:
# Independent variable (X): the cleaned tweet text.
# Dependent variable (y): the sentiment category.
tweet_texts = data['clean_text']
y = data['category']

# Integer-encode each sentence (this is an index encoding, not one-hot):
# fit the tokenizer's vocabulary on the texts, then turn every text into a
# list of word indices. '<OOV>' is the token used for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=5000)
tokenizer.fit_on_texts(tweet_texts)
X = tokenizer.texts_to_sequences(tweet_texts)

In [18]:
# Pad at the front ('pre') of each index sequence; the padded result is the
# 2-D array whose second dimension (X.shape[1]) sizes the embedding input.
X = pad_sequences(sequences=X, padding='pre')

In [19]:
# Two stacked LSTM layers over word embeddings, each followed by dropout,
# ending in a 3-way softmax (one unit per sentiment class).
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]),
    LSTM(units=64, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(units=64),
    Dropout(0.2),
    Dense(units=3, activation='softmax'),
])

In [20]:
# Dummy (one-hot) encode the dependent variable: one column per category,
# which is the target layout categorical cross-entropy is paired with here.
y = pd.get_dummies(y)

# Configure training: Adam optimizer, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [21]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the network on the training portion.
model.fit(X_train, y_train, batch_size=32, epochs=10)

# Bring predictions and targets to the same form: the index of the largest
# value in each row (predicted scores / one-hot target columns).
y_pred = model.predict(X_test).argmax(axis=1)
y_test = y_test.values.argmax(axis=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate the model on the held-out test split.
# Every metric compares the true test labels with the test predictions; the
# training inputs (X_train) are token sequences, not labels, and do not even
# have the same number of rows as the test predictions.
y_true = np.asarray(y_test)
if y_true.ndim == 2:  # still one-hot encoded -> collapse to class indices
    y_true = y_true.argmax(axis=1)

# AUC-ROC and log-loss are defined on class probabilities, not on hard labels,
# so the softmax scores are recomputed here. They are cast to float64 and
# renormalised so each row sums to 1 despite float32 rounding.
y_proba = np.asarray(model.predict(X_test), dtype=np.float64)
y_proba = y_proba / y_proba.sum(axis=1, keepdims=True)
class_ids = np.arange(y_proba.shape[1])  # column index == class index
y_hat = y_proba.argmax(axis=1)

results = confusion_matrix(y_true, y_hat)
print ('Confusion Matrix :')
print(results)
print ('Accuracy Score is',accuracy_score(y_true, y_hat))
print ('Classification Report : ')
print (classification_report(y_true, y_hat))
# Multiclass AUC needs an explicit averaging scheme; one-vs-rest is used here.
print('AUC-ROC:',roc_auc_score(y_true, y_proba, multi_class='ovr', labels=class_ids))
print('LOGLOSS Value is',log_loss(y_true, y_proba, labels=class_ids))

In [22]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.82      0.81      4301
           1       0.90      0.91      0.90      6641
           2       0.90      0.88      0.89      8736

    accuracy                           0.88     19678
   macro avg       0.87      0.87      0.87     19678
weighted avg       0.88      0.88      0.88     19678

