In [1]:
import pandas as pd

# Reading the data

In [2]:
data = pd.read_csv('Twitter_Data.csv')

In [3]:
print(data.head())

                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


# Changing the dependent variable to categorical. ( 0 to “Neutral,” -1 to “Negative”, 1 to “Positive”)


In [4]:
data['category'] = data['category'].map({0:'Neutral',-1:'Negative',1:'Positive'})

In [5]:
print(data.head())

                                          clean_text  category
0  when modi promised “minimum government maximum...  Negative
1  talk all the nonsense and continue all the dra...   Neutral
2  what did just say vote for modi  welcome bjp t...  Positive
3  asking his supporters prefix chowkidar their n...  Positive
4  answer who among these the most powerful world...  Positive


# Performing Missing value analysis and droping null/missing values

In [6]:
data.isnull().sum()

clean_text    4
category      7
dtype: int64

In [7]:
data = data.dropna()

In [8]:
data.isnull().sum()

clean_text    0
category      0
dtype: int64

# Performing text cleaning. (remove every symbol except alphanumeric, transform all words to lower case, and remove punctuation and stopwords)

In [9]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
def clean(tweet):
    tweet = re.sub(r'[^a-zA-Z0-9\s]', '', tweet)
    tweet = tweet.lower()
    tweet = ' '.join(word for word in tweet.split() if word not in stop_words)
    return tweet

In [12]:
data['clean_text'] = data['clean_text'].apply(clean)

# Creating a new column and find the length of each sentence (how many words they contain)

In [13]:
data['tweet_length'] = data['clean_text'].apply(lambda x: len(x.split()))

In [14]:
print(data.head())

                                          clean_text  category  tweet_length
0  modi promised minimum government maximum gover...  Negative            21
1             talk nonsense continue drama vote modi   Neutral             6
2  say vote modi welcome bjp told rahul main camp...  Positive            13
3  asking supporters prefix chowkidar names modi ...  Positive            19
4  answer among powerful world leader today trump...  Positive            10


# Spliting data into dependent(X) and independent(Y) dataframe

In [15]:
X = data['clean_text']

In [16]:
Y = data['category']

# Performing operations on text data  

In [17]:
import tensorflow as tf

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Performing one-hot encoding for each sentence (using TensorFlow)

In [19]:
Y_encoded = pd.get_dummies(Y)

In [20]:
tokenizer = Tokenizer()

In [21]:
tokenizer.fit_on_texts(X)

In [22]:
X_sequences = tokenizer.texts_to_sequences(X)

### Adding padding from the front side (using Tensorflow)

In [23]:
max_len = max(data['tweet_length'])

In [24]:
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='pre')

### Building an LSTM model and compile it

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [26]:
vocab_size = len(tokenizer.word_index) + 1

In [27]:
embedding_dim = 100

In [28]:
input_length = max_len

In [29]:
model = Sequential()

In [30]:
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))



In [31]:
model.add(LSTM(128, return_sequences=True))

In [32]:
model.add(Dropout(0.5))

In [33]:
model.add(LSTM(64))

In [34]:
model.add(Dense(3, activation='softmax'))

In [35]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
model.summary()

### spliting the data into tests and train 

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X_padded, Y_encoded, test_size=0.2, random_state=42)

In [39]:
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(130375, 43) (32594, 43) (130375, 3) (32594, 3)


# Training new model

In [40]:
history = model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m434s[0m 131ms/step - accuracy: 0.7965 - loss: 0.5162 - val_accuracy: 0.9190 - val_loss: 0.2731
Epoch 2/5
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 128ms/step - accuracy: 0.9307 - loss: 0.2270 - val_accuracy: 0.9147 - val_loss: 0.2788
Epoch 3/5
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m427s[0m 131ms/step - accuracy: 0.9557 - loss: 0.1474 - val_accuracy: 0.9060 - val_loss: 0.3147
Epoch 4/5
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 131ms/step - accuracy: 0.9704 - loss: 0.0964 - val_accuracy: 0.8919 - val_loss: 0.3876
Epoch 5/5
[1m3260/3260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m429s[0m 131ms/step - accuracy: 0.9799 - loss: 0.0630 - val_accuracy: 0.8893 - val_loss: 0.4621


# Normalizing the prediction as same as the original data(prediction might be in decimal, so whoever is nearest to 1 is predicted as yes and set other as 0)

In [41]:
Y_pred_prob = model.predict(X_test)

[1m1019/1019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22ms/step


In [42]:
Y_pred = Y_pred_prob.argmax(axis=1)

In [43]:
Y_test_labels = Y_test.values.argmax(axis=1)

In [44]:
category_mapping = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

In [45]:
Y_pred = [category_mapping[label] for label in Y_pred]

In [46]:
Y_test_labels = [category_mapping[label] for label in Y_test_labels]

# Measuring performance metrics and accuracy

In [47]:
from sklearn.metrics import classification_report, accuracy_score

In [48]:
accuracy = accuracy_score(Y_test_labels, Y_pred)

In [49]:
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.89


# Printing Classification report

In [50]:
print(classification_report(Y_test_labels, Y_pred))

              precision    recall  f1-score   support

    Negative       0.81      0.83      0.82      7152
     Neutral       0.92      0.91      0.91     11067
    Positive       0.89      0.90      0.90     14375

    accuracy                           0.89     32594
   macro avg       0.88      0.88      0.88     32594
weighted avg       0.89      0.89      0.89     32594

