<a href="https://colab.research.google.com/github/Tdas-christ/NLP/blob/main/Sentiment_Analysis_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import CategoricalCrossentropy
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/DATASETS/twitter_training.csv')

In [3]:
df.sample(10)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
60121,3503,Facebook,Irrelevant,During the lockout I will cry about my salary....
49514,6100,FIFA,Negative,The absolute core
20553,12719,WorldOfCraft,Negative,@Warcraft what powers on earth have had you no...
51740,10486,RedDeadRedemption(RDR),Negative,Update: closed and restarted game. Still waiti...
67753,3604,Cyberpunk2077,Neutral,"Fuck, well ok"
72729,8859,Nvidia,Positive,Fuck everything ever except the Nvidia GeForce...
4897,42,Amazon,Neutral,"At the same time, despite the fact that there ..."
64457,7843,MaddenNFL,Negative,Larry Fitzgerald not being in the top 3 on thi...
69951,3985,Cyberpunk2077,Neutral,
20474,12705,WorldOfCraft,Positive,In best Warcraft peon systems work.”


In [4]:
df.shape

(74681, 4)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
df.rename(columns={'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)

In [7]:
df.isnull().sum()

2401             0
Borderlands      0
Positive         0
text           686
dtype: int64

In [8]:
df.isnull().mean()

2401           0.000000
Borderlands    0.000000
Positive       0.000000
text           0.009186
dtype: float64

In [9]:
df = df.dropna()

In [10]:
df.isnull().sum()

2401           0
Borderlands    0
Positive       0
text           0
dtype: int64

In [11]:
df.shape

(73995, 4)

In [12]:
df['Positive'].value_counts()

Negative      22358
Positive      20654
Neutral       18108
Irrelevant    12875
Name: Positive, dtype: int64

In [13]:
#Tokenize the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['text'])

In [14]:
test_df = pd.read_csv('/content/drive/MyDrive/DATASETS/twitter_validation.csv')

In [15]:
test_df.sample(10)

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
115,9149,Nvidia,Neutral,The Nvidia Shield is my daily driver but I lik...
935,5987,HomeDepot,Negative,@HomeDepot Your customer service is atrociou...
55,1725,CallOfDutyBlackopsColdWar,Positive,I like the killstreaks
798,12882,Xbox(Xseries),Positive,Getting strong Netflix vibes and I like it
782,970,AssassinsCreed,Positive,It's kind of incredible what games I'll be get...
385,9864,PlayStation5(PS5),Neutral,I don't know what you people have been doing o...
297,3312,Facebook,Neutral,"House Democrats say Facebook, Amazon, Alphabet..."
213,1551,Battlefield,Irrelevant,"Really good in-depth video, highly recommend."
203,2847,Dota2,Positive,Pre-Covid: Student Game day! Introducing stude...
441,9452,Overwatch,Positive,I started playing Overwatch (on Switch) right ...


In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column                                                                                                                                                                                                                                              Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                                              --------------  ----- 
 0   3364                                                                                                                                                                                                                                                999 non-null    int64 
 1   Facebook                                                                   

In [17]:
test_df.rename(columns={'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣':'text'}, inplace=True)

In [19]:
# Convert text data to sequences
X_train_seq = tokenizer.texts_to_sequences(df['text'])
X_test_seq = tokenizer.texts_to_sequences(test_df['text'])

In [20]:
# Pad sequences to ensure uniform length
X_train_padded = pad_sequences(X_train_seq, maxlen=100)
X_test_padded = pad_sequences(X_test_seq, maxlen=100)

In [23]:
# Prepare labels
y_train = df['Positive']
y_test = test_df['Irrelevant']

In [25]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [26]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_padded, y_train)

In [33]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),  # Assuming 100 is the sequence length
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(4, activation='softmax')  # Using softmax for multiclass classification
])


In [34]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [35]:
# Train the model
model.fit(X_train_resampled, y_train_resampled, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7849a827a680>

In [36]:
# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test_padded, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9279279112815857
