<a href="https://colab.research.google.com/github/RajaAbitha/RajaAbitha/blob/main/Assignment7DeeplearningNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Deeplearning NLP
import pandas as pd
import numpy as np

In [2]:
# Load Dataset
df = pd.read_csv('/content/judge-1377884607_tweet_product_company.csv',encoding='ISO-8859-1')


In [3]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


Preprocessing

In [4]:
import re
def clean_text(text):
    text = str(text).lower()                         # Lowercase
    text = re.sub(r'http\S+', ' ', text)             # Remove URLs
    text = re.sub(r'[^a-z\s]', ' ', text)            # Remove punctuation/numbers
    return text

df['tweet_text'] = df['tweet_text'].apply(clean_text)

In [5]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,wesley i have a g iphone after hrs twe...,iPhone,Negative emotion
1,jessedee know about fludapp awesome ipad i...,iPad or iPhone App,Positive emotion
2,swonderlin can not wait for ipad also the...,iPad,Positive emotion
3,sxsw i hope this year s festival isn t as cra...,iPad or iPhone App,Negative emotion
4,sxtxstate great stuff on fri sxsw marissa m...,Google,Positive emotion


In [6]:
# Changing column name
df = df.rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'})


In [7]:
print(df['sentiment'].value_counts())

sentiment
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: count, dtype: int64


In [8]:
# Map sentiment values to clean labels
sentiment_map = {
    'Positive emotion': 'positive',
    'Negative emotion': 'negative',
    'No emotion toward brand or product': 'neutral',
    "I can't tell": 'no_idea'}

df['sentiment'] = df['sentiment'].map(sentiment_map)


In [9]:
# Drop rows with missing or unmapped sentiment
df.dropna(subset=['sentiment'], inplace=True)


In [10]:
# Preprocessing
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['sentiment_enc'] = le.fit_transform(df['sentiment'])

In [11]:
print({label: int(code) for label, code in zip(le.classes_, le.transform(le.classes_))})

{'negative': 0, 'neutral': 1, 'no_idea': 2, 'positive': 3}


In [11]:
df.isnull().sum()

Unnamed: 0,0
tweet_text,0
emotion_in_tweet_is_directed_at,5802
sentiment,0
sentiment_enc,0


In [12]:
df.drop(columns=['emotion_in_tweet_is_directed_at'], inplace=True)

In [14]:
is_null = df.isnull().sum()
print(is_null)

tweet_text       0
sentiment        0
sentiment_enc    0
dtype: int64


In [15]:
#Tokenize & pad
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

tokenizer = Tokenizer(num_words=8000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['tweet_text'])

sequences = tokenizer.texts_to_sequences(df['tweet_text'])
X= sequence.pad_sequences(sequences, maxlen=30)

In [16]:
# one_hot encoding target
from tensorflow.keras.utils import to_categorical
Y=to_categorical(df['sentiment_enc'],num_classes=4)

In [17]:
# Split the data
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=42)



In [18]:
X_train.shape

(7274, 30)

In [19]:
X_test.shape

(1819, 30)

In [20]:
# Build & Train a SimpleRNN Classifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

model = Sequential()
model.add(Embedding(input_dim=8000,output_dim=100,input_shape=(30,)))
model.add(SimpleRNN(64,dropout=0.2))
model.add(Dense(4,activation='softmax'))



  super().__init__(**kwargs)


In [21]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])


In [22]:
model.summary()

In [23]:
history_rnn = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.5635 - loss: 0.9758 - val_accuracy: 0.6096 - val_loss: 0.8903
Epoch 2/10
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.7096 - loss: 0.7464 - val_accuracy: 0.6014 - val_loss: 0.8890
Epoch 3/10
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.8639 - loss: 0.3996 - val_accuracy: 0.6144 - val_loss: 1.0211
Epoch 4/10
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - accuracy: 0.9116 - loss: 0.2543 - val_accuracy: 0.5753 - val_loss: 1.1590
Epoch 5/10
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9343 - loss: 0.1910 - val_accuracy: 0.6124 - val_loss: 1.1845
Epoch 6/10
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9410 - loss: 0.1621 - val_accuracy: 0.6137 - val_loss: 1.2266
Epoch 7/10
[1m182/18

In [24]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy,{accuracy:4f}")

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5923 - loss: 1.4171
Test Accuracy,0.594832


In [25]:
# prediction
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

print(y_pred)

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
[1 1 3 ... 3 1 3]


In [26]:
# Test sample
test_sample = "this is a great product! It's amazing."
test_sample = clean_text(test_sample)
test_sample = tokenizer.texts_to_sequences([test_sample])
test_sample = sequence.pad_sequences(test_sample, maxlen=20)
prediction = model.predict(test_sample)
prediction = np.argmax(prediction, axis=1)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step
[3]


In [34]:

test_sample1 = "Not sure what to say."
test_sample1 = clean_text(test_sample1)
test_sample1 = tokenizer.texts_to_sequences([test_sample1])
test_sample1 = sequence.pad_sequences(test_sample1, maxlen=10)
prediction = model.predict(test_sample1)
prediction = np.argmax(prediction,)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
1


LSTM Model

In [36]:
from tensorflow.keras.layers import LSTM
model1 = Sequential()
model1.add(Embedding(input_dim=8000, output_dim=100,input_shape=(30,)))
model1.add(LSTM(64, dropout=0.2))
model1.add(Dense(4, activation='softmax'))


In [37]:
model1.compile(loss='categorical_crossentropy',optimizer='adam',  metrics=['accuracy'])

In [38]:
history1=model1.fit(X_train,y_train,epochs=10,batch_size=64,validation_split=0.2)

Epoch 1/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.5631 - loss: 1.0569 - val_accuracy: 0.5986 - val_loss: 0.8852
Epoch 2/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 60ms/step - accuracy: 0.6322 - loss: 0.8488 - val_accuracy: 0.6275 - val_loss: 0.8322
Epoch 3/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.7144 - loss: 0.7127 - val_accuracy: 0.6268 - val_loss: 0.8488
Epoch 4/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 49ms/step - accuracy: 0.8054 - loss: 0.5479 - val_accuracy: 0.6570 - val_loss: 0.8620
Epoch 5/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.8336 - loss: 0.4447 - val_accuracy: 0.6522 - val_loss: 0.8714
Epoch 6/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 61ms/step - accuracy: 0.8657 - loss: 0.3641 - val_accuracy: 0.6543 - val_loss: 0.9482
Epoch 7/10
[1m91/91[0m [32m━━━━

In [40]:
# Evaluate the model
loss, accuracy = model1.evaluate(X_test, y_test)
print(f"Test Accuracy:{accuracy:4f}")
# train accuracy
loss, accuracy = model1.evaluate(X_train, y_train)
print(f"Train Accuracy:{accuracy:4f}")

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6602 - loss: 1.4200
Test Accuracy:0.657504
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9040 - loss: 0.2578
Train Accuracy:0.865549


In [41]:
# Sample test
sample_text = "this is a great product! It's amazing."
sample_text = clean_text(sample_text)
sample_text = tokenizer.texts_to_sequences([sample_text])
sample_text = sequence.pad_sequences(sample_text, maxlen=20)
prediction = model1.predict(sample_text)
prediction = np.argmax(prediction, axis=1)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step
[3]


In [42]:
sample_text = "this is a worst product! waste of money."
sample_text = clean_text(sample_text)
sample_text = tokenizer.texts_to_sequences([sample_text])
sample_text = sequence.pad_sequences(sample_text, maxlen=20)
prediction = model1.predict(sample_text)
prediction = np.argmax(prediction, axis=1)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[0]


In [43]:
sample_text = "not sure what to say"
sample_text = clean_text(sample_text)
sample_text = tokenizer.texts_to_sequences([sample_text])
sample_text = sequence.pad_sequences(sample_text, maxlen=20)
prediction = model1.predict(sample_text)
prediction = np.argmax(prediction, axis=1)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1]
