In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8804996332000824753
xla_global_id: -1
]


In [4]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


TensorFlow version: 2.10.0
Num GPUs Available:  0


In [5]:
data = pd.read_csv("C:/Users/sarth/Downloads/Datasets/sentiment analysis/Twitter_Data.csv")
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [6]:
# Ensure column names are correct
text_column = 'clean_text'  # Update if column name differs
category_column = 'category'

In [7]:
# Preprocessing: Extract text and labels
texts = data[text_column].astype(str).tolist()
labels = data[category_column].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [9]:
# Tokenization and Padding
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [10]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [11]:
X_train_padded = pad_sequences(X_train_seq, maxlen=100, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=100, padding='post', truncating='post')


In [12]:
print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))

Unique values in y_train: [-1.  0.  1. nan]
Unique values in y_test: [-1.  0.  1.]


In [13]:
y_train = np.clip(y_train, 0, 2)  # Ensure values are within [0, 2]
y_test = np.clip(y_test, 0, 2)

y_train = y_train.astype(int)
y_test = y_test.astype(int)

y_train = np.nan_to_num(y_train, nan=0).astype(int)
y_test = np.nan_to_num(y_test, nan=0).astype(int)


In [14]:
from scipy.stats import mode
import numpy as np

# Example data
y_train = np.array([1, 2, -2147483648, 2, -2147483648, 2])
filtered_y_train = y_train[y_train != -2147483648]

print("Length of filtered_y_train:", len(filtered_y_train))

# Ensure the mode computation works
most_frequent_label = mode(filtered_y_train, keepdims=True).mode[0]

# Replace invalid labels with the most frequent label
y_train = np.where(y_train == -2147483648, most_frequent_label, y_train)

print("Updated y_train:", y_train)


Length of filtered_y_train: 4
Updated y_train: [1 2 2 2 2 2]


In [15]:
# Convert labels to one-hot encoding
y_train_one_hot = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_test_one_hot = tf.keras.utils.to_categorical(y_test, num_classes=3)

In [16]:
# Model Architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=128, input_length=100),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')  # 3 classes: -1, 0, 1
])

In [17]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [18]:
# Display the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 100, 64)           49408     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 64)                2112      
                                                                 
 dense_1 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1,344,131
Trainable params: 1,344,131
Non-trainable params: 0
_________________________________________________________________


In [19]:
print("Shape of X_train_padded:", X_train_padded.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_train_one_hot:", y_train_one_hot.shape)


Shape of X_train_padded: (130384, 100)
Shape of y_train: (6,)
Shape of y_train_one_hot: (6, 3)


In [20]:
from scipy.stats import mode
import numpy as np

# Replace invalid values (-2147483648) and NaN in y_train
y_train = np.nan_to_num(y_train, nan=-2147483648)
filtered_y_train = y_train[y_train != -2147483648]

# Determine the most frequent label
most_frequent_label = mode(filtered_y_train, keepdims=True).mode[0]

# Replace invalid labels with the most frequent label
y_train = np.where(y_train == -2147483648, most_frequent_label, y_train)

# Check alignment with X_train_padded
if len(X_train_padded) != len(y_train):
    print(f"Mismatch: {len(X_train_padded)} samples in X_train, but {len(y_train)} labels in y_train.")
    y_train = np.resize(y_train, len(X_train_padded))  # Resize y_train to match X_train

# One-hot encode the labels
from tensorflow.keras.utils import to_categorical
y_train_one_hot = to_categorical(y_train, num_classes=3)

Mismatch: 130384 samples in X_train, but 6 labels in y_train.


In [21]:
# Train the model
history = model.fit(
    X_train_padded, y_train_one_hot,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_padded, y_test_one_hot)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_padded, y_test_one_hot)
print(f"Test Accuracy: {test_acc:.2f}")

Test Accuracy: 0.00


In [24]:
# Classification Report
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_classes, target_names=['Negative', 'Neutral', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00   18213.0
     Neutral       0.00      0.00      0.00   14383.0
    Positive       0.00      0.00      0.00       0.0

    accuracy                           0.00   32596.0
   macro avg       0.00      0.00      0.00   32596.0
weighted avg       0.00      0.00      0.00   32596.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
