In [1]:
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Input, Dropout, Layer
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.model_selection import train_test_split
import keras.backend as K

In [2]:
# Load the dataset
df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]

# Map target to two classes: negative, positive
df['target'] = df['target'].map({0: 0, 4: 1})

# Sample 66,666 instances from each class to get a total of 200,000 samples
neg_df = df[df['target'] == 0].sample(n=66666, random_state=42)
pos_df = df[df['target'] == 1].sample(n=66666, random_state=42)
# Combine the sampled data
df_sampled = pd.concat([neg_df, pos_df])

# Shuffle the combined DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

# Clean the text data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyperlink
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

df['text'] = df['text'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=100)

# One-hot encode the target
Y = df['target'].values

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [4]:
# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Create an embedding matrix
def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Path to GloVe file
glove_file_path = 'glove.twitter.27B.100d.txt'
embedding_dim = 100  # Depending on the GloVe file used

# Load GloVe embeddings
embeddings_index = load_glove_embeddings(glove_file_path, embedding_dim)

# Create the embedding matrix
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embeddings_index, embedding_dim)

In [7]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[1], 1), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.tanh(tf.matmul(x, self.W) + self.b)
        a = tf.nn.softmax(e, axis=1)
        output = x * a
        return tf.reduce_sum(output, axis=1)

# Assuming `tokenizer` and `embedding_matrix` are defined
tokenizer = {'word_index': {'word1': 1, 'word2': 2}}  # Example tokenizer
embedding_matrix = np.random.rand(len(tokenizer['word_index']) + 1, 100)  # Example embedding matrix

# Model definition
input = Input(shape=(100,))
x = Embedding(input_dim=len(tokenizer['word_index']) + 1, output_dim=100, weights=[embedding_matrix], trainable=False)(input)
x = SpatialDropout1D(0.5)(x)
x = Bidirectional(LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(x)
x = Attention()(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input, outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.0001, decay=1e-6)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model summary
model.summary()



In [8]:
# Final training on the full dataset
history = model.fit(X_train, Y_train, epochs=5, batch_size=10, validation_data=(X_test, Y_test), callbacks=[early_stopping], verbose=2)

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node functional_1/embedding_2_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\asyncio\windows_events.py", line 322, in run_forever

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\asyncio\base_events.py", line 641, in run_forever

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\asyncio\base_events.py", line 1987, in _run_once

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\asyncio\events.py", line 88, in _run

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\Prakhar\AppData\Local\Temp\ipykernel_19072\3408593971.py", line 2, in <module>

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 318, in fit

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 121, in one_step_on_iterator

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 108, in one_step_on_data

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 51, in train_step

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\layers\layer.py", line 882, in __call__

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\models\functional.py", line 175, in call

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\ops\function.py", line 171, in _run_through_graph

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\models\functional.py", line 556, in call

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\layers\layer.py", line 882, in __call__

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\layers\core\embedding.py", line 140, in call

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\ops\numpy.py", line 4875, in take

  File "c:\Users\Prakhar\anaconda3\envs\lat\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 1951, in take

indices[5,77] = 22 is not in [0, 3)
	 [[{{node functional_1/embedding_2_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_5650]

In [10]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Bidirectional, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report

# Assuming your Attention layer is defined as before
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[1], 1), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

# Load the dataset
df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]

# Map target to two classes: negative, positive
df['target'] = df['target'].map({0: 0, 4: 1})

# Sample 66,666 instances from each class to get a total of 200,000 samples
neg_df = df[df['target'] == 0].sample(n=66666, random_state=42)
pos_df = df[df['target'] == 1].sample(n=66666, random_state=42)
# Combine the sampled data
df_sampled = pd.concat([neg_df, pos_df])

# Shuffle the combined DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

# Clean the text data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyperlink
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

df['text'] = df['text'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=100)

# One-hot encode the target
Y = df['target'].values

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Create an embedding matrix
def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Path to GloVe file
glove_file_path = 'glove.twitter.27B.100d.txt'
embedding_dim = 100  # Depending on the GloVe file used

# Load GloVe embeddings
embeddings_index = load_glove_embeddings(glove_file_path, embedding_dim)

# Create the embedding matrix
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embeddings_index, embedding_dim)

# Build the model
input = Input(shape=(100,))
x = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(input)
x = SpatialDropout1D(0.5)(x)
x = Bidirectional(LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(x)
x = Attention()(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input, outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.0001, decay=1e-6)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_data=(X_test, Y_test), callbacks=[early_stopping], verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Example of classification report
Y_pred = model.predict(X_test)
Y_pred_classes = (Y_pred > 0.5).astype("int32")
print(classification_report(Y_test, Y_pred_classes))


KeyboardInterrupt: 