In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]

# Map target to two classes: negative, positive
df['target'] = df['target'].map({0: 0, 4: 1})

# Sample 66,666 instances from each class to get a total of 200,000 samples
neg_df = df[df['target'] == 0].sample(n=66666, random_state=42)
pos_df = df[df['target'] == 1].sample(n=66666, random_state=42)
# Combine the sampled data
df_sampled = pd.concat([neg_df, pos_df])

# Shuffle the combined DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

# Clean the text data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyperlink
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

df['text'] = df['text'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=100)

# One-hot encode the target
Y = df['target'].values

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Load GloVe embeddings (if needed)
# Replace glove_file_path and embedding_dim with your specific GloVe file path and dimensions
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Create an embedding matrix (if using pre-trained embeddings)
# Replace tokenizer.word_index with your tokenizer's word index
def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Path to GloVe file (if using pre-trained embeddings)
glove_file_path = 'glove.twitter.27B.100d.txt'
embedding_dim = 100  # Replace with your GloVe file's dimensions

# Load GloVe embeddings (if using pre-trained embeddings)
embeddings_index = load_glove_embeddings(glove_file_path, embedding_dim)

# Create the embedding matrix (if using pre-trained embeddings)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embeddings_index, embedding_dim)

# Build the model
input = Input(shape=(100,))
x = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(input)
x = SpatialDropout1D(0.5)(x)
x = LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5, return_sequences=True)(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x[:, -1, :])

model = Model(inputs=input, outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.0001, decay=1e-6)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, Y_train, epochs=5, batch_size=10, validation_data=(X_test, Y_test), callbacks=[early_stopping], verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Example of classification report
Y_pred = model.predict(X_test)
Y_pred_classes = (Y_pred > 0.5).astype("int32")
print(classification_report(Y_test, Y_pred_classes))




Epoch 1/5
[1m10667/10667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m412s[0m 38ms/step - accuracy: 0.5797 - loss: 0.6897 - val_accuracy: 0.7143 - val_loss: 0.5967
Epoch 2/5
[1m10667/10667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 37ms/step - accuracy: 0.6462 - loss: 0.6260 - val_accuracy: 0.7302 - val_loss: 0.5749
Epoch 3/5
[1m10667/10667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 41ms/step - accuracy: 0.6598 - loss: 0.6129 - val_accuracy: 0.7375 - val_loss: 0.5528
Epoch 4/5
[1m 2974/10667[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m5:15[0m 41ms/step - accuracy: 0.6658 - loss: 0.6074

KeyboardInterrupt: 

In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report
import tensorflow as tf

# Assuming your Attention layer is defined as before
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[1], 1), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

# Load the dataset
df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]

# Map target to two classes: negative, positive
df['target'] = df['target'].map({0: 0, 4: 1})

# Sample 66,666 instances from each class to get a total of 200,000 samples
neg_df = df[df['target'] == 0].sample(n=66666, random_state=42)
pos_df = df[df['target'] == 1].sample(n=66666, random_state=42)
# Combine the sampled data
df_sampled = pd.concat([neg_df, pos_df])

# Shuffle the combined DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

# Clean the text data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyperlink
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

df['text'] = df['text'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=100)

# One-hot encode the target
Y = df['target'].values

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Create an embedding matrix
def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Path to GloVe file
glove_file_path = 'glove.twitter.27B.100d.txt'
embedding_dim = 100  # Depending on the GloVe file used

# Load GloVe embeddings
embeddings_index = load_glove_embeddings(glove_file_path, embedding_dim)

# Create the embedding matrix
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embeddings_index, embedding_dim)

input = Input(shape=(100,))
x = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(input)
x = SpatialDropout1D(0.5)(x)
x = LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5, return_sequences=True)(x)
x = LSTM(64, activation='relu', dropout=0.5, recurrent_dropout=0.5)(x)
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input, outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.0001, decay=1e-6)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, Y_train, epochs=5, batch_size=10, validation_data=(X_test, Y_test), callbacks=[early_stopping], verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Example of classification report
Y_pred = model.predict(X_test)
Y_pred_classes = (Y_pred > 0.5).astype("int32")
print(classification_report(Y_test, Y_pred_classes))




Epoch 1/5
[1m 3571/10667[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m7:34[0m 64ms/step - accuracy: 0.5086 - loss: 0.7159

KeyboardInterrupt: 

In [13]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import tensorflow as tf

# Load the dataset
df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]

# Map target to two classes: negative, positive
df['target'] = df['target'].map({0: 0, 4: 1})

# Sample 66,666 instances from each class to get a total of 200,000 samples
neg_df = df[df['target'] == 0].sample(n=66666, random_state=42)
pos_df = df[df['target'] == 1].sample(n=66666, random_state=42)
# Combine the sampled data
df_sampled = pd.concat([neg_df, pos_df])

# Shuffle the combined DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

# Clean the text data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyperlink
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

df['text'] = df['text'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=100)

# One-hot encode the target
Y = pd.get_dummies(df['target'].values)

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Build the model
input = Input(shape=(100,2))
x = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=100)(input)  # Use TensorFlow's Embedding layer
x = SpatialDropout1D(0.5)(x)
x = LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5)(x)
###x = LSTM(64, activation='relu', dropout=0.5, recurrent_dropout=0.5)(x)
x = Dropout(0.5)(x)
###x = Dense(64, activation='relu')(x)
output = Dense(2, activation='sigmoid')(x)

model = Model(inputs=input, outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.0001, decay=1e-6)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, Y_train, epochs=5, batch_size=10, validation_data=(X_test, Y_test), callbacks=[early_stopping], verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Example of classification report
Y_pred = model.predict(X_test)
Y_pred_classes = (Y_pred > 0.5).astype("int32")
print(classification_report(Y_test, Y_pred_classes))




ValueError: Input 0 of layer "spatial_dropout1d_11" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 100, 2, 128)

In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import tensorflow as tf

# Load the dataset
df = pd.read_csv('./data.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = df[['text', 'target']]

# Map target to two classes: negative, positive
df['target'] = df['target'].map({0: 0, 4: 1})

# Sample 66,666 instances from each class to get a total of 200,000 samples
neg_df = df[df['target'] == 0].sample(n=66666, random_state=42)
pos_df = df[df['target'] == 1].sample(n=66666, random_state=42)
# Combine the sampled data
df_sampled = pd.concat([neg_df, pos_df])

# Shuffle the combined DataFrame
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_sampled

# Clean the text data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT
    text = re.sub(r'https?://\S+', '', text)  # Remove the hyperlink
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = text.lower()  # Convert to lower case
    return text

df['text'] = df['text'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=100)

# One-hot encode the target
Y = pd.get_dummies(df['target'].values)  # Use one-hot encoding for multi-class classification

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Build the model
input = Input(shape=(100,))
x = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=100)(input)
x = SpatialDropout1D(0.5)(x)
x = LSTM(128, activation='relu', dropout=0.5, recurrent_dropout=0.5)(x)
x = Dropout(0.5)(x)
output = Dense(2, activation='sigmoid')(x)  # Output layer with two neurons for binary classification

model = Model(inputs=input, outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.0001, decay=1e-6)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, Y_train, epochs=5, batch_size=10, validation_data=(X_test, Y_test), callbacks=[early_stopping], verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Example of classification report
Y_pred = model.predict(X_test)
Y_pred_classes = np.argmax(Y_pred, axis=1)  # Get the predicted classes
print(classification_report(np.argmax(Y_test, axis=1), Y_pred_classes))  # Compare with true classes




Epoch 1/5
[1m 3042/10667[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m17:29[0m 138ms/step - accuracy: 0.5369 - loss: 61096.1719

KeyboardInterrupt: 