In [101]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd


In [102]:
data_train = pd.read_json("train.jsonl", lines=True)
data_test = pd.read_json("test.jsonl", lines=True)
data_dev = pd.read_json("dev.jsonl", lines=True)

print(len(data_train), "Training")
print(len(data_test), "Test")
print(len(data_dev), "Dev")

combined_data = pd.concat([data_train, data_dev], ignore_index=True)
combined_data = data_train
# Print the combined DataFrame
#combined_data.head()

print(len(combined_data), "combined")

8739 Training
803 Test
343 Dev
8739 combined


In [103]:
#resample data, oversampling
import pandas as pd
from sklearn.utils import resample

def oversample_and_shuffle(df):
    # Separate classes
    df_class_minus_one = df[df['polarity'] == -1]
    df_class_0 = df[df['polarity'] == 0]
    df_class_1 = df[df['polarity'] == 1]

    # Oversample minority classes (class -1 and class 1 in this case)
    df_class_minus_one_oversampled = resample(df_class_minus_one, replace=True, n_samples=len(df_class_0), random_state=42)
    df_class_1_oversampled = resample(df_class_1, replace=True, n_samples=len(df_class_0), random_state=42)

    # Combine oversampled minority classes with majority class
    df_oversampled = pd.concat([df_class_minus_one_oversampled, df_class_0, df_class_1_oversampled])

    # Shuffle the dataframe to mix the classes
    df_oversampled = df_oversampled.sample(frac=1, random_state=42)

    # Display the counts after oversampling
    print(df_oversampled['polarity'].value_counts())

    return df_oversampled

df = combined_data
combined_data = oversample_and_shuffle(df)

df = data_test
data_test = oversample_and_shuffle(df)

polarity
 1    3028
 0    3028
-1    3028
Name: count, dtype: int64
polarity
 1    319
 0    319
-1    319
Name: count, dtype: int64


In [104]:
combined_data.head()
# Specify the columns to be removed
columns_to_remove = ['mention', 'from', 'to', 'id']

# Use the drop method to remove the specified columns
combined_data = combined_data.drop(columns=columns_to_remove)
data_test = data_test.drop(columns=columns_to_remove)



In [105]:
from tensorflow import keras
from tensorflow.keras import layers

embedding_dim = 100
vocab_size = 18000
max_length = 200

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(max_length,), dtype="int64")

# Embed each integer in a 50-dimensional vector (adjust embedding_dim to match your original model)
x = layers.Embedding(vocab_size, embedding_dim)(inputs)
x = layers.BatchNormalization()(x)

# Add 2 bidirectional LSTMs with similar units and dropout as your original model
x = layers.Bidirectional(layers.LSTM(150, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(150))(x)


# Add a classifier with output shape matching the number of classes (3 in this case)
num_classes = 3  # Number of classes
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)


# Compile model with metrics
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 200)]             0         
                                                                 
 embedding_12 (Embedding)    (None, 200, 100)          1800000   
                                                                 
 batch_normalization_6 (Bat  (None, 200, 100)          400       
 chNormalization)                                                
                                                                 
 bidirectional_27 (Bidirect  (None, 200, 300)          301200    
 ional)                                                          
                                                                 
 bidirectional_28 (Bidirect  (None, 300)               541200    
 ional)                                                          
                                                           

In [106]:
#combined_data = combined_data.drop_duplicates(subset='sentence', keep='first')
#data_test = data_test.drop_duplicates(subset='sentence', keep='first')

In [107]:
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

texts_train = combined_data['sentence']
labels_train = combined_data['polarity']

texts_test = data_test['sentence']
labels_test = data_test['polarity']

# Tokenize the text data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts_train)
sequences_train = tokenizer.texts_to_sequences(texts_train)

tokenizer_test = Tokenizer(num_words=vocab_size)
tokenizer_test.fit_on_texts(texts_test)
sequences_test = tokenizer_test.texts_to_sequences(texts_test)

maxlen = 200
x_train = pad_sequences(sequences_train, maxlen=maxlen)
x_val = pad_sequences(sequences_test, maxlen=maxlen)

y_train = labels_train
y_val = labels_test

In [108]:
num_classes = len(np.unique(y_train))
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_val_categorical = to_categorical(y_val, num_classes=num_classes)

In [109]:
print(len(combined_data), "combined")
print(len(data_test), "test")
print(len(tokenizer.word_index), "word index")

9084 combined
957 test
17271 word index


In [110]:
model.fit(x_train, y_train_categorical, epochs=10, batch_size=64, validation_data=(x_val, y_val_categorical), callbacks=[early_stopping])


# Evaluate the model
loss, accuracy = model.evaluate(x_val, y_val_categorical)
print(f'Loss: {loss}, Accuracy: {accuracy * 100:.2f}%')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss: 1.9601343870162964, Accuracy: 36.36%
