In [3]:
import pandas as pd
from transformers import  AlbertConfig, AlbertModel
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
from transformers import AlbertTokenizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import numpy as np

In [4]:
# Read the synthetic training and validation splits from the working directory.
train_df = pd.read_csv("synthetic_train.csv")
val_df = pd.read_csv("synthetic_val.csv")

# Learn the sentiment -> integer id mapping from the training split only,
# then apply that same mapping to both splits so their ids agree.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['sentiment'])
train_df['label_encoded'] = label_encoder.transform(train_df['sentiment'])
val_df['label_encoded'] = label_encoder.transform(val_df['sentiment'])


In [5]:
# Pretrained tokenizer for the 'albert-base-v2' checkpoint; in this notebook it
# is used only to turn sentences into token ids.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')


def tokenize_text(text, max_length=128):
    """Encode ``text`` as a list of token ids of length ``max_length``.

    Special tokens are added, and the tokenizer is asked to truncate longer
    inputs and pad shorter ones to ``max_length`` so every row has the same
    length.

    Args:
        text: The sentence to encode.
        max_length: Target sequence length. Defaults to 128, which is the
            input length the model in this notebook is built for; change it
            only together with the model's input shape.

    Returns:
        A list of integer token ids.
    """
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )


In [6]:
# Add an 'input_ids' column to each split holding the token-id list that
# tokenize_text produces for the row's 'sentence'.
for split_df in (train_df, val_df):
    split_df['input_ids'] = split_df['sentence'].apply(tokenize_text)



In [7]:
# Hyperparameter container. The Keras model built in this notebook reads only
# vocab_size and hidden_size from it; the remaining fields matter only if a
# transformers ALBERT model is instantiated from this config.
albert_config = AlbertConfig(
    vocab_size=30000,            # Size of the vocabulary
    hidden_size=512,            # Size of the hidden layers
    num_hidden_layers=10,        # Number of hidden layers
    num_attention_heads=8,       # Number of attention heads; hidden_size should be divisible by this (512 / 8 = 64 per head)
    intermediate_size=1024,     # Size of the intermediate (feed-forward) layers
    hidden_dropout_prob=0.2,    # Dropout probability for hidden layers
    attention_probs_dropout_prob=0.2,  # Dropout probability for attention scores
    max_position_embeddings=128,  # Maximum position embeddings (adjust based on your sequence length)
    type_vocab_size=2,          # Number of token type (segment) ids, e.g. 0 for the first sentence and 1 for the second
)



In [8]:
# Classifier: token ids -> learned embeddings -> mean over the sequence axis
# -> softmax over the sentiment classes. Only the sizes come from
# albert_config; no pretrained ALBERT weights are used here.
input_layer = Input(shape=(128,), dtype=tf.int32)
embedding_layer = Embedding(
    input_dim=albert_config.vocab_size,
    output_dim=albert_config.hidden_size,
)(input_layer)
pooling_layer = GlobalAveragePooling1D()(embedding_layer)
output_layer = Dense(
    len(label_encoder.classes_),  # one output unit per encoded sentiment label
    activation='softmax',
)(pooling_layer)

model = Model(inputs=input_layer, outputs=output_layer)

# The output layer already applies softmax, so the loss takes probabilities
# (from_logits=False) together with integer class ids.
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)


In [9]:
# Stack the per-row token-id lists into NumPy arrays for Keras
# (expected shape: n_rows x 128, given tokenize_text pads to a fixed length).
train_input_ids = np.array(train_df['input_ids'].tolist())
val_input_ids = np.array(val_df['input_ids'].tolist())

In [10]:

# Train for 10 epochs on the training split, reporting loss/accuracy on the
# validation split after each epoch; the returned object is kept in `history`.
history = model.fit(
    x=train_input_ids,
    y=train_df['label_encoded'].values,
    validation_data=(val_input_ids, val_df['label_encoded'].values),
    batch_size=128,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# NOTE: despite the "test" naming, this cell scores the model on val_df, the
# same split that was passed as validation_data during training.
test_input_ids = np.array(val_df['input_ids'].tolist())
test_loss, test_accuracy = model.evaluate(
    x=test_input_ids,
    y=val_df['label_encoded'].values,
)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Test Loss: 1.0434, Test Accuracy: 0.6488


# NOTE(review): test_df is not created by any cell shown in this notebook. A
# held-out frame with 'input_ids' and 'label_encoded' columns (built with the
# same tokenization and label encoding as train/val) must exist before this
# can run.
# The token ids are converted to a NumPy array, matching how the validation
# split is passed to model.evaluate earlier in the notebook.
test_loss, test_accuracy = model.evaluate(
    np.array(test_df['input_ids'].to_list()),
    test_df['label_encoded'].values,
)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Persist the trained model under the relative path 'sentiment_model'.
# NOTE(review): no file extension is given, so the on-disk format (and whether
# this call is accepted at all) presumably depends on the installed Keras
# version — confirm before relying on it.
model.save('sentiment_model')

In [None]:
# Text to classify; left empty here as a placeholder to fill in.
sample_text = ""

# Tokenize and wrap in an outer list so the model receives a batch of one.
encoded_text = np.array([tokenize_text(sample_text)])

# Row 0 of the prediction is the score vector for the single input; despite
# its name, predicted_class holds one score per class, not a class id.
predicted_class = model.predict(encoded_text)[0]

# Map the highest-scoring index back to its original sentiment label.
top_index = np.argmax(predicted_class)
predicted_sentiment = label_encoder.inverse_transform([top_index])[0]
print(f"Predicted Sentiment: {predicted_sentiment}")