In [3]:


# After uploading the CSV files, proceed with data loading and preprocessing

import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Load the train.csv file
train_df = pd.read_csv('/content/train.csv')

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove non-alphabetic characters and extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    return text

# Apply preprocessing to the 'text' column
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)

# Split the dataset into features and labels
X = train_df['cleaned_text']
y = train_df['sentiment']

# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)

# Convert the text to sequences
X_sequences = tokenizer.texts_to_sequences(X)

# Pad the sequences to ensure uniform input length
max_length = 200  # Maximum length of the sequence
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='post')

# Convert the labels to categorical format (one-hot encoding)
y_categorical = pd.get_dummies(y).values

# Split the data into training and testing sets
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))  # 3 classes: positive, neutral, negative

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_lstm, y_train_lstm, epochs=500, batch_size=32, validation_data=(X_test_lstm, y_test_lstm))

# Evaluate the model
lstm_accuracy = model.evaluate(X_test_lstm, y_test_lstm)
print(f"LSTM Model Accuracy: {lstm_accuracy[1] * 100:.2f}%")

# Optionally, you can save the trained model for later use
model.save('sentiment_analysis_lstm_model.h5')


Epoch 1/500




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 722ms/step - accuracy: 0.3033 - loss: 1.1020 - val_accuracy: 0.3158 - val_loss: 1.1000
Epoch 2/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 671ms/step - accuracy: 0.3434 - loss: 1.1001 - val_accuracy: 0.3158 - val_loss: 1.1029
Epoch 3/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 586ms/step - accuracy: 0.2651 - loss: 1.1046 - val_accuracy: 0.3421 - val_loss: 1.1032
Epoch 4/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 590ms/step - accuracy: 0.3393 - loss: 1.1025 - val_accuracy: 0.3421 - val_loss: 1.1022
Epoch 5/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 903ms/step - accuracy: 0.3639 - loss: 1.0933 - val_accuracy: 0.3421 - val_loss: 1.1016
Epoch 6/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 584ms/step - accuracy: 0.3908 - loss: 1.0908 - val_accuracy: 0.3158 - val_loss: 1.1038
Epoch 7/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━



LSTM Model Accuracy: 34.21%


In [5]:
# Assuming the test data is in the same format as the training data
# Load the test dataset (replace with your actual test data file path)
test_df = pd.read_csv('/content/test - train.csv')  # Replace with actual test data file path

# Preprocess the test data (same preprocessing as for training data)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)
X_test_data = tokenizer.texts_to_sequences(test_df['cleaned_text'])
X_test_data_padded = pad_sequences(X_test_data, maxlen=max_length, padding='post')

# Predict the sentiment for the test data
predictions = model.predict(X_test_data_padded)

# Convert the predictions from one-hot encoding to the actual sentiment labels
predicted_labels = pd.DataFrame(predictions, columns=['negative', 'neutral', 'positive']).idxmax(axis=1)

# Prepare the submission DataFrame with the exact format of the sample_submission.csv
submission_df = pd.DataFrame({
    'id': test_df['id'],  # Ensure 'id' is taken from the test data
    'sentiment': predicted_labels  # The predicted sentiment labels
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission(2w).csv', index=False)



[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 101ms/step


In [6]:
# Install Hugging Face Transformers and PyTorch
!pip install transformers
!pip install torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import pandas as pd

# Load BanglaBERT tokenizer and model (you can use a general BERT model for other languages)
tokenizer = BertTokenizer.from_pretrained('sagorsarker/banglabert')
model = BertForSequenceClassification.from_pretrained('sagorsarker/banglabert', num_labels=3)

# Load your dataset (e.g., train.csv)
train_df = pd.read_csv('/content/train.csv')

# Tokenize the text data
def encode_text(text):
    return tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")

train_df['encoded'] = train_df['text'].apply(encode_text)

# Prepare the labels (assuming 'sentiment' is the column with labels)
labels = train_df['sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0}).values

# Convert to PyTorch tensors
input_ids = torch.cat([x['input_ids'] for x in train_df['encoded']]).squeeze()
attention_mask = torch.cat([x['attention_mask'] for x in train_df['encoded']]).squeeze()
labels_tensor = torch.tensor(labels)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(input_ids, labels_tensor, test_size=0.2)

# Define the training dataset
train_dataset = torch.utils.data.TensorDataset(X_train, attention_mask[X_train], y_train)
val_dataset = torch.utils.data.TensorDataset(X_val, attention_mask[X_val], y_val)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the pre-trained model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the trained model
trainer.save_model("sentiment_analysis_banglabert_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: sagorsarker/banglabert is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [17]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Load the tokenizer and model (use 'bert-base-uncased' for general BERT or a Bangla-specific model)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 classes: positive, neutral, negative

# Define the function to compute accuracy during evaluation
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)  # Convert logits to predicted labels
    accuracy = accuracy_score(labels, predictions)
    return {'eval_accuracy': accuracy}

# Load your dataset (replace with the actual path to your dataset)
train_df = pd.read_csv('/content/train.csv')

# Tokenize the text data with padding and truncation
def encode_text(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

train_df['encoded'] = train_df['text'].apply(encode_text)

# Prepare the labels (assuming 'sentiment' column contains labels)
labels = train_df['sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0}).values

# Convert to PyTorch tensors
input_ids = torch.cat([x['input_ids'] for x in train_df['encoded']]).squeeze(1)  # Remove the extra dimension
attention_mask = torch.cat([x['attention_mask'] for x in train_df['encoded']]).squeeze(1)
labels_tensor = torch.tensor(labels)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(input_ids, labels_tensor, test_size=0.2, random_state=42)

# Split the attention mask in the same way
attention_mask_train, attention_mask_val = train_test_split(attention_mask, test_size=0.2, random_state=42)

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Define the training and validation datasets
train_dataset = CustomDataset(X_train, attention_mask_train, y_train)
val_dataset = CustomDataset(X_val, attention_mask_val, y_val)

# Define the DataLoader for batching
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

# # Define the training arguments
# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=1,              # number of training epochs
#     per_device_train_batch_size=8,   # batch size for training
#     per_device_eval_batch_size=16,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     logging_steps=10,
#     evaluation_strategy="epoch",     # evaluation strategy to adopt during training
# )

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,                         # the pre-trained model
#     args=training_args,                  # training arguments
#     train_dataset=train_dataset,         # training dataset
#     eval_dataset=val_dataset             # evaluation dataset
# )




# # Train the model
# trainer.train()

# # Evaluate the model
# trainer.evaluate()

# # Save the trained model
# trainer.save_model("sentiment_analysis_bert_model")
# Update TrainingArguments with eval_strategy (since evaluation_strategy is deprecated)
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    eval_strategy="epoch",           # evaluation strategy to adopt during training (fixing deprecated argument)
    save_strategy="epoch",           # save model at the end of each epoch
    load_best_model_at_end=True,     # load the best model when training finishes
    metric_for_best_model="eval_accuracy",  # monitor accuracy for early stopping
    greater_is_better=True,          # we want higher accuracy
    learning_rate=5e-5,              # learning rate
)

# Early stopping callback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

# Initialize the Trainer with the compute_metrics function
trainer = Trainer(
    model=model,                         # the pre-trained model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    callbacks=[early_stopping_callback],  # early stopping callback
    compute_metrics=compute_metrics      # add compute_metrics function for accuracy
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the trained model
trainer.save_model("sentiment_analysis_bert_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0664,0.998643,0.5
2,1.0722,0.911412,0.578947
3,0.9549,0.833071,0.605263


In [19]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
from torch.utils.data import DataLoader

# Load the fine-tuned BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('sentiment_analysis_bert_model')  # Path to your saved model

# Load the test dataset
test_df = pd.read_csv('/content/test - train.csv')  # Replace with your actual test data file path

# Tokenize the test data
def encode_text(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

test_df['encoded'] = test_df['text'].apply(encode_text)

# Prepare the test data tensors
input_ids_test = torch.cat([x['input_ids'] for x in test_df['encoded']]).squeeze(1)
attention_mask_test = torch.cat([x['attention_mask'] for x in test_df['encoded']]).squeeze(1)

# Create DataLoader for the test set
test_dataset = torch.utils.data.TensorDataset(input_ids_test, attention_mask_test)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Make predictions
model.eval()  # Set model to evaluation mode
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids_batch, attention_mask_batch = batch
        outputs = model(input_ids_batch, attention_mask=attention_mask_batch)
        logits = outputs.logits
        predicted_labels_batch = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_labels_batch.cpu().numpy())

# Map numeric predictions to sentiment labels
sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
predicted_labels = [sentiment_map[label] for label in predictions]

# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'id': test_df['id'],  # Ensure 'id' is taken from the test data
    'sentiment': predicted_labels  # The predicted sentiment labels
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_bert(erly).csv', index=False)

# Optionally, download the submission file in Google Colab
from google.colab import files
files.download('submission_bert(erly).csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>