<a href="https://www.kaggle.com/code/sibindratimalsina/nepali-sentiment-analysis-model-training?scriptVersionId=134555064" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
from transformers import TFAutoModel , AutoTokenizer

import matplotlib.pyplot as plt
import os 
import pandas as pd
import torch

from sklearn.preprocessing import LabelEncoder
import numpy as np
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler

In [None]:
# Show which datasets are mounted under the Kaggle input directory.
input_listing = os.listdir('/kaggle/input/')
print(input_listing)

# Read the train and test splits of the sentiment dataset, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sentiment-analysis/train.csv',
        '/kaggle/input/sentiment-analysis/test.csv',
    )
)

In [None]:
# Per-label row counts for each split (these feed the bar charts below).
train_labels = df_train['label'].value_counts()
test_labels = df_test['label'].value_counts()

# One figure, two side-by-side panels: train on the left, test on the right.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (axs[0], train_labels, 'Train Dataset - Label Distribution'),
    (axs[1], test_labels, 'Test Dataset - Label Distribution'),
)
for axis, counts, title in panels:
    axis.bar(counts.index, counts.values)
    axis.set_xlabel('Labels')
    axis.set_ylabel('Count')
    axis.set_title(title)

# Leave some horizontal room between the two panels, then render.
plt.subplots_adjust(wspace=0.4)
plt.show()


In [None]:
# Count missing values per column in each split and print both reports.
train_na = df_train.isna().sum()
print('Train Dataset - Missing Values:', train_na, sep='\n')

test_na = df_test.isna().sum()
print('\nTest Dataset - Missing Values:', test_na, sep='\n')

In [None]:
# Drop every row that has a missing value in any column, then renumber the
# rows. Both splits are re-indexed (previously only the train split was), so
# neither frame is left with gaps in its index after the drop.
df_train = df_train.dropna().reset_index(drop=True)
df_test = df_test.dropna().reset_index(drop=True)

In [None]:
# Re-run the missing-value report after the drop, then show each split's shape.
train_na = df_train.isna().sum()
test_na = df_test.isna().sum()

print('Train Dataset - Missing Values:')
print(train_na)
print('\nTest Dataset - Missing Values:')
print(test_na)

train_shape, test_shape = df_train.shape, df_test.shape
print("Train Dataset Size:", train_shape)
print("Test Dataset Size:", test_shape)

In [None]:
# Remove rows whose label is not one of the three sentiment classes.
# Show the raw label values first so the junk entries are visible.
print(df_train['label'].unique())
print(df_test['label'].unique())

valid_labels = ['0', '1', '2']


def _keep_valid_labels(frame, allowed):
    """Return a copy of *frame* keeping only rows whose label is in *allowed*.

    Labels are compared by their string form, so a value parsed as the
    integer 1 and one parsed as the string '1' are both kept. The returned
    frame's 'label' column is a real integer column.
    """
    label_text = frame['label'].astype(str)
    keep = label_text.isin(allowed)
    # .copy() gives an independent frame, so the assignment below does not
    # write into a slice of the original (no SettingWithCopyWarning).
    kept = frame.loc[keep].copy()
    # Replace the whole column rather than assigning through .loc[:, ...]:
    # on pandas >= 2.0 the .loc form writes in place and keeps the old
    # object dtype, which torch.tensor() cannot convert later.
    kept['label'] = label_text[keep].astype(int).to_numpy()
    return kept


# Apply the same cleaning to both splits.
df_train = _keep_valid_labels(df_train, valid_labels)
df_test = _keep_valid_labels(df_test, valid_labels)


In [None]:
# Show the cleaned label column of each split (train first, then test).
for split_frame in (df_train, df_test):
    print(split_frame['label'])

In [None]:
# Encoder object that the tokenization cells use on the 'label' column.
label_encoder = LabelEncoder()

In [None]:
# Load the pretrained tokenizer for the multilingual cased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
# Round-trip the text column through a plain Python list and store it back.
text_values = df_train['text'].tolist()
df_train['text'] = text_values

# Show the column's container type and its first few entries.
text_column = df_train['text']
print(type(text_column))
print(text_column.head())

In [None]:
# tokenizing training data
train_tokens = tokenizer.batch_encode_plus(
    df_train['text'].tolist(),
    padding = True,
    truncation = True,
    max_length = 512,
    return_tensors = 'pt'
)

train_labels = label_encoder.fit_transform(df_train['label'])


In [None]:
# Inspect the encoded training labels.
print(train_labels)

In [None]:
# tokenizing test data 
test_tokens = tokenizer.batch_encode_plus(
    df_test['text'].tolist(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

test_labels = label_encoder.fit_transform(df_test['label'])

In [None]:
# Inspect the encoded test labels.
print(test_labels)

In [None]:
# Convert input sequences to tensors
train_input_ids = torch.tensor(train_tokens['input_ids'])
train_attention_masks = torch.tensor(train_tokens['attention_mask'])
train_labels = torch.tensor(df_train['label'].values)

test_input_ids = torch.tensor(test_tokens['input_ids'])
test_attention_masks = torch.tensor(test_tokens['attention_mask'])
test_labels = torch.tensor(df_test['label'].values)

# labels
train_labels = torch.tensor(df_train['label'].values)
test_labels = torch.tensor(df_test['label'].values)


# Create a TensorDataset
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

In [None]:
# Sanity check: display the type of the training dataset as the cell output.
type(train_dataset)

In [None]:
# Load multilingual cased BERT with a sequence-classification head that has
# three output classes (one per sentiment label).
# NOTE(review): the random seed is only set in the training cell, after this
# load — presumably the head's initial weights are not covered by it; confirm.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

In [None]:
# Setting device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# Fine-tuning hyperparameters.
batch_size = 16
learning_rate = 2e-5
epochs = 5

# Draw training batches in random order.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)

# Use PyTorch's own AdamW: the copy shipped in transformers is deprecated in
# favour of torch.optim.AdamW. weight_decay=0.0 is passed explicitly because
# that was the transformers default, whereas torch.optim.AdamW defaults to
# 0.01 — this keeps the optimisation behaviour unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.0)

In [None]:
from tqdm import tqdm

# Seed the torch (CPU and all CUDA devices) and NumPy generators so that
# batch shuffling and dropout are repeatable from this point on.
seed = 42
for seed_everything in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed):
    seed_everything(seed)

# Put the model in training mode once, before the epoch loop.
model.train()

for epoch in range(epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}')

    for batch in progress_bar:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # to the training device.
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        # Passing the labels makes the model return the loss alongside
        # its other outputs.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        progress_bar.set_postfix({'Training Loss': batch_loss})

    # Mean per-batch loss over the epoch.
    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}: Average Loss: {average_loss:.4f}')

In [None]:
# Saving the trained model
output_dir = './trained_model'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)

print('Training completed!')

In [None]:
# Measure classification accuracy on the test split.
model.eval()  # Set the model in evaluation mode

correct = 0
total = 0

# Accuracy does not depend on batch order, so the test set is read in its
# stored order (DataLoader's default) instead of through a random sampler;
# this also keeps the running accuracy in the progress bar repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

with torch.no_grad():  # Disable gradient calculations during evaluation
    progress_bar = tqdm(test_dataloader, desc="Evaluating")
    for batch in progress_bar:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # to the device the model lives on.
        batch_input_ids, batch_attention_masks, batch_labels = (
            t.to(device) for t in batch
        )

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        predicted_labels = logits.argmax(dim=1)

        total += batch_labels.size(0)
        correct += (predicted_labels == batch_labels).sum().item()

        # Running accuracy over the batches seen so far (display only).
        progress_bar.set_postfix({'Accuracy': f'{correct / total * 100:.2f}%'})

# Compute the final figure once, after the loop. An empty test set used to
# leave `accuracy` undefined and crash the print with a NameError.
if total:
    accuracy = (correct / total) * 100
    print(f"Accuracy: {accuracy:.2f}%")
else:
    accuracy = float('nan')
    print("Accuracy: n/a (test set is empty)")


In [None]:
# SAVING MODEL
model.save_pretrained("my-model")

# Download
import shutil

model_directory = "my-model"

# Zip the model directory
shutil.make_archive("my-model", 'zip', model_directory)