Install Required Libraries

In [None]:
%pip install transformers datasets torch pandas scikit-learn




Loading and Preprocessing our Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer,logging
logging.set_verbosity_error()  # Suppress logs

# Load CSV file
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/translated_4000_reviews.csv")

# Check for missing values
df = df.dropna()
df['label'] = df['label'].apply(lambda x: 1 if x == 'OR' else 0)
df.head()

# Convert ratings to string and concatenate with review text
df["combined_text"] = df["rating"].astype(str) + " [SEP] " + df["Nepali Review"]


# Splitting dataset into train & test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["combined_text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=1)

# Load NepBERT tokenizer
model_name = "NepBERTa/NepBERTa"  # Change if you're using another Nepali BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the text
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Convert Data to PyTorch Dataset

In [3]:
import torch

class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Convert to PyTorch dataset
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)


Loading Pre-trained NepBERT Model

In [4]:
from transformers import AutoModelForSequenceClassification

# Load NepBERT with 2 output labels (Fake vs. Real)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, from_tf=True)


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


Model Training

In [7]:
import os
from transformers import Trainer, TrainingArguments

# Disable W&B if not using it
os.environ["WANDB_DISABLED"] = "true"

# Define training parameters
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Updated from evaluation_strategy
    save_strategy="epoch",
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
    # gradient_accumulation_steps=4,  # Accumulates gradients over 4 steps
)


# Use Trainer API to fine-tune NepBERT
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,0.779361
2,0.206400,0.557385
3,0.206400,0.700805
4,0.069600,0.897976
5,0.033300,0.846605


TrainOutput(global_step=1600, training_loss=0.09680771628394723, metrics={'train_runtime': 233.9822, 'train_samples_per_second': 68.381, 'train_steps_per_second': 6.838, 'total_flos': 1052444221440000.0, 'train_loss': 0.09680771628394723, 'epoch': 5.0})

Model Evaluation

In [8]:
from sklearn.metrics import accuracy_score

# Get predictions
preds = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(preds.predictions), axis=1).numpy()

# Compute accuracy
accuracy = accuracy_score(test_labels, pred_labels)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.87


**Save and Test the Model**

Using "save_pretrained" which is the preferred Hugging Face's method

In [9]:
# Save model using Hugging Face's method
model.save_pretrained("./fine_tuned_nepbert")

# Load model using Hugging Face's method
# from transformers import BertForSequenceClassification, BertTokenizer
# model = BertForSequenceClassification.from_pretrained("./fine_tuned_nepbert")
# tokenizer = BertTokenizer.from_pretrained("./fine_tuned_nepbert")


Using pickle

In [10]:
import pickle

# Save the model to a pickle file
with open("nepbert_model.pkl", "wb") as f:
    pickle.dump(model, f)


Checking Model output

In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./fine_tuned_nepbert", tokenizer=tokenizer)
# or
# Load model using Hugging Face's method
# from transformers import BertForSequenceClassification, BertTokenizer
# model = BertForSequenceClassification.from_pretrained("./fine_tuned_nepbert")
# tokenizer = BertTokenizer.from_pretrained("./fine_tuned_nepbert")


Device set to use cuda:0


In [12]:
# Example rating and review
rating = 1
review = "यो होटल ठगि हो।"  # Example Nepali review

# Concatenate rating with review (same format as training)
input_text = f"{rating} [SEP] {review}"

# Get prediction
result = classifier(input_text)

# print(result) #gives following o/p
# [{'label': 'LABEL_0', 'score': 0.9900834560394287}]

# Extract label and confidence score
label = result[0]['label']
score = result[0]['score'] * 100  # Convert to percentage

# Interpret the label (assuming LABEL_0 = Fake, LABEL_1 = Real)
label_text = "Fake" if label == "LABEL_0" else "Real"

# Display output
print(f"Review is {label_text} with {score:.2f}% accuracy.")


Review is Fake with 99.84% accuracy.


In [13]:
# Example rating and review
rating = 5
review = "यसले मेरो फोनलाई चार्जर पूर्ण रूपमा चार्ज भएपछि मात्र चार्ज गर्न अनुमति दिन्छ। यसको एक मात्र नकारात्मक पक्ष भनेको जब तपाईं"

# Concatenate rating with review (same format as training)
input_text = f"{rating} [SEP] {review}"

# Get prediction
result = classifier(input_text)

# Extract label and confidence score
label = result[0]['label']
score = result[0]['score'] * 100  # Convert to percentage

# Interpret the label (assuming LABEL_0 = Fake, LABEL_1 = Real)
label_text = "Fake" if label == "LABEL_0" else "Real"

# Display output
print(f"Review is {label_text} with {score:.2f}% accuracy.")


Review is Fake with 99.98% accuracy.
