In [None]:
# Mount Google Drive at /content/drive; the model and dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers



In [None]:
import torch
import re
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

# Build the sequence-classification model and its tokenizer from the public
# 'bert-base-multilingual-cased' checkpoint; the fine-tuned weights are applied below.
model_path = "/content/drive/MyDrive/hudai/mbert_trained_model.pth"  # Replace with the path to your .pth file
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Load your fine-tuned model weights.
# map_location maps the stored tensors onto the CPU at load time, so loading
# does not require a GPU; the model is moved to the chosen device afterwards.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it
# -- TODO confirm).
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
# Switch to evaluation mode; the model is only used for inference below.
model.eval()

# Check if CUDA is available and use it if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to preprocess a word
def preprocess_word(word):
    """Normalise a single token before it is passed to the classifier.

    The token is lower-cased, every character that is neither an ASCII
    letter nor whitespace is dropped, surrounding whitespace is trimmed,
    and finally any remaining non-ASCII character (for example a
    non-breaking space that the whitespace class let through) is removed.

    Returns the cleaned string, which is empty when nothing usable remains.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', word.lower()).strip()
    return re.sub(r'[^\x00-\x7f]', '', letters_only)

# Function to predict the class of a list of sentences based on the share of "benglish" words
def predict_class(sentences):
    """Classify a group of sentences by the share of words the model labels as class 1.

    Each sentence is split on whitespace and every word is cleaned with
    ``preprocess_word``; words that end up empty are ignored. Each remaining
    word is tokenized and run through the model on its own, and a word counts
    as "benglish" when the arg-max over the logits is class 1.

    Args:
        sentences: iterable of strings.

    Returns:
        0 when fewer than 4 usable words were found in total; otherwise 1 if
        at least 30% of the usable words were classified as class 1, else 0.
    """
    usable_words = 0
    benglish_hits = 0

    for text in sentences:
        for token in text.split():
            cleaned = preprocess_word(token)
            if cleaned:
                usable_words += 1
                encoded = tokenizer(cleaned, return_tensors='pt').to(device)

                # Inference only: no gradients are needed.
                with torch.no_grad():
                    logits = model(**encoded).logits

                if torch.argmax(logits, dim=1).item() == 1:
                    benglish_hits += 1

    # Too little text to make a call.
    if usable_words < 4:
        return 0

    return int(benglish_hits / usable_words >= 0.3)

# Read the CSV to be labelled from Google Drive.
dataset_path = "/content/drive/MyDrive/hudai/hudai.csv"  # Replace with the path to your dataset
df = pd.read_csv(dataset_path)

# Label every row that has text in 'sentence1'; rows with a missing value are skipped.
for index, sentence in df['sentence1'].items():
    if pd.isnull(sentence):
        continue
    # Split the text at full stops and classify the resulting pieces together.
    df.at[index, 'label'] = predict_class(sentence.split('.'))

# Write the labelled frame over the original CSV, without the index column.
df.to_csv(dataset_path, index=False)
