In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### indicbert + bert

original

In [4]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/training_data_telugu-hate.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained BERT and IndicBERT models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_bert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_bert['input_ids']),
                                                    torch.tensor(train_encodings_bert['attention_mask']),
                                                    train_labels)
test_dataset_bert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_bert['input_ids']),
                                                   torch.tensor(test_encodings_bert['attention_mask']),
                                                   test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_bert = torch.utils.data.DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = torch.utils.data.DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_bert = AdamW(bert_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for BERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (BERT)")
    bert_model.train()
    for batch in train_loader_bert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_bert.zero_grad()
        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_bert.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
bert_model.eval()
indicbert_model.eval()
predictions_bert = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_bert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_bert.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_bert, pred_indicbert in zip(predictions_bert, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_bert + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.725
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.68      0.73       430
           1       0.68      0.78      0.72       370

    accuracy                           0.73       800
   macro avg       0.73      0.73      0.72       800
weighted avg       0.73      0.72      0.73       800



translated

In [5]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/Hos_train_translated.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['comments '].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained BERT and IndicBERT models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_bert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_bert['input_ids']),
                                                    torch.tensor(train_encodings_bert['attention_mask']),
                                                    train_labels)
test_dataset_bert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_bert['input_ids']),
                                                   torch.tensor(test_encodings_bert['attention_mask']),
                                                   test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_bert = torch.utils.data.DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = torch.utils.data.DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_bert = AdamW(bert_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for BERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (BERT)")
    bert_model.train()
    for batch in train_loader_bert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_bert.zero_grad()
        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_bert.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
bert_model.eval()
indicbert_model.eval()
predictions_bert = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_bert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_bert.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_bert, pred_indicbert in zip(predictions_bert, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_bert + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.71375
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.61      0.70       430
           1       0.65      0.84      0.73       370

    accuracy                           0.71       800
   macro avg       0.73      0.72      0.71       800
weighted avg       0.74      0.71      0.71       800



transliterated

In [7]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/transliteration_4000.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained BERT and IndicBERT models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_bert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_bert['input_ids']),
                                                    torch.tensor(train_encodings_bert['attention_mask']),
                                                    train_labels)
test_dataset_bert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_bert['input_ids']),
                                                   torch.tensor(test_encodings_bert['attention_mask']),
                                                   test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_bert = torch.utils.data.DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = torch.utils.data.DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_bert = AdamW(bert_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for BERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (BERT)")
    bert_model.train()
    for batch in train_loader_bert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_bert.zero_grad()
        outputs = bert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_bert.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
bert_model.eval()
indicbert_model.eval()
predictions_bert = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_bert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_bert.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_bert, pred_indicbert in zip(predictions_bert, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_bert + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.4625
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       430
           1       0.46      1.00      0.63       370

    accuracy                           0.46       800
   macro avg       0.23      0.50      0.32       800
weighted avg       0.21      0.46      0.29       800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### indicbert + Distilbert

original

In [8]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/training_data_telugu-hate.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained DistilBERT and IndicBERT models and tokenizers
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_distilbert = distilbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_distilbert = distilbert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_distilbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_distilbert['input_ids']),
                                                          torch.tensor(train_encodings_distilbert['attention_mask']),
                                                          train_labels)
test_dataset_distilbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_distilbert['input_ids']),
                                                         torch.tensor(test_encodings_distilbert['attention_mask']),
                                                         test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_distilbert = torch.utils.data.DataLoader(train_dataset_distilbert, batch_size=8, shuffle=True)
test_loader_distilbert = torch.utils.data.DataLoader(test_dataset_distilbert, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
distilbert_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for DistilBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (DistilBERT)")
    distilbert_model.train()
    for batch in train_loader_distilbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_distilbert.zero_grad()
        outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_distilbert.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
distilbert_model.eval()
indicbert_model.eval()
predictions_distilbert = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_distilbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_distilbert.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_distilbert, pred_indicbert in zip(predictions_distilbert, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_distilbert + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (DistilBERT)
Epoch 2/10 (DistilBERT)
Epoch 3/10 (DistilBERT)
Epoch 4/10 (DistilBERT)
Epoch 5/10 (DistilBERT)
Epoch 6/10 (DistilBERT)
Epoch 7/10 (DistilBERT)
Epoch 8/10 (DistilBERT)
Epoch 9/10 (DistilBERT)
Epoch 10/10 (DistilBERT)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.70375
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.66      0.70       430
           1       0.66      0.76      0.70       370

    accuracy                           0.70       800
   macro avg       0.71      0.71      0.70       800
weighted avg       0.71      0.70      0.70       800



translated

In [9]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/Hos_train_translated.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['comments '].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained DistilBERT and IndicBERT models and tokenizers
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_distilbert = distilbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_distilbert = distilbert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_distilbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_distilbert['input_ids']),
                                                          torch.tensor(train_encodings_distilbert['attention_mask']),
                                                          train_labels)
test_dataset_distilbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_distilbert['input_ids']),
                                                         torch.tensor(test_encodings_distilbert['attention_mask']),
                                                         test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_distilbert = torch.utils.data.DataLoader(train_dataset_distilbert, batch_size=8, shuffle=True)
test_loader_distilbert = torch.utils.data.DataLoader(test_dataset_distilbert, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
distilbert_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for DistilBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (DistilBERT)")
    distilbert_model.train()
    for batch in train_loader_distilbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_distilbert.zero_grad()
        outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_distilbert.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
distilbert_model.eval()
indicbert_model.eval()
predictions_distilbert = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_distilbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_distilbert.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_distilbert, pred_indicbert in zip(predictions_distilbert, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_distilbert + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (DistilBERT)
Epoch 2/10 (DistilBERT)
Epoch 3/10 (DistilBERT)
Epoch 4/10 (DistilBERT)
Epoch 5/10 (DistilBERT)
Epoch 6/10 (DistilBERT)
Epoch 7/10 (DistilBERT)
Epoch 8/10 (DistilBERT)
Epoch 9/10 (DistilBERT)
Epoch 10/10 (DistilBERT)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.65      0.72       430
           1       0.67      0.82      0.74       370

    accuracy                           0.73       800
   macro avg       0.74      0.74      0.73       800
weighted avg       0.74      0.73      0.73       800



transliterated

In [10]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/transliteration_4000.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained DistilBERT and IndicBERT models and tokenizers
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_distilbert = distilbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_distilbert = distilbert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_distilbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_distilbert['input_ids']),
                                                          torch.tensor(train_encodings_distilbert['attention_mask']),
                                                          train_labels)
test_dataset_distilbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_distilbert['input_ids']),
                                                         torch.tensor(test_encodings_distilbert['attention_mask']),
                                                         test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_distilbert = torch.utils.data.DataLoader(train_dataset_distilbert, batch_size=8, shuffle=True)
test_loader_distilbert = torch.utils.data.DataLoader(test_dataset_distilbert, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
distilbert_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for DistilBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (DistilBERT)")
    distilbert_model.train()
    for batch in train_loader_distilbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_distilbert.zero_grad()
        outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_distilbert.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
distilbert_model.eval()
indicbert_model.eval()
predictions_distilbert = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_distilbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_distilbert.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_distilbert, pred_indicbert in zip(predictions_distilbert, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_distilbert + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (DistilBERT)
Epoch 2/10 (DistilBERT)
Epoch 3/10 (DistilBERT)
Epoch 4/10 (DistilBERT)
Epoch 5/10 (DistilBERT)
Epoch 6/10 (DistilBERT)
Epoch 7/10 (DistilBERT)
Epoch 8/10 (DistilBERT)
Epoch 9/10 (DistilBERT)
Epoch 10/10 (DistilBERT)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.68      0.74       430
           1       0.69      0.81      0.74       370

    accuracy                           0.74       800
   macro avg       0.74      0.74      0.74       800
weighted avg       0.75      0.74      0.74       800



### indicbert + LaBSE


original

In [11]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/training_data_telugu-hate.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained LaBSE and IndicBERT models and tokenizers
labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
labse_model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/LaBSE", num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_labse = labse_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_labse = labse_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_labse = torch.utils.data.TensorDataset(torch.tensor(train_encodings_labse['input_ids']),
                                                     torch.tensor(train_encodings_labse['attention_mask']),
                                                     train_labels)
test_dataset_labse = torch.utils.data.TensorDataset(torch.tensor(test_encodings_labse['input_ids']),
                                                    torch.tensor(test_encodings_labse['attention_mask']),
                                                    test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_labse = torch.utils.data.DataLoader(train_dataset_labse, batch_size=8, shuffle=True)
test_loader_labse = torch.utils.data.DataLoader(test_dataset_labse, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
labse_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_labse = AdamW(labse_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for LaBSE
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (LaBSE)")
    labse_model.train()
    for batch in train_loader_labse:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_labse.zero_grad()
        outputs = labse_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_labse.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
labse_model.eval()
indicbert_model.eval()
predictions_labse = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_labse:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = labse_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_labse.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_labse, pred_indicbert in zip(predictions_labse, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_labse + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (LaBSE)
Epoch 2/10 (LaBSE)
Epoch 3/10 (LaBSE)
Epoch 4/10 (LaBSE)
Epoch 5/10 (LaBSE)
Epoch 6/10 (LaBSE)
Epoch 7/10 (LaBSE)
Epoch 8/10 (LaBSE)
Epoch 9/10 (LaBSE)
Epoch 10/10 (LaBSE)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.69125
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.53      0.65       430
           1       0.62      0.88      0.73       370

    accuracy                           0.69       800
   macro avg       0.73      0.70      0.69       800
weighted avg       0.74      0.69      0.68       800



translated

In [12]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/Hos_train_translated.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['comments '].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained LaBSE and IndicBERT models and tokenizers
labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
labse_model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/LaBSE", num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_labse = labse_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_labse = labse_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_labse = torch.utils.data.TensorDataset(torch.tensor(train_encodings_labse['input_ids']),
                                                     torch.tensor(train_encodings_labse['attention_mask']),
                                                     train_labels)
test_dataset_labse = torch.utils.data.TensorDataset(torch.tensor(test_encodings_labse['input_ids']),
                                                    torch.tensor(test_encodings_labse['attention_mask']),
                                                    test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_labse = torch.utils.data.DataLoader(train_dataset_labse, batch_size=8, shuffle=True)
test_loader_labse = torch.utils.data.DataLoader(test_dataset_labse, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
labse_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_labse = AdamW(labse_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for LaBSE
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (LaBSE)")
    labse_model.train()
    for batch in train_loader_labse:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_labse.zero_grad()
        outputs = labse_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_labse.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
labse_model.eval()
indicbert_model.eval()
predictions_labse = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_labse:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = labse_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_labse.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_labse, pred_indicbert in zip(predictions_labse, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_labse + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (LaBSE)
Epoch 2/10 (LaBSE)
Epoch 3/10 (LaBSE)
Epoch 4/10 (LaBSE)
Epoch 5/10 (LaBSE)
Epoch 6/10 (LaBSE)
Epoch 7/10 (LaBSE)
Epoch 8/10 (LaBSE)
Epoch 9/10 (LaBSE)
Epoch 10/10 (LaBSE)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.7275
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.58      0.70       430
           1       0.65      0.90      0.75       370

    accuracy                           0.73       800
   macro avg       0.76      0.74      0.72       800
weighted avg       0.77      0.73      0.72       800



transliterated

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/transliteration_4000.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained LaBSE and IndicBERT models and tokenizers
labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
labse_model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/LaBSE", num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_labse = labse_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_labse = labse_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_labse = torch.utils.data.TensorDataset(torch.tensor(train_encodings_labse['input_ids']),
                                                     torch.tensor(train_encodings_labse['attention_mask']),
                                                     train_labels)
test_dataset_labse = torch.utils.data.TensorDataset(torch.tensor(test_encodings_labse['input_ids']),
                                                    torch.tensor(test_encodings_labse['attention_mask']),
                                                    test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_labse = torch.utils.data.DataLoader(train_dataset_labse, batch_size=8, shuffle=True)
test_loader_labse = torch.utils.data.DataLoader(test_dataset_labse, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
labse_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_labse = AdamW(labse_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for LaBSE
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (LaBSE)")
    labse_model.train()
    for batch in train_loader_labse:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_labse.zero_grad()
        outputs = labse_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_labse.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
labse_model.eval()
indicbert_model.eval()
predictions_labse = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_labse:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = labse_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_labse.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_labse, pred_indicbert in zip(predictions_labse, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_labse + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (LaBSE)
Epoch 2/10 (LaBSE)
Epoch 3/10 (LaBSE)
Epoch 4/10 (LaBSE)
Epoch 5/10 (LaBSE)
Epoch 6/10 (LaBSE)
Epoch 7/10 (LaBSE)
Epoch 8/10 (LaBSE)
Epoch 9/10 (LaBSE)
Epoch 10/10 (LaBSE)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.70125
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.57      0.67       430
           1       0.63      0.86      0.73       370

    accuracy                           0.70       800
   macro avg       0.73      0.71      0.70       800
weighted avg       0.73      0.70      0.70       800



### indicbert + muril

original

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/training_data_telugu-hate.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained MuRIL and IndicBERT models and tokenizers
muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_muril = muril_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_muril = muril_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_muril = torch.utils.data.TensorDataset(torch.tensor(train_encodings_muril['input_ids']),
                                                     torch.tensor(train_encodings_muril['attention_mask']),
                                                     train_labels)
test_dataset_muril = torch.utils.data.TensorDataset(torch.tensor(test_encodings_muril['input_ids']),
                                                    torch.tensor(test_encodings_muril['attention_mask']),
                                                    test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_muril = torch.utils.data.DataLoader(train_dataset_muril, batch_size=8, shuffle=True)
test_loader_muril = torch.utils.data.DataLoader(test_dataset_muril, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
muril_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_muril = AdamW(muril_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for MuRIL
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (MuRIL)")
    muril_model.train()
    for batch in train_loader_muril:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_muril.zero_grad()
        outputs = muril_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_muril.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
muril_model.eval()
indicbert_model.eval()
predictions_muril = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_muril:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = muril_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_muril.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_muril, pred_indicbert in zip(predictions_muril, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_muril + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (MuRIL)
Epoch 2/10 (MuRIL)
Epoch 3/10 (MuRIL)
Epoch 4/10 (MuRIL)
Epoch 5/10 (MuRIL)
Epoch 6/10 (MuRIL)
Epoch 7/10 (MuRIL)
Epoch 8/10 (MuRIL)
Epoch 9/10 (MuRIL)
Epoch 10/10 (MuRIL)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.70125
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.53      0.66       430
           1       0.62      0.90      0.74       370

    accuracy                           0.70       800
   macro avg       0.74      0.72      0.70       800
weighted avg       0.75      0.70      0.69       800



translated

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/Hos_train_translated.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['comments '].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained MuRIL and IndicBERT models and tokenizers
muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_muril = muril_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_muril = muril_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_muril = torch.utils.data.TensorDataset(torch.tensor(train_encodings_muril['input_ids']),
                                                     torch.tensor(train_encodings_muril['attention_mask']),
                                                     train_labels)
test_dataset_muril = torch.utils.data.TensorDataset(torch.tensor(test_encodings_muril['input_ids']),
                                                    torch.tensor(test_encodings_muril['attention_mask']),
                                                    test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_muril = torch.utils.data.DataLoader(train_dataset_muril, batch_size=8, shuffle=True)
test_loader_muril = torch.utils.data.DataLoader(test_dataset_muril, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
muril_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_muril = AdamW(muril_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for MuRIL
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (MuRIL)")
    muril_model.train()
    for batch in train_loader_muril:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_muril.zero_grad()
        outputs = muril_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_muril.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
muril_model.eval()
indicbert_model.eval()
predictions_muril = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_muril:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = muril_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_muril.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_muril, pred_indicbert in zip(predictions_muril, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_muril + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (MuRIL)
Epoch 2/10 (MuRIL)
Epoch 3/10 (MuRIL)
Epoch 4/10 (MuRIL)
Epoch 5/10 (MuRIL)
Epoch 6/10 (MuRIL)
Epoch 7/10 (MuRIL)
Epoch 8/10 (MuRIL)
Epoch 9/10 (MuRIL)
Epoch 10/10 (MuRIL)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.71875
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.57      0.68       430
           1       0.64      0.90      0.75       370

    accuracy                           0.72       800
   macro avg       0.75      0.73      0.72       800
weighted avg       0.76      0.72      0.71       800



transliterated

In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugu-data/transliteration_4000.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained MuRIL and IndicBERT models and tokenizers
muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

# Tokenize inputs
train_encodings_muril = muril_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_muril = muril_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_indicbert = indicbert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_indicbert = indicbert_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets
train_dataset_muril = torch.utils.data.TensorDataset(torch.tensor(train_encodings_muril['input_ids']),
                                                     torch.tensor(train_encodings_muril['attention_mask']),
                                                     train_labels)
test_dataset_muril = torch.utils.data.TensorDataset(torch.tensor(test_encodings_muril['input_ids']),
                                                    torch.tensor(test_encodings_muril['attention_mask']),
                                                    test_labels)

train_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(train_encodings_indicbert['input_ids']),
                                                         torch.tensor(train_encodings_indicbert['attention_mask']),
                                                         train_labels)
test_dataset_indicbert = torch.utils.data.TensorDataset(torch.tensor(test_encodings_indicbert['input_ids']),
                                                        torch.tensor(test_encodings_indicbert['attention_mask']),
                                                        test_labels)

# Create data loaders
train_loader_muril = torch.utils.data.DataLoader(train_dataset_muril, batch_size=8, shuffle=True)
test_loader_muril = torch.utils.data.DataLoader(test_dataset_muril, batch_size=8, shuffle=False)

train_loader_indicbert = torch.utils.data.DataLoader(train_dataset_indicbert, batch_size=8, shuffle=True)
test_loader_indicbert = torch.utils.data.DataLoader(test_dataset_indicbert, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
muril_model.to(device)
indicbert_model.to(device)

# Set optimizers and learning rate schedulers
optimizer_muril = AdamW(muril_model.parameters(), lr=1e-5)
optimizer_indicbert = AdamW(indicbert_model.parameters(), lr=1e-5)
num_epochs = 10

# Training loop for MuRIL
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (MuRIL)")
    muril_model.train()
    for batch in train_loader_muril:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_muril.zero_grad()
        outputs = muril_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_muril.step()

# Training loop for IndicBERT
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} (IndicBERT)")
    indicbert_model.train()
    for batch in train_loader_indicbert:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer_indicbert.zero_grad()
        outputs = indicbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_indicbert.step()

# Evaluation
muril_model.eval()
indicbert_model.eval()
predictions_muril = []
predictions_indicbert = []
true_labels = []
for batch in test_loader_muril:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = muril_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_muril.extend(preds.cpu().numpy())

for batch in test_loader_indicbert:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    
    with torch.no_grad():
        outputs = indicbert_model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    
    predictions_indicbert.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Combine predictions from both models
final_predictions = []
for pred_muril, pred_indicbert in zip(predictions_muril, predictions_indicbert):
    # Simple voting scheme, you can choose a different method for combining predictions
    combined_prediction = 1 if (pred_muril + pred_indicbert) >= 1 else 0
    final_predictions.append(combined_prediction)

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 (MuRIL)
Epoch 2/10 (MuRIL)
Epoch 3/10 (MuRIL)
Epoch 4/10 (MuRIL)
Epoch 5/10 (MuRIL)
Epoch 6/10 (MuRIL)
Epoch 7/10 (MuRIL)
Epoch 8/10 (MuRIL)
Epoch 9/10 (MuRIL)
Epoch 10/10 (MuRIL)
Epoch 1/10 (IndicBERT)
Epoch 2/10 (IndicBERT)
Epoch 3/10 (IndicBERT)
Epoch 4/10 (IndicBERT)
Epoch 5/10 (IndicBERT)
Epoch 6/10 (IndicBERT)
Epoch 7/10 (IndicBERT)
Epoch 8/10 (IndicBERT)
Epoch 9/10 (IndicBERT)
Epoch 10/10 (IndicBERT)
Accuracy: 0.65125
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.42      0.56       430
           1       0.58      0.92      0.71       370

    accuracy                           0.65       800
   macro avg       0.72      0.67      0.64       800
weighted avg       0.73      0.65      0.63       800

