In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hos-original-bert/training_data_telugu-hate.csv

/kaggle/input/transliteration/transliteration_4000.csv

/kaggle/input/translated/Hos_train_translated.csv


In [1]:
  pip install transformers


















Note: you may need to restart the kernel to use updated packages.


### bert+distilbert

In [4]:
# Fine-tune BERT and DistilBERT on the original comments and evaluate an
# OR-rule ensemble of the two classifiers on a held-out 20% split.
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# transformers' own AdamW was deprecated and is no longer importable from
# recent releases; the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)

# Seed torch so classifier-head initialisation and batch shuffling repeat across runs.
SEED = 42
BATCH_SIZE = 8
torch.manual_seed(SEED)

# Load the CSV. The bare file name only resolves when the file sits in the
# working directory; on Kaggle it is mounted under /kaggle/input (see the
# listing printed by the first cell). If neither exists, read_csv reports the
# first candidate.
data_candidates = (
    'training_data_telugu-hate.csv',
    '/kaggle/input/hos-original-bert/training_data_telugu-hate.csv',
)
data_path = next((p for p in data_candidates if os.path.exists(p)), data_candidates[0])
df = pd.read_csv(data_path)

# Map string labels to integers, failing loudly on anything unexpected:
# Series.map would otherwise turn unknown values into NaN without warning.
label_map = {'hate': 1, 'non-hate': 0}
mapped_labels = df['Label'].map(label_map)
if mapped_labels.isna().any():
    unexpected = sorted(df.loc[mapped_labels.isna(), 'Label'].astype(str).unique())
    raise ValueError(f"Unmapped label values in {data_path}: {unexpected}")
df['Label'] = mapped_labels.astype(int)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and DistilBERT models and tokenizers.
# NOTE(review): both are English checkpoints; if 'Comments' holds native-script
# Telugu their vocabularies will presumably cover it poorly — confirm the
# script of the data and consider a multilingual checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize inputs (each split is padded to its own longest sequence)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

train_encodings_distilbert = distilbert_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_distilbert = distilbert_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets of (input_ids, attention_mask, label) triples
train_dataset_bert = TensorDataset(
    train_encodings_bert['input_ids'], train_encodings_bert['attention_mask'], train_labels
)
test_dataset_bert = TensorDataset(
    test_encodings_bert['input_ids'], test_encodings_bert['attention_mask'], test_labels
)

train_dataset_distilbert = TensorDataset(
    train_encodings_distilbert['input_ids'], train_encodings_distilbert['attention_mask'], train_labels
)
test_dataset_distilbert = TensorDataset(
    test_encodings_distilbert['input_ids'], test_encodings_distilbert['attention_mask'], test_labels
)

# Create data loaders. The test loaders must stay unshuffled so predictions
# line up row-for-row with test_labels.
train_loader_bert = DataLoader(train_dataset_bert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=BATCH_SIZE, shuffle=False)

train_loader_distilbert = DataLoader(train_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_distilbert = DataLoader(test_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
distilbert_model.to(device)

# Set optimizers (no learning-rate scheduler is used). eps and weight_decay are
# spelled out to match the defaults of the legacy transformers.AdamW, which
# differ from torch.optim.AdamW's own defaults.
optimizer_bert = AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: BERT is trained for all epochs first, then DistilBERT.
training_runs = (
    ('BERT', bert_model, train_loader_bert, optimizer_bert),
    ('DistilBERT', distilbert_model, train_loader_distilbert, optimizer_distilbert),
)
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        # batch_labels (not `labels`) so the full label list above is not clobbered
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect each model's argmax class per test row.
bert_model.eval()
distilbert_model.eval()
evaluation_runs = (
    ('BERT', bert_model, test_loader_bert),
    ('DistilBERT', distilbert_model, test_loader_distilbert),
)
predictions_by_model = {}
with torch.no_grad():
    for model_name, model, test_loader in evaluation_runs:
        model_predictions = []
        for input_ids, attention_mask, _ in test_loader:
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
            model_predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
        predictions_by_model[model_name] = model_predictions

predictions_bert = predictions_by_model['BERT']
predictions_distilbert = predictions_by_model['DistilBERT']
# Ground truth comes straight from the split instead of from one particular loader.
true_labels = test_labels.tolist()

# zip() would silently truncate on a length mismatch, so check up front.
assert len(predictions_bert) == len(predictions_distilbert) == len(true_labels)

# Combine predictions from both models.
# This is an OR rule, not a majority vote: a comment is labelled hate (1) as
# soon as either model predicts hate, which favours the hate class.
final_predictions = [
    1 if (pred_bert + pred_distilbert) >= 1 else 0
    for pred_bert, pred_distilbert in zip(predictions_bert, predictions_distilbert)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch 1/10 (BERT)

Epoch 2/10 (BERT)

Epoch 3/10 (BERT)

Epoch 4/10 (BERT)

Epoch 5/10 (BERT)

Epoch 6/10 (BERT)

Epoch 7/10 (BERT)

Epoch 8/10 (BERT)

Epoch 9/10 (BERT)

Epoch 10/10 (BERT)

Epoch 1/10 (DistilBERT)

Epoch 2/10 (DistilBERT)

Epoch 3/10 (DistilBERT)

Epoch 4/10 (DistilBERT)

Epoch 5/10 (DistilBERT)

Epoch 6/10 (DistilBERT)

Epoch 7/10 (DistilBERT)

Epoch 8/10 (DistilBERT)

Epoch 9/10 (DistilBERT)

Epoch 10/10 (DistilBERT)

Accuracy: 0.7275

Classification Report:

              precision    recall  f1-score   support



           0       0.73      0.78      0.75       430

           1       0.72      0.67      0.69       370



    accuracy                           0.73       800

   macro avg       0.73      0.72      0.72       800

weighted avg       0.73      0.73      0.73       800




#### translated

In [8]:
# Fine-tune BERT and DistilBERT on the translated comments and evaluate an
# OR-rule ensemble of the two classifiers on a held-out 20% split.
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# transformers' own AdamW was deprecated and is no longer importable from
# recent releases; the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)

# Seed torch so classifier-head initialisation and batch shuffling repeat across runs.
SEED = 42
BATCH_SIZE = 8
torch.manual_seed(SEED)

# Load the CSV. The bare file name only resolves when the file sits in the
# working directory; on Kaggle it is mounted under /kaggle/input (see the
# listing printed by the first cell). If neither exists, read_csv reports the
# first candidate.
data_candidates = (
    'Hos_train_translated.csv',
    '/kaggle/input/translated/Hos_train_translated.csv',
)
data_path = next((p for p in data_candidates if os.path.exists(p)), data_candidates[0])
df = pd.read_csv(data_path)

# Map string labels to integers, failing loudly on anything unexpected:
# Series.map would otherwise turn unknown values into NaN without warning.
label_map = {'hate': 1, 'non-hate': 0}
mapped_labels = df['Label'].map(label_map)
if mapped_labels.isna().any():
    unexpected = sorted(df.loc[mapped_labels.isna(), 'Label'].astype(str).unique())
    raise ValueError(f"Unmapped label values in {data_path}: {unexpected}")
df['Label'] = mapped_labels.astype(int)

# Extract input texts and labels.
# The text column is addressed as 'comments ' — the trailing space is intentional.
texts = df['comments '].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and DistilBERT models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize inputs (each split is padded to its own longest sequence)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

train_encodings_distilbert = distilbert_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_distilbert = distilbert_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets of (input_ids, attention_mask, label) triples
train_dataset_bert = TensorDataset(
    train_encodings_bert['input_ids'], train_encodings_bert['attention_mask'], train_labels
)
test_dataset_bert = TensorDataset(
    test_encodings_bert['input_ids'], test_encodings_bert['attention_mask'], test_labels
)

train_dataset_distilbert = TensorDataset(
    train_encodings_distilbert['input_ids'], train_encodings_distilbert['attention_mask'], train_labels
)
test_dataset_distilbert = TensorDataset(
    test_encodings_distilbert['input_ids'], test_encodings_distilbert['attention_mask'], test_labels
)

# Create data loaders. The test loaders must stay unshuffled so predictions
# line up row-for-row with test_labels.
train_loader_bert = DataLoader(train_dataset_bert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=BATCH_SIZE, shuffle=False)

train_loader_distilbert = DataLoader(train_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_distilbert = DataLoader(test_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
distilbert_model.to(device)

# Set optimizers (no learning-rate scheduler is used). eps and weight_decay are
# spelled out to match the defaults of the legacy transformers.AdamW, which
# differ from torch.optim.AdamW's own defaults.
optimizer_bert = AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: BERT is trained for all epochs first, then DistilBERT.
training_runs = (
    ('BERT', bert_model, train_loader_bert, optimizer_bert),
    ('DistilBERT', distilbert_model, train_loader_distilbert, optimizer_distilbert),
)
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        # batch_labels (not `labels`) so the full label list above is not clobbered
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect each model's argmax class per test row.
bert_model.eval()
distilbert_model.eval()
evaluation_runs = (
    ('BERT', bert_model, test_loader_bert),
    ('DistilBERT', distilbert_model, test_loader_distilbert),
)
predictions_by_model = {}
with torch.no_grad():
    for model_name, model, test_loader in evaluation_runs:
        model_predictions = []
        for input_ids, attention_mask, _ in test_loader:
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
            model_predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
        predictions_by_model[model_name] = model_predictions

predictions_bert = predictions_by_model['BERT']
predictions_distilbert = predictions_by_model['DistilBERT']
# Ground truth comes straight from the split instead of from one particular loader.
true_labels = test_labels.tolist()

# zip() would silently truncate on a length mismatch, so check up front.
assert len(predictions_bert) == len(predictions_distilbert) == len(true_labels)

# Combine predictions from both models.
# This is an OR rule, not a majority vote: a comment is labelled hate (1) as
# soon as either model predicts hate, which favours the hate class.
final_predictions = [
    1 if (pred_bert + pred_distilbert) >= 1 else 0
    for pred_bert, pred_distilbert in zip(predictions_bert, predictions_distilbert)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch 1/10 (BERT)

Epoch 2/10 (BERT)

Epoch 3/10 (BERT)

Epoch 4/10 (BERT)

Epoch 5/10 (BERT)

Epoch 6/10 (BERT)

Epoch 7/10 (BERT)

Epoch 8/10 (BERT)

Epoch 9/10 (BERT)

Epoch 10/10 (BERT)

Epoch 1/10 (DistilBERT)

Epoch 2/10 (DistilBERT)

Epoch 3/10 (DistilBERT)

Epoch 4/10 (DistilBERT)

Epoch 5/10 (DistilBERT)

Epoch 6/10 (DistilBERT)

Epoch 7/10 (DistilBERT)

Epoch 8/10 (DistilBERT)

Epoch 9/10 (DistilBERT)

Epoch 10/10 (DistilBERT)

Accuracy: 0.765

Classification Report:

              precision    recall  f1-score   support



           0       0.82      0.73      0.77       430

           1       0.72      0.81      0.76       370



    accuracy                           0.77       800

   macro avg       0.77      0.77      0.76       800

weighted avg       0.77      0.77      0.77       800




#### transliteration

In [10]:
# Fine-tune BERT and DistilBERT on the transliterated comments and evaluate an
# OR-rule ensemble of the two classifiers on a held-out 20% split.
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# transformers' own AdamW was deprecated and is no longer importable from
# recent releases; the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)

# Seed torch so classifier-head initialisation and batch shuffling repeat across runs.
SEED = 42
BATCH_SIZE = 8
torch.manual_seed(SEED)

# Load the CSV. The bare file name only resolves when the file sits in the
# working directory; on Kaggle it is mounted under /kaggle/input (see the
# listing printed by the first cell). If neither exists, read_csv reports the
# first candidate.
data_candidates = (
    'transliteration_4000.csv',
    '/kaggle/input/transliteration/transliteration_4000.csv',
)
data_path = next((p for p in data_candidates if os.path.exists(p)), data_candidates[0])
df = pd.read_csv(data_path)

# Map string labels to integers, failing loudly on anything unexpected:
# Series.map would otherwise turn unknown values into NaN without warning.
label_map = {'hate': 1, 'non-hate': 0}
mapped_labels = df['Label'].map(label_map)
if mapped_labels.isna().any():
    unexpected = sorted(df.loc[mapped_labels.isna(), 'Label'].astype(str).unique())
    raise ValueError(f"Unmapped label values in {data_path}: {unexpected}")
df['Label'] = mapped_labels.astype(int)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and DistilBERT models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize inputs (each split is padded to its own longest sequence)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

train_encodings_distilbert = distilbert_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_distilbert = distilbert_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets of (input_ids, attention_mask, label) triples
train_dataset_bert = TensorDataset(
    train_encodings_bert['input_ids'], train_encodings_bert['attention_mask'], train_labels
)
test_dataset_bert = TensorDataset(
    test_encodings_bert['input_ids'], test_encodings_bert['attention_mask'], test_labels
)

train_dataset_distilbert = TensorDataset(
    train_encodings_distilbert['input_ids'], train_encodings_distilbert['attention_mask'], train_labels
)
test_dataset_distilbert = TensorDataset(
    test_encodings_distilbert['input_ids'], test_encodings_distilbert['attention_mask'], test_labels
)

# Create data loaders. The test loaders must stay unshuffled so predictions
# line up row-for-row with test_labels.
train_loader_bert = DataLoader(train_dataset_bert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=BATCH_SIZE, shuffle=False)

train_loader_distilbert = DataLoader(train_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_distilbert = DataLoader(test_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
distilbert_model.to(device)

# Set optimizers (no learning-rate scheduler is used). eps and weight_decay are
# spelled out to match the defaults of the legacy transformers.AdamW, which
# differ from torch.optim.AdamW's own defaults.
optimizer_bert = AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: BERT is trained for all epochs first, then DistilBERT.
training_runs = (
    ('BERT', bert_model, train_loader_bert, optimizer_bert),
    ('DistilBERT', distilbert_model, train_loader_distilbert, optimizer_distilbert),
)
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        # batch_labels (not `labels`) so the full label list above is not clobbered
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect each model's argmax class per test row.
bert_model.eval()
distilbert_model.eval()
evaluation_runs = (
    ('BERT', bert_model, test_loader_bert),
    ('DistilBERT', distilbert_model, test_loader_distilbert),
)
predictions_by_model = {}
with torch.no_grad():
    for model_name, model, test_loader in evaluation_runs:
        model_predictions = []
        for input_ids, attention_mask, _ in test_loader:
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
            model_predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
        predictions_by_model[model_name] = model_predictions

predictions_bert = predictions_by_model['BERT']
predictions_distilbert = predictions_by_model['DistilBERT']
# Ground truth comes straight from the split instead of from one particular loader.
true_labels = test_labels.tolist()

# zip() would silently truncate on a length mismatch, so check up front.
assert len(predictions_bert) == len(predictions_distilbert) == len(true_labels)

# Combine predictions from both models.
# This is an OR rule, not a majority vote: a comment is labelled hate (1) as
# soon as either model predicts hate, which favours the hate class.
final_predictions = [
    1 if (pred_bert + pred_distilbert) >= 1 else 0
    for pred_bert, pred_distilbert in zip(predictions_bert, predictions_distilbert)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch 1/10 (BERT)

Epoch 2/10 (BERT)

Epoch 3/10 (BERT)

Epoch 4/10 (BERT)

Epoch 5/10 (BERT)

Epoch 6/10 (BERT)

Epoch 7/10 (BERT)

Epoch 8/10 (BERT)

Epoch 9/10 (BERT)

Epoch 10/10 (BERT)

Epoch 1/10 (DistilBERT)

Epoch 2/10 (DistilBERT)

Epoch 3/10 (DistilBERT)

Epoch 4/10 (DistilBERT)

Epoch 5/10 (DistilBERT)

Epoch 6/10 (DistilBERT)

Epoch 7/10 (DistilBERT)

Epoch 8/10 (DistilBERT)

Epoch 9/10 (DistilBERT)

Epoch 10/10 (DistilBERT)

Accuracy: 0.74125

Classification Report:

              precision    recall  f1-score   support



           0       0.81      0.68      0.74       430

           1       0.69      0.81      0.74       370



    accuracy                           0.74       800

   macro avg       0.75      0.75      0.74       800

weighted avg       0.75      0.74      0.74       800




In [6]:
# Report 'balanced' class weights for the training labels.
# NOTE: this cell relies on `train_labels` (a torch tensor of 0/1 labels) left
# behind by whichever training cell ran last — run one of them first. The
# weights are only printed here; nothing in this cell feeds them into training.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights. `classes` is documented as an ndarray, and newer
# scikit-learn releases reject a plain list.
classes = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=classes, y=train_labels.cpu().numpy())

# Convert class weights to a dictionary keyed by the actual class label (not by
# position), using plain Python scalars so the printout is stable across NumPy versions.
class_weights_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

print("Class Weights:", class_weights_dict)


Class Weights: {0: 0.9809932556713673, 1: 1.0197578075207139}


### bert+labse

#### original

In [2]:
# Fine-tune BERT and LaBSE on the original comments and evaluate an OR-rule
# ensemble of the two classifiers on a held-out 20% split.
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# transformers' own AdamW was deprecated and is no longer importable from
# recent releases; the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
)

# Seed torch so classifier-head initialisation and batch shuffling repeat across runs.
SEED = 42
BATCH_SIZE = 8
torch.manual_seed(SEED)

# Load the CSV. The first path is the one this cell was written against; the
# second is where the input listing printed by the first cell shows a file of
# the same name (presumably the same data — confirm). If neither exists,
# read_csv reports the first candidate.
data_candidates = (
    '/kaggle/input/telugudat/training_data_telugu-hate.csv',
    '/kaggle/input/hos-original-bert/training_data_telugu-hate.csv',
)
data_path = next((p for p in data_candidates if os.path.exists(p)), data_candidates[0])
df = pd.read_csv(data_path)

# Map string labels to integers, failing loudly on anything unexpected:
# Series.map would otherwise turn unknown values into NaN without warning.
label_map = {'hate': 1, 'non-hate': 0}
mapped_labels = df['Label'].map(label_map)
if mapped_labels.isna().any():
    unexpected = sorted(df.loc[mapped_labels.isna(), 'Label'].astype(str).unique())
    raise ValueError(f"Unmapped label values in {data_path}: {unexpected}")
df['Label'] = mapped_labels.astype(int)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and LaBSE models and tokenizers.
# NOTE(review): 'bert-base-uncased' is an English checkpoint; if 'Comments'
# holds native-script Telugu its vocabulary will presumably cover it poorly —
# confirm the script of the data and consider a multilingual checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
labse_model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/LaBSE", num_labels=2)

# Tokenize inputs (each split is padded to its own longest sequence)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

train_encodings_labse = labse_tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings_labse = labse_tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets of (input_ids, attention_mask, label) triples
train_dataset_bert = TensorDataset(
    train_encodings_bert['input_ids'], train_encodings_bert['attention_mask'], train_labels
)
test_dataset_bert = TensorDataset(
    test_encodings_bert['input_ids'], test_encodings_bert['attention_mask'], test_labels
)

train_dataset_labse = TensorDataset(
    train_encodings_labse['input_ids'], train_encodings_labse['attention_mask'], train_labels
)
test_dataset_labse = TensorDataset(
    test_encodings_labse['input_ids'], test_encodings_labse['attention_mask'], test_labels
)

# Create data loaders. The test loaders must stay unshuffled so predictions
# line up row-for-row with test_labels.
train_loader_bert = DataLoader(train_dataset_bert, batch_size=BATCH_SIZE, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=BATCH_SIZE, shuffle=False)

train_loader_labse = DataLoader(train_dataset_labse, batch_size=BATCH_SIZE, shuffle=True)
test_loader_labse = DataLoader(test_dataset_labse, batch_size=BATCH_SIZE, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
labse_model.to(device)

# Set optimizers (no learning-rate scheduler is used). eps and weight_decay are
# spelled out to match the defaults of the legacy transformers.AdamW, which
# differ from torch.optim.AdamW's own defaults.
optimizer_bert = AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_labse = AdamW(labse_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: BERT is trained for all epochs first, then LaBSE.
training_runs = (
    ('BERT', bert_model, train_loader_bert, optimizer_bert),
    ('LaBSE', labse_model, train_loader_labse, optimizer_labse),
)
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        # batch_labels (not `labels`) so the full label list above is not clobbered
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect each model's argmax class per test row.
bert_model.eval()
labse_model.eval()
evaluation_runs = (
    ('BERT', bert_model, test_loader_bert),
    ('LaBSE', labse_model, test_loader_labse),
)
predictions_by_model = {}
with torch.no_grad():
    for model_name, model, test_loader in evaluation_runs:
        model_predictions = []
        for input_ids, attention_mask, _ in test_loader:
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
            model_predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
        predictions_by_model[model_name] = model_predictions

predictions_bert = predictions_by_model['BERT']
predictions_labse = predictions_by_model['LaBSE']
# Ground truth comes straight from the split instead of from one particular loader.
true_labels = test_labels.tolist()

# zip() would silently truncate on a length mismatch, so check up front.
assert len(predictions_bert) == len(predictions_labse) == len(true_labels)

# Combine predictions from both models.
# This is an OR rule, not a majority vote: a comment is labelled hate (1) as
# soon as either model predicts hate, which favours the hate class.
final_predictions = [
    1 if (pred_bert + pred_labse) >= 1 else 0
    for pred_bert, pred_labse in zip(predictions_bert, predictions_labse)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (LaBSE)
Epoch 2/10 (LaBSE)
Epoch 3/10 (LaBSE)
Epoch 4/10 (LaBSE)
Epoch 5/10 (LaBSE)
Epoch 6/10 (LaBSE)
Epoch 7/10 (LaBSE)
Epoch 8/10 (LaBSE)
Epoch 9/10 (LaBSE)
Epoch 10/10 (LaBSE)
Accuracy: 0.74875
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75       430
           1       0.70      0.80      0.75       370

    accuracy                           0.75       800
   macro avg       0.75      0.75      0.75       800
weighted avg       0.76      0.75      0.75       800



#### translated

In [4]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Fine-tune BERT and LaBSE separately on the translated comments, then merge
# their test-set predictions into a single hate / non-hate decision.

# One seed for the train/test split and for torch's RNG (batch shuffling and
# the newly initialised classifier heads), so repeated runs are comparable.
SEED = 42
torch.manual_seed(SEED)

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugudat/Hos_train_translated.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# .map() leaves NaN for any label missing from label_map; fail here with a
# clear message instead of passing a float label tensor to the models.
unmapped_rows = int(df['Label'].isna().sum())
if unmapped_rows:
    raise ValueError(f"{unmapped_rows} row(s) have a 'Label' outside {sorted(label_map)}")
df['Label'] = df['Label'].astype(int)

# Extract input texts and labels (the text column key used here ends with a space)
texts = df['comments '].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and LaBSE models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
labse_model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/LaBSE", num_labels=2)

# Tokenize inputs (each model uses its own tokenizer on the same texts)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_labse = labse_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_labse = labse_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets: (input_ids, attention_mask, label) per example
train_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_bert['input_ids']),
    torch.tensor(train_encodings_bert['attention_mask']),
    train_labels,
)
test_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_bert['input_ids']),
    torch.tensor(test_encodings_bert['attention_mask']),
    test_labels,
)

train_dataset_labse = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_labse['input_ids']),
    torch.tensor(train_encodings_labse['attention_mask']),
    train_labels,
)
test_dataset_labse = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_labse['input_ids']),
    torch.tensor(test_encodings_labse['attention_mask']),
    test_labels,
)

# Create data loaders. The test loaders are NOT shuffled, so both models'
# predictions stay aligned with test_labels row by row.
train_loader_bert = torch.utils.data.DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = torch.utils.data.DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_labse = torch.utils.data.DataLoader(train_dataset_labse, batch_size=8, shuffle=True)
test_loader_labse = torch.utils.data.DataLoader(test_dataset_labse, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
labse_model.to(device)

# Set optimizers (constant learning rate, no scheduler).
# torch.optim.AdamW is used because the AdamW class shipped with transformers
# is deprecated upstream; eps and weight_decay are pinned to that class's
# defaults (1e-6 and 0.0) since torch's own defaults differ.
optimizer_bert = torch.optim.AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_labse = torch.optim.AdamW(labse_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: the same loop fine-tunes each model in turn (BERT first, then LaBSE)
training_runs = [
    ("BERT", bert_model, train_loader_bert, optimizer_bert),
    ("LaBSE", labse_model, train_loader_labse, optimizer_labse),
]
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            # Passing labels makes the model return the classification loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect the argmax class of every test example for each model
bert_model.eval()
labse_model.eval()
predictions_bert = []
predictions_labse = []
evaluation_runs = [
    (bert_model, test_loader_bert, predictions_bert),
    (labse_model, test_loader_labse, predictions_labse),
]
for model, test_loader, predictions in evaluation_runs:
    for input_ids, attention_mask, _ in test_loader:
        with torch.no_grad():
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

# Ground truth comes straight from the (unshuffled) test split rather than
# being gathered as a side effect of one model's evaluation loop.
true_labels = test_labels.tolist()

# Combine predictions from both models.
# With only two voters this rule is a logical OR: an example is labelled
# hate (1) as soon as either model predicts 1, and non-hate (0) only when
# both agree on 0. You can choose a different method for combining predictions.
final_predictions = [
    1 if (pred_bert + pred_labse) >= 1 else 0
    for pred_bert, pred_labse in zip(predictions_bert, predictions_labse)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (LaBSE)
Epoch 2/10 (LaBSE)
Epoch 3/10 (LaBSE)
Epoch 4/10 (LaBSE)
Epoch 5/10 (LaBSE)
Epoch 6/10 (LaBSE)
Epoch 7/10 (LaBSE)
Epoch 8/10 (LaBSE)
Epoch 9/10 (LaBSE)
Epoch 10/10 (LaBSE)
Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75       430
           1       0.70      0.80      0.75       370

    accuracy                           0.75       800
   macro avg       0.75      0.75      0.75       800
weighted avg       0.76      0.75      0.75       800



### BERT + LaBSE: transliterated data

In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Fine-tune BERT and LaBSE separately on the transliterated comments, then
# merge their test-set predictions into a single hate / non-hate decision.

# One seed for the train/test split and for torch's RNG (batch shuffling and
# the newly initialised classifier heads), so repeated runs are comparable.
SEED = 42
torch.manual_seed(SEED)

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugudat/transliteration_4000.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# .map() leaves NaN for any label missing from label_map; fail here with a
# clear message instead of passing a float label tensor to the models.
unmapped_rows = int(df['Label'].isna().sum())
if unmapped_rows:
    raise ValueError(f"{unmapped_rows} row(s) have a 'Label' outside {sorted(label_map)}")
df['Label'] = df['Label'].astype(int)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and LaBSE models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

labse_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
labse_model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/LaBSE", num_labels=2)

# Tokenize inputs (each model uses its own tokenizer on the same texts)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_labse = labse_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_labse = labse_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets: (input_ids, attention_mask, label) per example
train_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_bert['input_ids']),
    torch.tensor(train_encodings_bert['attention_mask']),
    train_labels,
)
test_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_bert['input_ids']),
    torch.tensor(test_encodings_bert['attention_mask']),
    test_labels,
)

train_dataset_labse = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_labse['input_ids']),
    torch.tensor(train_encodings_labse['attention_mask']),
    train_labels,
)
test_dataset_labse = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_labse['input_ids']),
    torch.tensor(test_encodings_labse['attention_mask']),
    test_labels,
)

# Create data loaders. The test loaders are NOT shuffled, so both models'
# predictions stay aligned with test_labels row by row.
train_loader_bert = torch.utils.data.DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = torch.utils.data.DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_labse = torch.utils.data.DataLoader(train_dataset_labse, batch_size=8, shuffle=True)
test_loader_labse = torch.utils.data.DataLoader(test_dataset_labse, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
labse_model.to(device)

# Set optimizers (constant learning rate, no scheduler).
# torch.optim.AdamW is used because the AdamW class shipped with transformers
# is deprecated upstream; eps and weight_decay are pinned to that class's
# defaults (1e-6 and 0.0) since torch's own defaults differ.
optimizer_bert = torch.optim.AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_labse = torch.optim.AdamW(labse_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: the same loop fine-tunes each model in turn (BERT first, then LaBSE)
training_runs = [
    ("BERT", bert_model, train_loader_bert, optimizer_bert),
    ("LaBSE", labse_model, train_loader_labse, optimizer_labse),
]
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            # Passing labels makes the model return the classification loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect the argmax class of every test example for each model
bert_model.eval()
labse_model.eval()
predictions_bert = []
predictions_labse = []
evaluation_runs = [
    (bert_model, test_loader_bert, predictions_bert),
    (labse_model, test_loader_labse, predictions_labse),
]
for model, test_loader, predictions in evaluation_runs:
    for input_ids, attention_mask, _ in test_loader:
        with torch.no_grad():
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

# Ground truth comes straight from the (unshuffled) test split rather than
# being gathered as a side effect of one model's evaluation loop.
true_labels = test_labels.tolist()

# Combine predictions from both models.
# With only two voters this rule is a logical OR: an example is labelled
# hate (1) as soon as either model predicts 1, and non-hate (0) only when
# both agree on 0. You can choose a different method for combining predictions.
final_predictions = [
    1 if (pred_bert + pred_labse) >= 1 else 0
    for pred_bert, pred_labse in zip(predictions_bert, predictions_labse)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (LaBSE)
Epoch 2/10 (LaBSE)
Epoch 3/10 (LaBSE)
Epoch 4/10 (LaBSE)
Epoch 5/10 (LaBSE)
Epoch 6/10 (LaBSE)
Epoch 7/10 (LaBSE)
Epoch 8/10 (LaBSE)
Epoch 9/10 (LaBSE)
Epoch 10/10 (LaBSE)
Accuracy: 0.75125
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.66      0.74       430
           1       0.68      0.86      0.76       370

    accuracy                           0.75       800
   macro avg       0.76      0.76      0.75       800
weighted avg       0.77      0.75      0.75       800



### BERT + MuRIL

#### Original data

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Fine-tune BERT and MuRIL separately on the original comments, then merge
# their test-set predictions into a single hate / non-hate decision.

# One seed for the train/test split and for torch's RNG (batch shuffling and
# the newly initialised classifier heads), so repeated runs are comparable.
SEED = 42
torch.manual_seed(SEED)

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugudat/training_data_telugu-hate.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# .map() leaves NaN for any label missing from label_map; fail here with a
# clear message instead of passing a float label tensor to the models.
unmapped_rows = int(df['Label'].isna().sum())
if unmapped_rows:
    raise ValueError(f"{unmapped_rows} row(s) have a 'Label' outside {sorted(label_map)}")
df['Label'] = df['Label'].astype(int)

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and MuRIL models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

# Tokenize inputs (each model uses its own tokenizer on the same texts)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_muril = muril_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_muril = muril_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets: (input_ids, attention_mask, label) per example
train_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_bert['input_ids']),
    torch.tensor(train_encodings_bert['attention_mask']),
    train_labels,
)
test_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_bert['input_ids']),
    torch.tensor(test_encodings_bert['attention_mask']),
    test_labels,
)

train_dataset_muril = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_muril['input_ids']),
    torch.tensor(train_encodings_muril['attention_mask']),
    train_labels,
)
test_dataset_muril = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_muril['input_ids']),
    torch.tensor(test_encodings_muril['attention_mask']),
    test_labels,
)

# Create data loaders. The test loaders are NOT shuffled, so both models'
# predictions stay aligned with test_labels row by row.
train_loader_bert = torch.utils.data.DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = torch.utils.data.DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_muril = torch.utils.data.DataLoader(train_dataset_muril, batch_size=8, shuffle=True)
test_loader_muril = torch.utils.data.DataLoader(test_dataset_muril, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
muril_model.to(device)

# Set optimizers (constant learning rate, no scheduler).
# torch.optim.AdamW is used because the AdamW class shipped with transformers
# is deprecated upstream; eps and weight_decay are pinned to that class's
# defaults (1e-6 and 0.0) since torch's own defaults differ.
optimizer_bert = torch.optim.AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_muril = torch.optim.AdamW(muril_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: the same loop fine-tunes each model in turn (BERT first, then MuRIL)
training_runs = [
    ("BERT", bert_model, train_loader_bert, optimizer_bert),
    ("MuRIL", muril_model, train_loader_muril, optimizer_muril),
]
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            # Passing labels makes the model return the classification loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect the argmax class of every test example for each model
bert_model.eval()
muril_model.eval()
predictions_bert = []
predictions_muril = []
evaluation_runs = [
    (bert_model, test_loader_bert, predictions_bert),
    (muril_model, test_loader_muril, predictions_muril),
]
for model, test_loader, predictions in evaluation_runs:
    for input_ids, attention_mask, _ in test_loader:
        with torch.no_grad():
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

# Ground truth comes straight from the (unshuffled) test split rather than
# being gathered as a side effect of one model's evaluation loop.
true_labels = test_labels.tolist()

# Combine predictions from both models.
# With only two voters this rule is a logical OR: an example is labelled
# hate (1) as soon as either model predicts 1, and non-hate (0) only when
# both agree on 0. You can choose a different method for combining predictions.
final_predictions = [
    1 if (pred_bert + pred_muril) >= 1 else 0
    for pred_bert, pred_muril in zip(predictions_bert, predictions_muril)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (MuRIL)
Epoch 2/10 (MuRIL)
Epoch 3/10 (MuRIL)
Epoch 4/10 (MuRIL)
Epoch 5/10 (MuRIL)
Epoch 6/10 (MuRIL)
Epoch 7/10 (MuRIL)
Epoch 8/10 (MuRIL)
Epoch 9/10 (MuRIL)
Epoch 10/10 (MuRIL)
Accuracy: 0.755
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.77       430
           1       0.73      0.76      0.74       370

    accuracy                           0.76       800
   macro avg       0.75      0.76      0.75       800
weighted avg       0.76      0.76      0.76       800



#### Translated data

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Fine-tune BERT and MuRIL separately on the translated comments, then merge
# their test-set predictions into a single hate / non-hate decision.

# One seed for the train/test split and for torch's RNG (batch shuffling and
# the newly initialised classifier heads), so repeated runs are comparable.
SEED = 42
torch.manual_seed(SEED)

# Load your CSV file
df = pd.read_csv('/kaggle/input/telugudat/Hos_train_translated.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# .map() leaves NaN for any label missing from label_map; fail here with a
# clear message instead of passing a float label tensor to the models.
unmapped_rows = int(df['Label'].isna().sum())
if unmapped_rows:
    raise ValueError(f"{unmapped_rows} row(s) have a 'Label' outside {sorted(label_map)}")
df['Label'] = df['Label'].astype(int)

# Extract input texts and labels (the text column key used here ends with a space)
texts = df['comments '].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load pre-trained BERT and MuRIL models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

# Tokenize inputs (each model uses its own tokenizer on the same texts)
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_muril = muril_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_muril = muril_tokenizer(test_texts, truncation=True, padding=True)

# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create PyTorch datasets: (input_ids, attention_mask, label) per example
train_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_bert['input_ids']),
    torch.tensor(train_encodings_bert['attention_mask']),
    train_labels,
)
test_dataset_bert = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_bert['input_ids']),
    torch.tensor(test_encodings_bert['attention_mask']),
    test_labels,
)

train_dataset_muril = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings_muril['input_ids']),
    torch.tensor(train_encodings_muril['attention_mask']),
    train_labels,
)
test_dataset_muril = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings_muril['input_ids']),
    torch.tensor(test_encodings_muril['attention_mask']),
    test_labels,
)

# Create data loaders. The test loaders are NOT shuffled, so both models'
# predictions stay aligned with test_labels row by row.
train_loader_bert = torch.utils.data.DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = torch.utils.data.DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_muril = torch.utils.data.DataLoader(train_dataset_muril, batch_size=8, shuffle=True)
test_loader_muril = torch.utils.data.DataLoader(test_dataset_muril, batch_size=8, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
muril_model.to(device)

# Set optimizers (constant learning rate, no scheduler).
# torch.optim.AdamW is used because the AdamW class shipped with transformers
# is deprecated upstream; eps and weight_decay are pinned to that class's
# defaults (1e-6 and 0.0) since torch's own defaults differ.
optimizer_bert = torch.optim.AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_muril = torch.optim.AdamW(muril_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

# Training: the same loop fine-tunes each model in turn (BERT first, then MuRIL)
training_runs = [
    ("BERT", bert_model, train_loader_bert, optimizer_bert),
    ("MuRIL", muril_model, train_loader_muril, optimizer_muril),
]
for model_name, model, train_loader, optimizer in training_runs:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs} ({model_name})")
        model.train()
        for input_ids, attention_mask, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            # Passing labels makes the model return the classification loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

# Evaluation: collect the argmax class of every test example for each model
bert_model.eval()
muril_model.eval()
predictions_bert = []
predictions_muril = []
evaluation_runs = [
    (bert_model, test_loader_bert, predictions_bert),
    (muril_model, test_loader_muril, predictions_muril),
]
for model, test_loader, predictions in evaluation_runs:
    for input_ids, attention_mask, _ in test_loader:
        with torch.no_grad():
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device)).logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

# Ground truth comes straight from the (unshuffled) test split rather than
# being gathered as a side effect of one model's evaluation loop.
true_labels = test_labels.tolist()

# Combine predictions from both models.
# With only two voters this rule is a logical OR: an example is labelled
# hate (1) as soon as either model predicts 1, and non-hate (0) only when
# both agree on 0. You can choose a different method for combining predictions.
final_predictions = [
    1 if (pred_bert + pred_muril) >= 1 else 0
    for pred_bert, pred_muril in zip(predictions_bert, predictions_muril)
]

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, final_predictions)
report = classification_report(true_labels, final_predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (MuRIL)
Epoch 2/10 (MuRIL)
Epoch 3/10 (MuRIL)
Epoch 4/10 (MuRIL)
Epoch 5/10 (MuRIL)
Epoch 6/10 (MuRIL)
Epoch 7/10 (MuRIL)
Epoch 8/10 (MuRIL)
Epoch 9/10 (MuRIL)
Epoch 10/10 (MuRIL)
Accuracy: 0.7525
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.66      0.74       430
           1       0.69      0.86      0.76       370

    accuracy                           0.75       800
   macro avg       0.76      0.76      0.75       800
weighted avg       0.77      0.75      0.75       800



#### Transliterated data

In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report

# Load the transliterated comments dataset
df = pd.read_csv('/kaggle/input/telugudat/transliteration_4000.csv')

# Mapping labels to integers
label_map = {'hate': 1, 'non-hate': 0}
df['Label'] = df['Label'].map(label_map)

# Series.map() silently turns every label that is not a key of label_map into NaN.
# Stop here with a clear message instead of letting NaN labels reach the model.
unmapped_rows = df['Label'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a label outside {sorted(label_map)}; "
        "clean the 'Label' column or extend label_map"
    )

# Extract input texts and labels
texts = df['Comments'].tolist()
labels = df['Label'].tolist()

# Split data into train and test sets (fixed random_state keeps the split repeatable)
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Seed torch before the models are built: the classification heads created below are
# randomly initialised and the training loaders shuffle, so without a seed every run
# yields different numbers. This improves repeatability; it does not by itself
# guarantee bit-identical results on GPU.
torch.manual_seed(42)

# Load pre-trained BERT and MuRIL models and tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

muril_tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

# Tokenize inputs: each split is encoded once per tokenizer, truncated and padded
train_encodings_bert = bert_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_bert = bert_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_muril = muril_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_muril = muril_tokenizer(test_texts, truncation=True, padding=True)

# Labels become tensors so they can be bundled with the encoded inputs
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)


def _as_tensor_dataset(encodings, label_tensor):
    """Bundle input ids, attention masks and labels into a single TensorDataset."""
    return torch.utils.data.TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        label_tensor,
    )


def _batched(dataset, shuffle):
    """Wrap a dataset in a DataLoader that yields batches of 8 examples."""
    return torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=shuffle)


# One dataset per (tokenizer, split) pair
train_dataset_bert = _as_tensor_dataset(train_encodings_bert, train_labels)
test_dataset_bert = _as_tensor_dataset(test_encodings_bert, test_labels)

train_dataset_muril = _as_tensor_dataset(train_encodings_muril, train_labels)
test_dataset_muril = _as_tensor_dataset(test_encodings_muril, test_labels)

# Training batches are shuffled; test batches keep dataset order
train_loader_bert = _batched(train_dataset_bert, shuffle=True)
test_loader_bert = _batched(test_dataset_bert, shuffle=False)

train_loader_muril = _batched(train_dataset_muril, shuffle=True)
test_loader_muril = _batched(test_dataset_muril, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)
muril_model.to(device)

# Set optimizers (constant learning rate; no scheduler is used in this cell).
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values the transformers class used by default so the
# optimisation hyper-parameters stay the same (torch's own defaults are eps=1e-8,
# weight_decay=1e-2).
optimizer_bert = torch.optim.AdamW(bert_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
optimizer_muril = torch.optim.AdamW(muril_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 10

def train_classifier(model, loader, optimizer, epochs, device, tag):
    """Fine-tune a sequence-classification model in place.

    Args:
        model: model whose forward pass accepts ``input_ids``, ``attention_mask`` and
            ``labels`` and returns an object exposing ``.loss``.
        loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: optimizer constructed over ``model``'s parameters.
        epochs: number of full passes over ``loader``.
        device: torch device every batch is moved to before the forward pass.
        tag: short model name shown in the per-epoch progress line.
    """
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs} ({tag})")
        model.train()
        # Local batch names keep the module-level `labels` list from being overwritten
        for batch_ids, batch_mask, batch_labels in loader:
            batch_ids = batch_ids.to(device)
            batch_mask = batch_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(batch_ids, attention_mask=batch_mask, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()


# The two models are trained one after the other with the same routine
train_classifier(bert_model, train_loader_bert, optimizer_bert, num_epochs, device, "BERT")
train_classifier(muril_model, train_loader_muril, optimizer_muril, num_epochs, device, "MuRIL")

def predict_labels(model, loader, device):
    """Run ``model`` over ``loader`` and collect class predictions.

    Args:
        model: model whose forward pass accepts ``input_ids`` and ``attention_mask``
            and returns an object exposing ``.logits``.
        loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: torch device the inputs are moved to.

    Returns:
        ``(predicted, reference)``: two lists, in loader order, holding the argmax
        class id per example and the label that came with it.
    """
    model.eval()
    predicted, reference = [], []
    with torch.no_grad():
        for batch_ids, batch_mask, batch_labels in loader:
            logits = model(batch_ids.to(device), attention_mask=batch_mask.to(device)).logits
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Labels are only recorded, never fed to the model, so they skip the device copy
            reference.extend(batch_labels.cpu().numpy())
    return predicted, reference


# Evaluation
predictions_bert, true_labels = predict_labels(bert_model, test_loader_bert, device)
predictions_muril, true_labels_muril = predict_labels(muril_model, test_loader_muril, device)

# The per-example combination that follows pairs predictions by position, so both
# loaders must have produced the same examples in the same order.
if len(predictions_bert) != len(predictions_muril) or list(true_labels) != list(true_labels_muril):
    raise ValueError("BERT and MuRIL test loaders are not aligned; predictions cannot be combined")

# Merge the two prediction lists position by position: an example is assigned class 1
# when the two predictions sum to at least 1, i.e. when either model predicts 1;
# otherwise it is assigned class 0.
final_predictions = [
    1 if (vote_bert + vote_muril) >= 1 else 0
    for vote_bert, vote_muril in zip(predictions_bert, predictions_muril)
]

# Score the merged predictions against the reference labels
report = classification_report(true_labels, final_predictions)
accuracy = accuracy_score(true_labels, final_predictions)

# Headline accuracy first, then the per-class breakdown
print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 (BERT)
Epoch 2/10 (BERT)
Epoch 3/10 (BERT)
Epoch 4/10 (BERT)
Epoch 5/10 (BERT)
Epoch 6/10 (BERT)
Epoch 7/10 (BERT)
Epoch 8/10 (BERT)
Epoch 9/10 (BERT)
Epoch 10/10 (BERT)
Epoch 1/10 (MuRIL)
Epoch 2/10 (MuRIL)
Epoch 3/10 (MuRIL)
Epoch 4/10 (MuRIL)
Epoch 5/10 (MuRIL)
Epoch 6/10 (MuRIL)
Epoch 7/10 (MuRIL)
Epoch 8/10 (MuRIL)
Epoch 9/10 (MuRIL)
Epoch 10/10 (MuRIL)
Accuracy: 0.74375
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.70      0.75       430
           1       0.69      0.80      0.74       370

    accuracy                           0.74       800
   macro avg       0.75      0.75      0.74       800
weighted avg       0.75      0.74      0.74       800

