In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [20]:
import re
import string

In [5]:
from typing import List

In [2]:
# Fix the NumPy and PyTorch global RNG seeds so that repeated runs of the
# notebook start from the same random state.
np.random.seed(42)
torch.manual_seed(42)

In [7]:
def clean_text(text: str) -> str:
    """Strip brackets, URLs, HTML tags, punctuation, newlines and digit-bearing words.

    The substitutions are applied one after another in the order listed below;
    the order matters (for example, punctuation is removed before the
    digit-word pass, so "v1.2" is first collapsed to "v12" and then dropped).
    Surrounding whitespace is left untouched, so removed spans can leave
    double spaces behind.
    """
    substitutions = (
        (r'\[.*?\]', ''),                                  # text inside square brackets
        (r'https?://\S+|www\.\S+', ''),                    # URLs
        (r'<.*?>+', ''),                                   # HTML tags
        (r'[%s]' % re.escape(string.punctuation), ''),     # ASCII punctuation
        (r'\n', ' '),                                      # newlines become spaces
        (r'\w*\d\w*', ''),                                 # words containing a digit
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [6]:
def lower_case_everything(t: str) -> str:
    """Return `t` with all cased characters converted to lower case."""
    lowered = t.lower()
    return lowered

def replace_all_caps(tokens: List[str]) -> List[str]:
    """Mark fully upper-case tokens with an 'xxup ' prefix and lower-case them.

    A token counts as upper-case when `str.isupper()` is true for it; every
    other token is passed through unchanged. A new list is returned.
    """
    marked: List[str] = []
    for token in tokens:
        if token.isupper():
            marked.append(f'xxup {token.lower()}')
        else:
            marked.append(token)
    return marked

def deal_caps(tokens: List[str]) -> List[str]:
    """Prefix title-cased tokens with 'xxmaj '.

    A token is title-cased when `str.istitle()` is true for it. The token
    itself keeps its original casing; other tokens are passed through as-is.
    A new list is returned.
    """
    marked: List[str] = []
    for token in tokens:
        if token.istitle():
            marked.append(f'xxmaj {token}')
        else:
            marked.append(token)
    return marked

def handle_all_caps(t: str) -> str:
    """Apply `replace_all_caps` to the whitespace-separated tokens of `t`.

    The string is split with `str.split()`, so runs of whitespace collapse
    into single spaces when the tokens are joined back together.
    """
    return ' '.join(replace_all_caps(t.split()))

def handle_upper_case_first_letter(t: str) -> str:
    """Apply `deal_caps` to the whitespace-separated tokens of `t`.

    The string is split with `str.split()`, so runs of whitespace collapse
    into single spaces when the tokens are joined back together.
    """
    return ' '.join(deal_caps(t.split()))

In [8]:
# Text rules, applied in list order by preprocess_text.
# NOTE(review): lower-casing runs first, so the two capitalisation rules that
# follow receive text without upper-case letters and therefore add no
# 'xxup'/'xxmaj' markers. Confirm this ordering is intended before changing
# it: the saved checkpoint was trained with the pipeline exactly as listed.
custom_pre_rules = [
    lower_case_everything,
    handle_all_caps,
    handle_upper_case_first_letter,
]

In [9]:
def preprocess_text(text: str) -> str:
    """Clean `text` and then run every rule in `custom_pre_rules` over it.

    The input is coerced with `str()` first, so non-string values are
    processed via their string representation.
    """
    processed = clean_text(str(text))
    for apply_rule in custom_pre_rules:
        processed = apply_rule(processed)
    return processed

In [11]:
def load_data(file_path):
    """Load the labelled CSV and return a frame ready for training.

    Only rows whose `category` is exactly "NOT" or "OFF" are kept. This
    covers the known bad values ('OFf', 'label', 'not') and also drops any
    other unexpected or missing label, which would otherwise turn into NaN
    after mapping and break label tensor creation later on.

    Args:
        file_path: Anything accepted by `pd.read_csv`; the CSV must contain
            `category` and `text` columns.

    Returns:
        A new DataFrame in which `category` is an int64 column
        (0 = "NOT", 1 = "OFF") and `text` has been run through
        `preprocess_text`.
    """
    label_mapping = {
        "NOT": 0,
        "OFF": 1,
    }
    df = pd.read_csv(file_path)

    # Whitelist on the mapping keys; .copy() so the assignments below write to
    # an independent frame instead of a view of the filtered slice.
    df = df[df["category"].isin(list(label_mapping))].copy()

    df["category"] = df["category"].map(label_mapping).astype("int64")
    df["text"] = df["text"].apply(preprocess_text)
    return df

In [12]:
class TanglishDataset(Dataset):
    """Map-style dataset that tokenises one text per item for classification.

    Args:
        texts: Sequence of input texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class labels, aligned with `texts`.
        tokenizer: Callable accepting `(text, truncation, padding, max_length,
            return_tensors)` and returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If `texts` and `labels` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise as lists so that `idx` in __getitem__ is always
        # positional. Indexing a pandas Series with a non-default index by
        # integer would otherwise be label-based and fail or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenised example at position `idx` as a dict of tensors."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # The tokenizer returns a leading batch dimension of 1; flatten it so
        # the DataLoader can stack examples into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [13]:
def train_model(model, train_loader, val_loader, device, epochs=5,
                save_path='best_tanglish_model.pt'):
    """Fine-tune `model` and checkpoint the weights with the best validation accuracy.

    Each epoch runs one optimisation pass over `train_loader` followed by an
    evaluation pass over `val_loader`. Whenever the validation accuracy
    improves, the model's state dict is written to `save_path` and a
    classification report for that epoch is printed.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...,
            labels=...)` whose output exposes `.loss` and `.logits`.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'label' tensors.
        val_loader: Loader with the same batch format; its `.dataset` length
            is used as the accuracy denominator.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over `train_loader`.
        save_path: File the best state dict is saved to.

    Returns:
        The model as it stands after the final epoch. This is not necessarily
        the best epoch; reload `save_path` to get the best weights.
    """
    # torch.optim.AdamW replaces transformers.AdamW, which is deprecated in
    # favour of the PyTorch implementation. eps and weight_decay are pinned to
    # the values the transformers class used by default so the update rule is
    # unchanged.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
    )

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even if it scores 0.0; callers load `save_path` afterwards.
    best_val_accuracy = -1.0
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            # Passing labels makes the model compute the loss itself
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Mean of the per-batch losses
        avg_train_loss = train_loss / len(train_loader)

        # Validation phase
        model.eval()
        val_accuracy = 0
        val_preds = []
        val_true = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)

                val_accuracy += (preds == labels).sum().item()
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())

        # Correct predictions divided by the number of validation examples
        val_accuracy /= len(val_loader.dataset)
        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Accuracy: {val_accuracy:.4f}")

        # Save the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)

            # The report is printed only for epochs that set a new best
            print("\nClassification Report:")
            print(classification_report(val_true, val_preds))

    return model

In [14]:
def evaluate_model(model, test_loader, device):
    """Score `model` on `test_loader`, print the metrics and return the predictions.

    Prints the overall accuracy (correct predictions divided by
    `len(test_loader.dataset)`) and a classification report.

    Returns:
        List with one predicted class index per test example, in loader order.
    """
    model.eval()
    n_correct = 0
    predicted, actual = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = logits.argmax(dim=1)

            n_correct += (preds == labels).sum().item()
            predicted.extend(preds.cpu().numpy())
            actual.extend(labels.cpu().numpy())

    test_accuracy = n_correct / len(test_loader.dataset)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print("\nTest Classification Report:")
    print(classification_report(actual, predicted))

    return predicted

In [6]:
def predict_hate_speech(text, model, tokenizer, device, max_len=128, preprocess=True):
    """Classify a single text as hate speech or not.

    Args:
        text: Raw input text.
        model: Sequence classifier whose output exposes `.logits` with one
            row per input and one column per class.
        tokenizer: Tokenizer matching `model`.
        device: Device the encoded tensors are moved to.
        max_len: Length the input is padded or truncated to.
        preprocess: When true (the default), `text` is passed through
            `preprocess_text` before tokenisation. The training data goes
            through the same function, so skipping it here feeds the model
            text in a form it was not trained on. Pass False only if `text`
            is already preprocessed.

    Returns:
        Dict with the original 'text', the predicted class index
        'prediction', its softmax probability 'confidence', and a
        human-readable 'result' string.
    """
    model.eval()

    # Mirror the training-time preprocessing to avoid train/inference skew
    model_input = preprocess_text(text) if preprocess else text

    encoding = tokenizer(
        model_input,
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        prediction = torch.argmax(logits, dim=1).item()

    return {
        'text': text,  # report the caller's original text, not the cleaned form
        'prediction': prediction,
        'confidence': probs[0][prediction].item(),
        'result': 'Hate Speech' if prediction == 1 else 'Not Hate Speech'
    }

In [16]:
def main():
    """Run the whole pipeline: load data, fine-tune, evaluate, print sample predictions."""
    # Load the dataset
    file_path = '/kaggle/input/main-dataset/main_dataset.csv'  # Replace with your dataset path
    df = load_data(file_path)

    # Hold out 20% for testing, then 10% of the remainder for validation
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

    print(f"Training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")

    # Pretrained checkpoint with a fresh two-class classification head
    model_name = "ai4bharat/indic-bert"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )

    def make_dataset(frame):
        """Wrap one split's text/category columns in a TanglishDataset."""
        return TanglishDataset(
            texts=frame['text'].values,
            labels=frame['category'].values,
            tokenizer=tokenizer
        )

    train_dataset = make_dataset(train_df)
    val_dataset = make_dataset(val_df)
    test_dataset = make_dataset(test_df)

    # Only the training loader shuffles
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)

    # Train model
    print("Training model...")
    model = train_model(model, train_loader, val_loader, device, epochs=5)

    # train_model returns the last-epoch weights; restore the best checkpoint
    model.load_state_dict(torch.load('best_tanglish_model.pt'))

    # Evaluate on test set
    print("\nEvaluating on test set...")
    evaluate_model(model, test_loader, device)

    # Example prediction
    print("\nExample predictions:")
    sample_texts = [
        "Intha post romba useful iruku, thanks for sharing!",  # Expected: Not Hate
        "Nee oru useless paiyan, engaluku vendam"  # Expected: Hate
    ]

    for sample in sample_texts:
        result = predict_hate_speech(sample, model, tokenizer, device)
        print(f"Text: {result['text']}")
        print(f"Prediction: {result['result']} (Confidence: {result['confidence']:.4f})")
        print()

In [21]:
# Run the full train / evaluate / predict pipeline when this cell (or the
# exported script) is executed directly.
if __name__ == "__main__":
    main()

Training set size: 9540
Validation set size: 1061
Test set size: 2651


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Training model...
Epoch 1/5:
  Train Loss: 0.4930
  Val Accuracy: 0.8615

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.93      0.88       605
           1       0.90      0.77      0.83       456

    accuracy                           0.86      1061
   macro avg       0.87      0.85      0.86      1061
weighted avg       0.86      0.86      0.86      1061

Epoch 2/5:
  Train Loss: 0.2799
  Val Accuracy: 0.8483
Epoch 3/5:
  Train Loss: 0.2088
  Val Accuracy: 0.8897

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       605
           1       0.94      0.79      0.86       456

    accuracy                           0.89      1061
   macro avg       0.90      0.88      0.88      1061
weighted avg       0.90      0.89      0.89      1061

Epoch 4/5:
  Train Loss: 0.1442
  Val Accuracy: 0.9095

Classification Report:
              pre

In [None]:
# Hugging Face Hub repository for the fine-tuned weights: ratish03/indic-BERT-Classification

In [27]:
import os

# SECURITY: never hard-code access tokens in a notebook. The Hugging Face
# token is read from the HF_TOKEN environment variable instead (e.g. a Kaggle
# secret exported into the environment). `a` is None when the variable is
# unset. The token that used to be written on this line has been exposed and
# should be revoked in the Hugging Face account settings.
a = os.environ.get("HF_TOKEN")

In [28]:
from huggingface_hub import HfApi, HfFolder,login

# Destination repository on the Hugging Face Hub
username = "ratish03"
repo_name = "indic-BERT-Classification"

api = HfApi()
token = a

# Authenticate this session with the access token
login(token=token)

# Push the best checkpoint written during training to the repository root
api.upload_file(
    repo_id=f"{username}/{repo_name}",
    path_in_repo="best_tanglish_model.pt",
    path_or_fileobj="/kaggle/working/best_tanglish_model.pt",
    token=token,
)

print("Files uploaded successfully")

best_tanglish_model.pt:   0%|          | 0.00/134M [00:00<?, ?B/s]

Files uploaded successfully


In [1]:
from huggingface_hub import snapshot_download


In [3]:
from huggingface_hub import hf_hub_download

# Hub repository that holds the fine-tuned checkpoint
repo_id = "ratish03/indic-BERT-Classification"

# Download the checkpoint file from the repository; hf_hub_download returns
# the path of the local copy, which is printed below.
model_path = hf_hub_download(repo_id=repo_id, filename="best_tanglish_model.pt")
print(f"Model downloaded to: {model_path}")


Model downloaded to: /root/.cache/huggingface/hub/models--ratish03--indic-BERT-Classification/snapshots/a8f57f4af295c391ff35cbe27536790956eb5a8d/best_tanglish_model.pt


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "ai4bharat/indic-bert"

# Load the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the base architecture with a freshly initialised 2-class head; the
# fine-tuned weights are loaded over it in the next step.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Fetch the fine-tuned state dict from the Hugging Face Hub. map_location="cpu"
# lets a checkpoint that was saved from a GPU run load on a CPU-only machine;
# the model is moved to the target device afterwards.
model_path = "ratish03/indic-BERT-Classification"  # Hub repo id holding the checkpoint
model.load_state_dict(
    torch.hub.load_state_dict_from_url(
        f"https://huggingface.co/{model_path}/resolve/main/best_tanglish_model.pt",
        map_location="cpu",
    )
)

# Move model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading: "https://huggingface.co/ratish03/indic-BERT-Classification/resolve/main/best_tanglish_model.pt" to /root/.cache/torch/hub/checkpoints/best_tanglish_model.pt
100%|██████████| 128M/128M [00:02<00:00, 55.9MB/s] 


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features

In [19]:
# Spot-check the loaded classifier on a handful of single Tanglish words and
# print the predicted label with its softmax confidence for each one.
sample_texts = ["thevidiyaaaan","mutti","thampi","theriyum","okkanum","kuthi","koothiya"]

for text in sample_texts:
    result = predict_hate_speech(text, model, tokenizer, device)
    print(f"Text: {result['text']}")
    print(f"Prediction: {result['result']} (Confidence: {result['confidence']:.4f})")
    print()

Text: thevidiyaaaan
Prediction: Hate Speech (Confidence: 0.9986)

Text: mutti
Prediction: Not Hate Speech (Confidence: 0.6793)

Text: thampi
Prediction: Not Hate Speech (Confidence: 0.9711)

Text: theriyum
Prediction: Not Hate Speech (Confidence: 0.7640)

Text: okkanum
Prediction: Hate Speech (Confidence: 0.7707)

Text: kuthi
Prediction: Hate Speech (Confidence: 0.9533)

Text: koothiya
Prediction: Hate Speech (Confidence: 0.9524)

