In [None]:
!pip install transformers datasets sentencepiece accelerate


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification

# Checkpoint names handed to `from_pretrained`.
mlm_model_name = "ai4bharat/IndicBERTv2-MLM-only"
# NOTE(review): cls_model_name is not referenced again in the visible cells — confirm it is still needed.
cls_model_name = "ai4bharat/IndicBERTv2-CLS"

# Tokenizer of the MLM checkpoint; later cells reuse this one object for every tokenization step.
tokenizer = AutoTokenizer.from_pretrained(mlm_model_name)


In [None]:
import pandas as pd
from datasets import Dataset

# Load the unlabeled corpus used for masked-LM adaptation.
# Rename column for convenience (modify if your column name differs)
unlabeled_df = pd.read_csv("/content/TestV2 - testV2.csv").rename(columns={"Text": "text"})

# Fail here, with a readable message, rather than with a KeyError deep inside
# `Dataset.map` when the tokenize function looks up batch["text"].
if "text" not in unlabeled_df.columns:
    raise KeyError(
        f"Expected a 'Text' or 'text' column, found: {list(unlabeled_df.columns)}"
    )

# Blank CSV cells are read as NaN (a float), which the tokenizer rejects.
# Drop those rows, force every remaining value to str, and rebuild a clean
# 0..n-1 index so `Dataset.from_pandas` does not add an extra index column.
unlabeled_df = (
    unlabeled_df
    .dropna(subset=["text"])
    .assign(text=lambda frame: frame["text"].astype(str))
    .reset_index(drop=True)
)

# Convert to HF Dataset
unlabeled_dataset = Dataset.from_pandas(unlabeled_df)
unlabeled_dataset


In [None]:
def tokenize_function(batch):
    """Tokenize the "text" entry of a batch.

    `batch` is the mapping supplied by `Dataset.map(..., batched=True)`.
    Sequences are truncated and padded to `max_length=128`.
    Returns whatever the module-level `tokenizer` returns for that batch.
    """
    fixed_length_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(batch["text"], **fixed_length_options)


# Tokenize the whole unlabeled dataset in batches.
tokenized_unlabeled = unlabeled_dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-LM objective, configured with a masking probability of 0.15.
mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)


In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForMaskedLM

# Stage 1: continue masked-LM training of the base checkpoint on the unlabeled corpus.
mlm_model = AutoModelForMaskedLM.from_pretrained(mlm_model_name)

# Evaluation is switched off for this stage; no eval dataset is passed to the Trainer below.
mlm_training_args = TrainingArguments(
    output_dir="/content/mlm_finetuned",
    eval_strategy="no",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=500,
    save_steps=5000,
)

mlm_trainer = Trainer(
    args=mlm_training_args,
    model=mlm_model,
    data_collator=mlm_collator,
    train_dataset=tokenized_unlabeled,
)

mlm_trainer.train()

# The classification stage later loads its starting weights from this directory.
mlm_trainer.save_model("/content/mlm_tamil_adapted")


Re-run from step 7 below.

In [None]:
import pandas as pd
from datasets import Dataset

# Load the labelled training data.
labeled_df = pd.read_csv("/content/trainV2.csv")

# Rename columns to the names the rest of the notebook expects.
labeled_df = labeled_df.rename(columns={"Text": "text", "Class": "label"})

# Normalise the raw label strings. They are kept in their own Series so that
# values missing from the mapping can still be reported by name afterwards.
raw_labels = labeled_df["label"].astype(str).str.strip()

# Mapping based on your actual labels
label_map = {
    "Non-Abusive": 0,
    "Abusive": 1,
    "abusive": 1
}

# Apply mapping (values absent from label_map become NaN).
encoded_labels = raw_labels.map(label_map)

# Report the ORIGINAL strings that failed to map (should be empty). Reading
# them back from the already-mapped column could only ever show NaN.
unmapped_values = raw_labels[encoded_labels.isna()].unique()
print("Unmapped labels:", unmapped_values)
if len(unmapped_values) > 0:
    # Stop with an actionable message instead of an opaque NaN-to-int cast error.
    raise ValueError(
        f"Labels missing from label_map: {list(unmapped_values)}. "
        "Extend label_map or clean the 'Class' column before continuing."
    )

# Every row mapped, so the integer cast is safe.
labeled_df["label"] = encoded_labels.astype(int)

print(labeled_df.head())
print(labeled_df.dtypes)


In [None]:
# Wrap the cleaned labelled frame as an HF Dataset.
labeled_dataset = Dataset.from_pandas(labeled_df)


def tokenize_function(batch):
    """Tokenize the "text" entry of a batch (max_length=128, padded and truncated).

    Re-declared in this cell, so the cell does not depend on an earlier
    definition of the same name. Relies on the module-level `tokenizer`.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128, padding="max_length")
    return encoded


# Batched tokenization of every labelled row.
tokenized_labeled = labeled_dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import AutoModelForSequenceClassification

# Stage 2: build a two-class classifier from the checkpoint directory written by the MLM stage.
cls_model = AutoModelForSequenceClassification.from_pretrained("/content/mlm_tamil_adapted", num_labels=2)

# Declare the task as single-label classification so that CrossEntropyLoss is used.
cls_model.config.problem_type = "single_label_classification"


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the classification fine-tuning run.
cls_training_args = TrainingArguments(
    output_dir="/content/tamil_abuse_classifier",
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=200,
    eval_strategy="epoch",  # ⭐ new API name (not `evaluation_strategy`)
)


In [None]:
from transformers import Trainer

# Train on 80% of the labelled rows and evaluate on the remaining 20%.
# Training and evaluating on the same full dataset reports inflated scores and
# leaves no unseen rows for the final evaluation.
from sklearn.model_selection import train_test_split

# These are the same split arguments (test_size, random_state, stratify) as
# the `train_test_split` call in the evaluation part of this notebook. The
# split is deterministic, so the rows held out here are exactly the rows that
# later end up in `test_df`, and the classifier never trains on them.
cls_train_df, cls_eval_df = train_test_split(
    labeled_df,
    test_size=0.2,
    random_state=42,
    stratify=labeled_df["label"],
)

# preserve_index=False keeps the shuffled pandas index out of the datasets.
tokenized_cls_train = Dataset.from_pandas(cls_train_df, preserve_index=False).map(
    tokenize_function, batched=True
)
tokenized_cls_eval = Dataset.from_pandas(cls_eval_df, preserve_index=False).map(
    tokenize_function, batched=True
)

cls_trainer = Trainer(
    model=cls_model,
    args=cls_training_args,
    train_dataset=tokenized_cls_train,
    eval_dataset=tokenized_cls_eval,   # held-out rows only
)

cls_trainer.train()



In [None]:
# Save the fine-tuned classifier through the trainer.
cls_trainer.save_model("/content/tamil_abusive_classifier_final")
# Save the tokenizer into the same directory as the model.
tokenizer.save_pretrained("/content/tamil_abusive_classifier_final")


In [None]:
from transformers import pipeline

# Smoke test: load the saved classifier into a text-classification pipeline.
classifier = pipeline(
    "text-classification",
    model="/content/tamil_abusive_classifier_final",
    tokenizer=tokenizer
)

# The sample sentence had been stored as mojibake (UTF-8 bytes decoded with the
# wrong code page), so the model was scoring garbage. Restored to the Tamil text.
print(classifier("இது என்னடா சொல்றே"))


In [None]:
!pip install scikit-learn matplotlib


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the labelled frame with a fixed seed.
# NOTE(review): test_df is only a true hold-out if these rows were excluded
# from classifier training — confirm against the training cell.
train_df, test_df = train_test_split(
    labeled_df,
    stratify=labeled_df["label"],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Wrap the held-out frame as an HF Dataset and tokenize it with the same
# `tokenize_function` that was applied to the labelled training data.
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification, Trainer

def evaluate_model(model_path, model_name="MODEL", dataset=None, true_labels=None):
    """Score a saved two-class classifier and plot its confusion matrix.

    Args:
        model_path: Directory (or checkpoint name) passed to
            `AutoModelForSequenceClassification.from_pretrained`.
        model_name: Label used in the printed report and the plot title.
        dataset: Tokenized dataset to predict on. Defaults to the notebook-level
            `tokenized_test`, so existing two-argument calls keep working.
        true_labels: Ground-truth class ids (0 or 1) aligned with `dataset`.
            Defaults to the notebook-level `test_df["label"]`.

    Raises:
        ValueError: if the number of predictions and labels differ.

    Prints accuracy, precision, recall and F1 (positive class = 1) and shows a
    confusion-matrix heatmap. Returns None.
    """
    # Fall back to the notebook globals the original implementation used.
    if dataset is None:
        dataset = tokenized_test
    if true_labels is None:
        true_labels = test_df["label"]

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
    model.config.problem_type = "single_label_classification"

    # Trainer without tokenizer (new API); used here only as a batched prediction loop.
    trainer = Trainer(
        model=model
    )

    # Predict
    raw_preds = trainer.predict(dataset)
    preds = np.argmax(raw_preds.predictions, axis=1)
    true = np.asarray(true_labels)

    # Misaligned inputs would otherwise yield meaningless metrics or an opaque sklearn error.
    if len(true) != len(preds):
        raise ValueError(
            f"Got {len(preds)} predictions for {len(true)} labels; "
            "dataset and true_labels must describe the same rows."
        )

    # Compute metrics. zero_division=0 gives the same values as the default
    # but without a warning when no sample is predicted as the positive class.
    acc = accuracy_score(true, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true, preds, average='binary', zero_division=0
    )

    print("📌 Results for:", model_name)
    print("-------------------------------")
    print("Accuracy :", acc)
    print("Precision:", precision)
    print("Recall   :", recall)
    print("F1 Score :", f1)

    # Confusion matrix. Fixing labels=[0, 1] keeps it 2x2 even when a class is
    # absent from both `true` and `preds`, so it always matches the tick labels.
    class_names = ["Non-Abusive", "Abusive"]
    cm = confusion_matrix(true, preds, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title(f"Confusion Matrix: {model_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()


In [None]:
# Score the final classifier (MLM-adapted encoder + fine-tuned head) on the tokenized test set.
evaluate_model(
    model_path="/content/tamil_abusive_classifier_final",
    model_name="Pipeline Model (MLM + Classifier)",
)
