In [None]:
!pip install transformers datasets scikit-learn pandas

import torch
print("GPU available:", torch.cuda.is_available())


GPU available: False


In [14]:

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import re

# Report whether PyTorch can see a CUDA device before doing any work.
print("GPU available:", torch.cuda.is_available())

# --- Load the labelled corpus ---
# The CSV is expected at /content/dataset-merged.csv; only the "text" and "label"
# columns are read.
csv_path = "/content/dataset-merged.csv"
wanted_columns = ["text", "label"]
try:
    df = pd.read_csv(csv_path, encoding="utf-8", usecols=wanted_columns)
except UnicodeDecodeError:
    # The file is not valid UTF-8: retry with latin1.
    # NOTE(review): latin1 maps every byte to a character, so this retry cannot fail
    # on decoding; if it fires, non-ASCII (e.g. Devanagari) text is presumably
    # mis-decoded -- confirm the source encoding before relying on this path.
    print("utf-8 decoding failed, trying latin1")
    df = pd.read_csv(csv_path, encoding="latin1", usecols=wanted_columns)
    print("Successfully loaded with latin1")
else:
    print("Successfully loaded with utf-8")

# Drop rows missing either column, then store the labels as integers.
df = df.dropna()
df = df.assign(label=df["label"].astype(int))

# Data Cleaning Function (can be refined further if needed)
def clean_text(text):
    """Normalise one raw text sample before tokenization.

    Steps, in order:
      1. coerce to ``str`` and lowercase;
      2. remove HTML tags;
      3. remove URLs (anything starting with ``http`` or ``www`` up to the next
         whitespace);
      4. delete every character that is not an ASCII letter, whitespace, or in the
         Devanagari block U+0900-U+097F (so ASCII digits and punctuation are removed);
      5. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string; may be empty if nothing survives the filters.
    """
    # Ensure we are working with a lowercase string.
    text = str(text).lower()
    # Remove HTML tags FIRST. The URL pattern below consumes everything up to the next
    # whitespace, so running it before this step on `<a href="http://x">link</a>` would
    # swallow the tag's closing `>` and leave "a href" fragments in the output.
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs. `http\S+` already covers https links, and the pattern has no
    # ^/$ anchors, so no extra alternative or MULTILINE flag is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Keep only ASCII letters, whitespace and Devanagari (U+0900-U+097F); everything
    # else -- including ASCII digits and punctuation -- is deleted.
    text = re.sub(r'[^a-zA-Z\s\u0900-\u097F]', '', text)
    # Collapse runs of whitespace and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run every sample in the text column through clean_text.
df = df.assign(text=df["text"].apply(clean_text))

# Cleaning can leave a sample empty; keep only rows that still have content.
has_content = df["text"].str.strip().ne("")
df = df.loc[has_content]


# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# --- Use a Hindi-Specific Model ---
model_name = "l3cube-pune/hindi-bert-v2"  # A BERT model pre-trained on Hindi

try:
    # Load the tokenizer that belongs to the chosen checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(batch):
        """Tokenize the "text" field of a batch, truncating to at most 256 tokens.

        No padding is applied here; batches are padded later by the data collator.
        Adjust max_length if texts are longer.
        """
        return tokenizer(batch["text"], truncation=True, max_length=256)

    # Wrap the Python lists in Hugging Face Datasets and tokenize them in batches.
    train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(tokenize, batched=True)
    test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels}).map(tokenize, batched=True)

    # Load the pre-trained encoder with a 2-class classification head.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Pads each batch when it is assembled.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def compute_metrics(eval_pred):
        """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average="weighted")
        return {"accuracy": accuracy, "f1": f1}

    # The GPU and CPU configurations differ only in mixed precision, so build the
    # arguments once and toggle fp16 on GPU availability instead of duplicating them.
    use_gpu = torch.cuda.is_available()
    if not use_gpu:
        print("GPU not available, training on CPU. This will be slow.")

    training_args = TrainingArguments(
        output_dir="./hindi_bert_model",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs_hindi_bert",
        learning_rate=2e-5,
        per_device_train_batch_size=8,        # Consider increasing
        per_device_eval_batch_size=8,         # Consider increasing
        num_train_epochs=6,
        weight_decay=0.01,
        save_total_limit=2,
        load_best_model_at_end=True,          # restore the best checkpoint after training
        metric_for_best_model="accuracy",
        fp16=use_gpu,                         # mixed precision only when a GPU is present
        # gradient_accumulation_steps=2,      # Uncomment and adjust if needed
        report_to="none",                     # Disable reporting to external services
    )
    if use_gpu:
        # Move model to GPU
        model.to('cuda')

    # NOTE(review): test_dataset is used both for per-epoch checkpoint selection
    # (load_best_model_at_end) and for the final trainer.evaluate() below, so the
    # reported metrics are chosen on the same data they are measured on. Consider a
    # separate validation split for model selection.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        # `tokenizer=` is deprecated on Trainer in recent transformers releases;
        # `processing_class=` is its replacement.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    print(f"Starting training with {model_name}...")
    trainer.train()
    print("Training finished.")

    # Evaluate the model
    print(f"\nEvaluation results with {model_name}:")
    evaluation_results_hindi_bert = trainer.evaluate()
    print(evaluation_results_hindi_bert)

except Exception as e:
    print(f"An error occurred while using {model_name}: {e}")
    print("Please check if the model exists and is compatible, or try 'bert-base-multilingual-cased' again with more aggressive tuning.")
    # Re-raise so the full traceback is shown and later cells do not run against
    # missing or half-initialised objects; the hint above is only a guess at the cause.
    raise



GPU available: True
Successfully loaded with utf-8


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/11057 [00:00<?, ? examples/s]

Map:   0%|          | 0/2765 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hindi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training with l3cube-pune/hindi-bert-v2...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4269,0.351459,0.877758,0.875963
2,0.3036,0.315664,0.898011,0.896346
3,0.2285,0.365973,0.903436,0.902117
4,0.1686,0.441943,0.894756,0.893342
5,0.1192,0.562591,0.896564,0.895557
6,0.0733,0.602227,0.892948,0.892213


Training finished.

Evaluation results with l3cube-pune/hindi-bert-v2:


{'eval_loss': 0.36597296595573425, 'eval_accuracy': 0.9034358047016274, 'eval_f1': 0.9021165183680782, 'eval_runtime': 10.4405, 'eval_samples_per_second': 264.833, 'eval_steps_per_second': 33.14, 'epoch': 6.0}
