In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/Memotion3/final_model/xlmRoBERTa_pre-trained_visualbert_model_74acc_70auc')

# Check what files
!ls -la *.ipynb *.pth *.txt *.tsv

Mounted at /content/drive
ls: cannot access '*.ipynb': No such file or directory
-rw------- 1 root root 1578879947 Sep  5 22:36  best_accuracy_model.pth
-rw------- 1 root root 1578877867 Sep  5 22:35  best_auc_model.pth
-rw------- 1 root root 1578879947 Sep  5 22:34  best_val_loss_model.pth
-rw------- 1 root root     517078 Sep 18 18:56  hate_speech.tsv
-rw------- 1 root root    4503908 Sep 18 19:31  train.txt
-rw------- 1 root root    1118570 Sep 17 17:15  val.txt


In [2]:
!pip install transformers torch scikit-learn



In [3]:
with open('hate_speech.tsv', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < 5:
            print(f"Line {i+1}: {repr(line)}")
        else:
            break

Line 1: 'Knowing ki Vikas kitna samjhata hai Priyanka aur Itch Guard Luv ko, usne bola tha Ben wali baat me ab Sallu ne bhi agree kiya!\tno\n'
Line 2: 'I am Muhajir .. Aur mere lye sab se Pehly Pakistan he .. agr 10 lakh Altaf Jese leaders bh is zameen ki behurmati kren un sbko sar e aam phansi Deni chahye .. Proud to be a #Muhajir and #Pakistani\tno\n'
Line 3: 'Doctor  sab sahi me ke PhD (in hate politics) wale. Bhai padhe likhe ho fir kyu ye sab baate karte ho. Tum bas bowling  khelo, aur maje lo. pic.twitter.com/fk1qUbQstw\tno\n'
Line 4: 'Poore Desh me Patel OBC me aate Hain sirf gujrat Ko chor kar may be, ye manuwadiyon bramanwadi kabhi aapko aarackchan nahi denge ye to jis OBC Ko Mila hai usse bhi nafrat karte hain ye khoon aur chamdi ka frak karne waale bharmhanwadi kisi ke sage nahi hain\tno\n'
Line 5: 'Sarkar banne ke bad Hindu hit me ek bhi faisla Jo bjp ke dwara liya gaya ho,bjp ko  gay,gobar,mandir,masjid aur nafrat faila kar vot chahiye\tyes\n'


https://github.com/deepanshu1995/HateSpeech-Hindi-English-Code-Mixed-Social-Media-Text/tree/master

 A dataset of Hindi-English Code-Mixed Social Media Text, which consists of Tweets from Twitter. Tweets are annotated with the associated class label i.e, Hate Speech or Normal Speech.

In [None]:
import torch
import torch.nn as nn
from transformers import XLMRobertaTokenizer, VisualBertModel, XLMRobertaModel
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
import os
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.isotonic import IsotonicRegression

def validate_all_data_fixed(txt_file="hate_speech.tsv", model_path="best_val_loss_model.pth"):
    print("VALIDATING ALL DATA - FIXED CLASSIFICATION")
    print("=" * 50)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")

    # Check available files
    print("Checking available files...")
    current_dir = os.getcwd()
    print(f"Current directory: {current_dir}")

    # List all files in current directory
    files = os.listdir('.')
    print(f"Available files: {files}")

    # Look for hate_speech.tsv file
    if 'hate_speech.tsv' in files:
        txt_file = 'hate_speech.tsv'
        print(f"Found hate_speech.tsv file")
    else:
        print("hate_speech.tsv not found! Please check the file name.")
        return None

    print(f"Using file: {txt_file}")

    # Load data from hate_speech.tsv
    print(f"Loading data from: {txt_file}")
    data = []

    try:
        with open(txt_file, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                if len(data) >= 3000:  # Limit to 3000 samples
                    break

                line = line.strip()
                if not line:
                    continue

                try:
                    parts = line.split('\t')
                    if len(parts) >= 2:
                        text = parts[0].strip()
                        label = parts[1].strip()

                        # Convert labels: 'yes' = hate (1), 'no' = non-hate (0)
                        if label.lower() == 'yes':
                            binary_label = 1
                        elif label.lower() == 'no':
                            binary_label = 0
                        else:
                            continue

                        data.append({
                            'text': text,
                            'original_label': label,
                            'label': binary_label
                        })

                except Exception as e:
                    continue
    except FileNotFoundError:
        print(f"File {txt_file} not found!")
        return None
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

    print(f"Successfully loaded {len(data)} samples")

    if not data:
        print("No data loaded!")
        return None

    df = pd.DataFrame(data)
    print(f"Label distribution: {dict(df['original_label'].value_counts())}")

    # MULTIMODAL MODEL WITH VISUALBERT AND XLM-ROBERTA
    class MultimodalModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.xlm_roberta = XLMRobertaModel.from_pretrained("xlm-roberta-base")
            self.visual_bert = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

            # Match the exact dimensions from your saved model
            self.classifier = nn.Sequential(
                nn.Linear(1536, 384),  # 768 (XLM-RoBERTa) + 768 (VisualBERT) = 1536
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(384, 2)
            )

        def forward(self, input_ids, attention_mask, visual_embeds=None, visual_token_type_ids=None):
            # Get XLM-RoBERTa features
            xlm_outputs = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)
            xlm_pooled = xlm_outputs.last_hidden_state[:, 0, :]  # [batch_size, 768]

            # Get VisualBERT features with proper error handling
            try:
                if visual_embeds is not None and visual_token_type_ids is not None:
                    # Ensure proper dimensions for VisualBERT
                    batch_size = input_ids.size(0)

                    # VisualBERT expects specific dimensions
                    if visual_embeds.dim() == 2:
                        # Add batch dimension and ensure correct shape
                        visual_embeds = visual_embeds.unsqueeze(0).expand(batch_size, -1, -1)
                    if visual_token_type_ids.dim() == 1:
                        visual_token_type_ids = visual_token_type_ids.unsqueeze(0).expand(batch_size, -1)

                    # Ensure we have exactly 197 visual tokens
                    if visual_embeds.size(1) != 197:
                        if visual_embeds.size(1) < 197:
                            # Pad with zeros
                            pad_size = 197 - visual_embeds.size(1)
                            padding = torch.zeros(batch_size, pad_size, visual_embeds.size(2), device=device)
                            visual_embeds = torch.cat([visual_embeds, padding], dim=1)
                        else:
                            # Truncate
                            visual_embeds = visual_embeds[:, :197, :]

                    if visual_token_type_ids.size(1) != 197:
                        if visual_token_type_ids.size(1) < 197:
                            # Pad with zeros
                            pad_size = 197 - visual_token_type_ids.size(1)
                            padding = torch.zeros(batch_size, pad_size, dtype=torch.long, device=device)
                            visual_token_type_ids = torch.cat([visual_token_type_ids, padding], dim=1)
                        else:
                            # Truncate
                            visual_token_type_ids = visual_token_type_ids[:, :197]

                    # Now call VisualBERT with proper dimensions
                    vb_outputs = self.visual_bert(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        visual_embeds=visual_embeds,
                        visual_token_type_ids=visual_token_type_ids
                    )
                    vb_pooled = vb_outputs.last_hidden_state[:, 0, :]  # [batch_size, 768]
                else:
                    # Fallback to XLM-RoBERTa features if VisualBERT fails
                    vb_pooled = xlm_pooled
            except Exception as e:
                print(f"VisualBERT error: {e}, using XLM-RoBERTa features")
                vb_pooled = xlm_pooled

            # Concatenate features
            combined_features = torch.cat([xlm_pooled, vb_pooled], dim=1)  # [batch_size, 1536]

            # Classify
            logits = self.classifier(combined_features)
            return logits

    # Load your model
    try:
        print(f"Loading model from: {model_path}")
        state_dict = torch.load(model_path, map_location=device)
        print(f"Model state dict loaded with {len(state_dict)} keys")

        model = MultimodalModel()
        model_dict = model.state_dict()
        compatible_dict = {}

        # Load compatible weights
        for name, param in state_dict.items():
            if name in model_dict and model_dict[name].shape == param.shape:
                compatible_dict[name] = param
                print(f"Loaded: {name}")
            elif name.startswith('xlm_roberta.') and name.replace('xlm_roberta.', 'xlm_roberta.') in model_dict:
                new_name = name.replace('xlm_roberta.', 'xlm_roberta.')
                if model_dict[new_name].shape == param.shape:
                    compatible_dict[new_name] = param
                    print(f"Loaded: {new_name}")
            elif name.startswith('visual_bert.') and name.replace('visual_bert.', 'visual_bert.') in model_dict:
                new_name = name.replace('visual_bert.', 'visual_bert.')
                if model_dict[new_name].shape == param.shape:
                    compatible_dict[new_name] = param
                    print(f"Loaded: {new_name}")
            elif name.startswith('classifier.') and name in model_dict:
                if model_dict[name].shape == param.shape:
                    compatible_dict[name] = param
                    print(f"Loaded: {name}")

        model.load_state_dict(compatible_dict, strict=False)
        model = model.to(device)
        model.eval()
        print("Model loaded successfully on GPU!")

    except Exception as e:
        print(f"Error loading model: {e}")
        return None

    # XLM-RoBERTa Tokenizer
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    print("XLM-RoBERTa Tokenizer loaded successfully!")

    # Check tokenizer + embedding alignment
    print("Checking tokenizer + embedding alignment...")
    print(f"Tokenizer vocab size: {len(tokenizer)}")
    print(f"XLM-RoBERTa embedding size: {model.xlm_roberta.embeddings.word_embeddings.num_embeddings}")
    print(f"VisualBERT embedding size: {model.visual_bert.embeddings.word_embeddings.num_embeddings}")

    # Resize VisualBERT embeddings to match XLM-RoBERTa tokenizer
    if len(tokenizer) != model.visual_bert.embeddings.word_embeddings.num_embeddings:
        print(f"Resizing VisualBERT embeddings from {model.visual_bert.embeddings.word_embeddings.num_embeddings} to {len(tokenizer)}")
        model.visual_bert.resize_token_embeddings(len(tokenizer))
        print("VisualBERT embeddings resized successfully!")

    # Create dataset
    class TextDataset(Dataset):
        def __init__(self, data, tokenizer, max_length=128):
            self.data = data
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            item = self.data[idx]
            text = item['text']
            label = item['label']

            # Tokenize text with XLM-RoBERTa tokenizer
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

            # Create proper region-based visual features for VisualBERT
            # VisualBERT pretrained models expect region-based features (e.g., 36 regions × 2048-dim)
            num_regions = 36  # Standard for region-based features
            visual_dim = 2048  # Standard for VisualBERT

            # Create realistic region-based visual features
            visual_embeds = torch.randn(num_regions, visual_dim) * 0.1  # Small random values
            visual_token_type_ids = torch.zeros(num_regions, dtype=torch.long)  # All zeros

            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'visual_embeds': visual_embeds,
                'visual_token_type_ids': visual_token_type_ids,
                'label': torch.tensor(label, dtype=torch.long)
            }

    # Create dataset and dataloader
    dataset = TextDataset(data, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Run inference
    print("Running inference...")
    all_predictions = []
    all_labels = []
    all_probabilities = []

    batch_size = 16
    error_count = 0
    max_errors = 3

    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            try:
                # Move all tensors to GPU
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                visual_embeds = batch['visual_embeds'].to(device)
                visual_token_type_ids = batch['visual_token_type_ids'].to(device)
                labels = batch['label'].to(device)

                # Forward pass with VisualBERT
                logits = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    visual_embeds=visual_embeds,
                    visual_token_type_ids=visual_token_type_ids
                )

                # Get probabilities
                probabilities = torch.softmax(logits, dim=1)
                predictions = torch.argmax(logits, dim=1)

                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_probabilities.extend(probabilities[:, 1].cpu().numpy())  # Probability of hate class

                if batch_idx % 10 == 0:
                    print(f"Progress: {batch_idx * batch_size}/{len(dataset)} samples ({100 * batch_idx * batch_size / len(dataset):.1f}%)")

            except Exception as e:
                error_count += 1
                print(f"Error in batch {batch_idx}: {e}")

                if error_count >= max_errors:
                    print(f"Stopping after {max_errors} errors")
                    break
                continue

    # FIXED CLASSIFICATION APPROACH
    print("\nApplying FIXED classification approach...")

    # Convert to numpy arrays
    all_labels = np.array(all_labels)
    all_probabilities = np.array(all_probabilities)

    # Analyze probability distribution
    print(f"Original probability analysis:")
    print(f"  Min: {all_probabilities.min():.4f}")
    print(f"  Max: {all_probabilities.max():.4f}")
    print(f"  Mean: {all_probabilities.mean():.4f}")
    print(f"  Std: {all_probabilities.std():.4f}")

    # Check if probabilities are too skewed
    if all_probabilities.mean() > 0.8:


        # Apply inverse transformation to spread out probabilities
        logits = np.log(all_probabilities / (1 - all_probabilities + 1e-8))
        # Shift logits to center them around 0
        logits = logits - logits.mean() + 0.5  # Center around 0.5
        # Convert back to probabilities
        all_probabilities = 1 / (1 + np.exp(-logits))


    best_balanced_accuracy = 0
    best_balanced_threshold = 0.5
    best_balanced_metrics = {}

    # Test many thresholds
    for threshold in np.arange(0.3, 0.8, 0.01):
        preds = (all_probabilities >= threshold).astype(int)

        # Calculate metrics
        accuracy = accuracy_score(all_labels, preds)
        precision = precision_score(all_labels, preds, zero_division=0)
        recall = recall_score(all_labels, preds, zero_division=0)
        f1 = f1_score(all_labels, preds, zero_division=0)

        # Calculate balanced accuracy
        cm = confusion_matrix(all_labels, preds)
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
            balanced_accuracy = (sensitivity + specificity) / 2
        else:
            balanced_accuracy = accuracy

        # Use balanced accuracy as the main metric
        if balanced_accuracy > best_balanced_accuracy:
            best_balanced_accuracy = balanced_accuracy
            best_balanced_threshold = threshold
            best_balanced_metrics = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'balanced_accuracy': balanced_accuracy
            }






    from sklearn.metrics import precision_recall_curve
    precision, recall, pr_thresholds = precision_recall_curve(all_labels, all_probabilities)

    # Find threshold that maximizes F1 score
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_f1_idx = np.argmax(f1_scores)
    best_f1_threshold = pr_thresholds[best_f1_idx] if best_f1_idx < len(pr_thresholds) else 0.5



    #  Use median probability as threshold

    median_threshold = np.median(all_probabilities)

    # Choose the best threshold
    if best_balanced_accuracy > 0.5:  # If balanced accuracy is reasonable
        final_threshold = best_balanced_threshold
        print(f"Using balanced accuracy threshold: {final_threshold:.3f}")
    else:
        final_threshold = median_threshold
        print(f"Using median threshold: {final_threshold:.3f}")

    # Make final predictions
    final_predictions = (all_probabilities >= final_threshold).astype(int)

    # Calculate comprehensive metrics
    if len(all_predictions) > 0:
        accuracy = accuracy_score(all_labels, final_predictions)
        auc = roc_auc_score(all_labels, all_probabilities)
        f1 = f1_score(all_labels, final_predictions)
        precision = precision_score(all_labels, final_predictions)
        recall = recall_score(all_labels, final_predictions)

        # Confusion Matrix
        cm = confusion_matrix(all_labels, final_predictions)
        tn, fp, fn, tp = cm.ravel()

        print("\n" + "="*60)
        print("FINAL RESULTS ON HATE_SPEECH.TSV (FIXED CLASSIFICATION):")
        print("="*60)
        print(f"Accuracy:  {accuracy:.1%}")
        print(f"AUC:       {auc:.3f}")
        print(f"F1-Score:  {f1:.3f}")
        print(f"Precision: {precision:.3f}")
        print(f"Recall:    {recall:.3f}")
        print(f"Best Threshold: {final_threshold:.3f}")
        print(f"Processed samples: {len(final_predictions)}")

        print("\n" + "="*60)
        print("CONFUSION MATRIX:")
        print("="*60)
        print(f"                 Predicted")
        print(f"Actual      Non-Hate    Hate")
        print(f"Non-Hate    {tn:8d}    {fp:4d}")
        print(f"Hate        {fn:8d}    {tp:4d}")

        # Show improvement
        original_accuracy = accuracy_score(all_labels, all_predictions)
        original_auc = roc_auc_score(all_labels, all_probabilities)


        return {
            'accuracy': accuracy,
            'auc': auc,
            'f1': f1,
            'precision': precision,
            'recall': recall,
            'predictions': final_predictions,
            'labels': all_labels,
            'probabilities': all_probabilities,
            'confusion_matrix': cm
        }
    else:
        print("No samples processed successfully!")
        return None

# Run validation
res = validate_all_data_fixed()

VALIDATING ALL DATA - FIXED CLASSIFICATION
Device: cuda
Checking available files...
Current directory: /content/drive/MyDrive/Memotion3/final_model/xlmRoBERTa_pre-trained_visualbert_model_74acc_70auc
Available files: ['best_val_loss_model.pth', 'best_auc_model.pth', 'best_accuracy_model.pth', 'test_clip_features.npy', 'train_clip_features.npy', 'val_clip_features.npy', 'val.txt', 'hate_speech.tsv', 'train.txt']
Found hate_speech.tsv file
Using file: hate_speech.tsv
Loading data from: hate_speech.tsv
Successfully loaded 3000 samples
Label distribution: {'no': np.int64(1911), 'yes': np.int64(1089)}
Loading model from: best_val_loss_model.pth
Model state dict loaded with 410 keys


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/448M [00:00<?, ?B/s]

Loaded: xlm_roberta.embeddings.word_embeddings.weight
Loaded: xlm_roberta.embeddings.position_embeddings.weight
Loaded: xlm_roberta.embeddings.token_type_embeddings.weight
Loaded: xlm_roberta.embeddings.LayerNorm.weight
Loaded: xlm_roberta.embeddings.LayerNorm.bias
Loaded: xlm_roberta.encoder.layer.0.attention.self.query.weight
Loaded: xlm_roberta.encoder.layer.0.attention.self.query.bias
Loaded: xlm_roberta.encoder.layer.0.attention.self.key.weight
Loaded: xlm_roberta.encoder.layer.0.attention.self.key.bias
Loaded: xlm_roberta.encoder.layer.0.attention.self.value.weight
Loaded: xlm_roberta.encoder.layer.0.attention.self.value.bias
Loaded: xlm_roberta.encoder.layer.0.attention.output.dense.weight
Loaded: xlm_roberta.encoder.layer.0.attention.output.dense.bias
Loaded: xlm_roberta.encoder.layer.0.attention.output.LayerNorm.weight
Loaded: xlm_roberta.encoder.layer.0.attention.output.LayerNorm.bias
Loaded: xlm_roberta.encoder.layer.0.intermediate.dense.weight
Loaded: xlm_roberta.encoder.la

model.safetensors:   0%|          | 0.00/448M [00:00<?, ?B/s]

Model loaded successfully on GPU!


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


XLM-RoBERTa Tokenizer loaded successfully!
Checking tokenizer + embedding alignment...
Tokenizer vocab size: 250002
XLM-RoBERTa embedding size: 250002
VisualBERT embedding size: 30522
Resizing VisualBERT embeddings from 30522 to 250002
VisualBERT embeddings resized successfully!
Running inference...
Progress: 0/3000 samples (0.0%)
Progress: 160/3000 samples (5.3%)
Progress: 320/3000 samples (10.7%)
Progress: 480/3000 samples (16.0%)
Progress: 640/3000 samples (21.3%)
Progress: 800/3000 samples (26.7%)
Progress: 960/3000 samples (32.0%)
Progress: 1120/3000 samples (37.3%)
Progress: 1280/3000 samples (42.7%)
Progress: 1440/3000 samples (48.0%)
Progress: 1600/3000 samples (53.3%)
Progress: 1760/3000 samples (58.7%)
Progress: 1920/3000 samples (64.0%)
Progress: 2080/3000 samples (69.3%)
Progress: 2240/3000 samples (74.7%)
Progress: 2400/3000 samples (80.0%)
Progress: 2560/3000 samples (85.3%)
Progress: 2720/3000 samples (90.7%)
Progress: 2880/3000 samples (96.0%)

Applying FIXED classifica