In [None]:
# Install required packages
!pip install -q transformers scikit-learn pandas torch tqdm

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, BertModel, RobertaModel
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from tqdm.auto import tqdm
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# DATA LOADING AND PREPARATION
# =============================================================================

def load_and_prepare_data(slang_path="/content/slang_OpenSub_filtered.tsv",
                          nonslang_path="/content/slang_OpenSub_negatives_filtered.tsv"):
    """Load the slang and non-slang TSV files and flatten them into parallel lists.

    Args:
        slang_path: Tab-separated file of slang sentences. Must contain the
            columns ``SENTENCE`` and ``ANNOTATOR_CONFIDENCE``.
        nonslang_path: Tab-separated file of non-slang sentences. Must contain
            the column ``SENTENCE``.

    Returns:
        A tuple ``(sentences, true_labels, confidence_levels)`` of equally long
        lists. Slang rows come first and carry label 1 and their annotator
        confidence as an int; non-slang rows carry label 0 and confidence 0.

    Raises:
        ValueError: If an ``ANNOTATOR_CONFIDENCE`` value cannot be converted to int.
    """
    print("📂 Loading data...")

    def _read_sentence_table(path):
        # dtype=str keeps every column textual, but empty cells still load as NaN.
        frame = pd.read_csv(path, sep="\t", dtype=str)
        missing = frame["SENTENCE"].isna()
        if missing.any():
            # A NaN sentence is a float rather than text, so it cannot be
            # tokenized later; drop such rows here instead of failing per model.
            print(f"⚠️  Dropping {int(missing.sum())} rows without a sentence from {path}")
            frame = frame.loc[~missing].reset_index(drop=True)
        return frame

    # Load both datasets
    df_slang = _read_sentence_table(slang_path)
    df_nonslang = _read_sentence_table(nonslang_path)

    # Add labels and confidence
    df_slang["label"] = 1  # slang
    df_slang["confidence"] = df_slang["ANNOTATOR_CONFIDENCE"].astype(int)

    df_nonslang["label"] = 0  # non-slang
    df_nonslang["confidence"] = 0  # non-slang doesn't have confidence ratings

    # Combine datasets (slang rows first, then non-slang rows)
    df_all = pd.concat([df_slang, df_nonslang], ignore_index=True)

    # Extract sentences and labels
    sentences = df_all["SENTENCE"].tolist()
    true_labels = df_all["label"].tolist()
    confidence_levels = df_all["confidence"].tolist()

    print(f"✅ Data loaded successfully!")
    print(f"📊 Total sentences: {len(sentences)}")
    print(f"📊 Slang sentences: {sum(true_labels)}")
    print(f"📊 Non-slang sentences: {len(true_labels) - sum(true_labels)}")

    # Print confidence distribution for slang sentences
    slang_confidence = df_slang["confidence"].value_counts().sort_index()
    print(f"📊 Slang confidence distribution:")
    for conf, count in slang_confidence.items():
        print(f"   Confidence {conf}: {count} sentences")

    return sentences, true_labels, confidence_levels

# =============================================================================
# REPRESENTATION EXTRACTION FOR DIFFERENT MODEL TYPES
# =============================================================================

def get_model_type(model_name):
    """Determine the model family from a checkpoint name for proper loading.

    Args:
        model_name: Checkpoint identifier, e.g. ``"roberta-large-mnli"``.
            Matching is case-insensitive.

    Returns:
        One of ``'deberta'``, ``'roberta'``, ``'bart'``, ``'bert'`` or
        ``'unknown'`` when no family name occurs in ``model_name``.
    """
    lowered = model_name.lower()
    # Order matters: "roberta" and "deberta" both contain the substring "bert",
    # so the more specific families must be tested before plain "bert".
    for family in ('deberta', 'roberta', 'bart', 'bert'):
        if family in lowered:
            return family
    return 'unknown'

def extract_representations(model_name, sentences, layer_index=-2):
    """Encode each sentence and return one mean-pooled hidden-state vector per sentence.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``; the
            model class is chosen from the family reported by ``get_model_type``.
        sentences: Iterable of sentences, tokenized and encoded one at a time.
        layer_index: Index into the model's ``hidden_states`` tuple
            (default -2, the second-to-last entry).

    Returns:
        ``np.array`` built from the per-sentence pooled vectors, in input order.
    """
    print(f"🔧 Loading model for representation extraction: {model_name}")

    detected_type = get_model_type(model_name)
    print(f"🔧 Detected model type: {detected_type}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Pick the model class for the detected family; anything unrecognised
    # falls back to AutoModel.
    if detected_type == 'bart':
        from transformers import BartModel
        model_cls = BartModel
    elif detected_type == 'bert':
        model_cls = BertModel
    elif detected_type == 'roberta':
        model_cls = RobertaModel
    elif detected_type == 'deberta':
        from transformers import DebertaV2Model
        model_cls = DebertaV2Model
    else:
        model_cls = AutoModel
    model = model_cls.from_pretrained(model_name)

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)
    model.eval()

    print(f"🔧 Model loaded on: {device}")
    print(f"🔧 Extracting representations from layer {layer_index}")

    # For BART only the encoder sub-module is run; every other family is
    # called as a whole.
    forward = model.encoder if detected_type == 'bart' else model

    pooled_vectors = []
    for text in tqdm(sentences, desc="Extracting representations"):
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        ).to(device)

        with torch.no_grad():
            layer_output = forward(**encoded, output_hidden_states=True).hidden_states[layer_index]
            # Average over the sequence dimension, then drop the singleton batch axis.
            pooled = layer_output.mean(dim=1).squeeze()

        pooled_vectors.append(pooled.cpu().numpy())

    return np.array(pooled_vectors)

# =============================================================================
# EVALUATION FUNCTIONS
# =============================================================================

def evaluate_model_representations(model_name, representations, true_labels, confidence_levels,
                                   random_state=42):
    """Evaluate representation-based classification with logistic regression.

    The data is shuffled, the first 70% is used to fit a ``LogisticRegression``
    and the remaining 30% to score it. Results are printed via
    ``print_model_results`` and returned.

    Args:
        model_name: Name stored in the result dict and shown in the printout.
        representations: Array indexable by an integer index array, one row
            per sentence.
        true_labels: Sequence of 0 (non-slang) / 1 (slang) labels.
        confidence_levels: Sequence of per-sentence confidence values; only
            levels 1, 2 and 3 on slang rows are reported.
        random_state: Seed for the train/test shuffle. The fixed default makes
            runs reproducible and gives every model the same split for inputs
            of the same length. Pass ``None`` for an unseeded shuffle.

    Returns:
        Dict with the keys ``model_name``, ``overall_accuracy``,
        ``slang_accuracy``, ``nonslang_accuracy``, ``confidence_metrics``,
        ``confusion_matrix``, ``classification_report``, ``predictions`` and
        ``balance_score``.
    """
    print(f"\n{'='*60}")
    print(f"🔍 Evaluating: {model_name}")
    print(f"{'='*60}")

    # Split data for training classifier. A seeded generator replaces the
    # global, unseeded np.random state so the split no longer differs between
    # models and between runs.
    n_train = int(0.7 * len(representations))
    rng = np.random.default_rng(random_state)
    indices = rng.permutation(len(representations))

    train_indices = indices[:n_train]
    test_indices = indices[n_train:]

    X_train = representations[train_indices]
    y_train = np.array(true_labels)[train_indices]
    X_test = representations[test_indices]
    y_test = np.array(true_labels)[test_indices]
    test_confidence = np.array(confidence_levels)[test_indices]

    # Train logistic regression
    print("🔧 Training Logistic Regression on internal representations...")
    clf = LogisticRegression(random_state=42, max_iter=1000)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Calculate overall metrics
    overall_accuracy = accuracy_score(y_test, predictions)

    # Calculate per-class metrics (accuracy restricted to one true class)
    slang_mask = y_test == 1
    nonslang_mask = y_test == 0

    slang_accuracy = accuracy_score(y_test[slang_mask], predictions[slang_mask]) if slang_mask.any() else 0
    nonslang_accuracy = accuracy_score(y_test[nonslang_mask], predictions[nonslang_mask]) if nonslang_mask.any() else 0

    # Calculate confidence-based metrics for slang sentences
    confidence_metrics = {}
    for conf_level in [1, 2, 3]:
        conf_mask = (y_test == 1) & (test_confidence == conf_level)
        if conf_mask.any():
            conf_accuracy = accuracy_score(y_test[conf_mask], predictions[conf_mask])
            total_sentences = conf_mask.sum()
            correct_predictions = (predictions[conf_mask] == y_test[conf_mask]).sum()
            confidence_metrics[conf_level] = {
                'accuracy': conf_accuracy,
                'total_sentences': total_sentences,
                'correct_predictions': correct_predictions
            }
        else:
            confidence_metrics[conf_level] = {
                'accuracy': 0.0,
                'total_sentences': 0,
                'correct_predictions': 0
            }

    # Generate confusion matrix and classification report. Explicit labels keep
    # the matrix 2x2 and both class rows present even if one class is absent
    # from the test split or the predictions.
    class_labels = [0, 1]
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    class_report = classification_report(
        y_test,
        predictions,
        labels=class_labels,
        target_names=['Non-Slang', 'Slang'],
        output_dict=True
    )

    results = {
        'model_name': model_name,
        'overall_accuracy': overall_accuracy,
        'slang_accuracy': slang_accuracy,
        'nonslang_accuracy': nonslang_accuracy,
        'confidence_metrics': confidence_metrics,
        'confusion_matrix': cm,
        'classification_report': class_report,
        'predictions': predictions,
        # Mean of the two per-class accuracies
        'balance_score': (slang_accuracy + nonslang_accuracy) / 2
    }

    print_model_results(results)
    return results

def print_model_results(results):
    """Print the full metric breakdown stored in one model's result dict.

    Args:
        results: Dict with the keys ``model_name``, ``overall_accuracy``,
            ``slang_accuracy``, ``nonslang_accuracy``, ``balance_score``,
            ``confidence_metrics`` (levels 1-3), ``confusion_matrix``
            (indexed as ``cm[row, col]``) and ``classification_report``.
    """
    print(f"\n📈 RESULTS FOR: {results['model_name']}")
    print("-" * 50)

    # Headline accuracies, one line each
    headline = (
        ("🎯 Overall Accuracy", 'overall_accuracy'),
        ("🔥 Slang Accuracy", 'slang_accuracy'),
        ("❄️  Non-Slang Accuracy", 'nonslang_accuracy'),
        ("⚖️  Balance Score", 'balance_score'),
    )
    for caption, key in headline:
        print(f"{caption}: {results[key]:.4f}")

    print(f"\n📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):")
    for level in (1, 2, 3):
        stats = results['confidence_metrics'][level]
        if stats['total_sentences'] <= 0:
            print(f"   Confidence {level}: No sentences found")
            continue
        print(f"   Confidence {level}: {stats['accuracy']:.4f} "
              f"({stats['correct_predictions']}/{stats['total_sentences']} sentences)")

    print(f"\n📋 CONFUSION MATRIX:")
    matrix = results['confusion_matrix']
    print(f"                Predicted")
    print(f"              Non-Slang  Slang")
    # Row 0 holds the true non-slang sentences, row 1 the true slang sentences
    for row_caption, row_idx in (("Actual Non-Slang  ", 0), ("       Slang      ", 1)):
        print(f"{row_caption}{matrix[row_idx, 0]:4d}    {matrix[row_idx, 1]:4d}")

    report = results['classification_report']
    print(f"\n📋 DETAILED CLASSIFICATION REPORT:")
    print(f"              Precision  Recall  F1-Score  Support")
    report_rows = (
        ("Non-Slang", 'Non-Slang'),
        ("Slang", 'Slang'),
        ("Macro Avg", 'macro avg'),
        ("Weighted Avg", 'weighted avg'),
    )
    for caption, key in report_rows:
        row = report[key]
        # Captions are left-padded to 17 characters so the columns line up
        print(f"{caption:<17}{row['precision']:.3f}   {row['recall']:.3f}    "
              f"{row['f1-score']:.3f}     {row['support']}")

def create_summary_table(all_results):
    """Print a fixed-width table with one row of accuracies per model.

    Args:
        all_results: Iterable of result dicts; each needs ``model_name``, the
            four top-level accuracy/balance keys and ``confidence_metrics``
            for levels 1, 2 and 3.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"📊 SUMMARY COMPARISON - REPRESENTATION-BASED CLASSIFICATION")
    print(f"{rule}")

    header_cells = [
        f"{'Model':<25}",
        f"{'Overall':<8}",
        f"{'Slang':<8}",
        f"{'Non-Slang':<10}",
        f"{'Balance':<8}",
        f"{'Conf-1':<8}",
        f"{'Conf-2':<8}",
        f"{'Conf-3':<8}",
    ]
    print(" ".join(header_cells))
    print("-" * 80)

    for entry in all_results:
        # Drop any "org/" prefix and cap the name at 20 characters
        short_name = entry['model_name'].split('/')[-1][:20]
        cells = [
            f"{short_name:<25}",
            f"{entry['overall_accuracy']:<8.4f}",
            f"{entry['slang_accuracy']:<8.4f}",
            f"{entry['nonslang_accuracy']:<10.4f}",
            f"{entry['balance_score']:<8.4f}",
        ]
        cells.extend(
            f"{entry['confidence_metrics'][level]['accuracy']:<8.4f}"
            for level in (1, 2, 3)
        )
        print(" ".join(cells))

def create_detailed_summary(all_results):
    """Print the best model per metric and a per-confidence-level analysis.

    Args:
        all_results: List of result dicts; each needs ``model_name``,
            ``overall_accuracy``, ``slang_accuracy``, ``nonslang_accuracy``,
            ``balance_score`` and ``confidence_metrics`` for levels 1-3.
            An empty list prints a notice instead of raising.
    """
    print(f"\n{'='*80}")
    print(f"🔍 DETAILED ANALYSIS")
    print(f"{'='*80}")

    if not all_results:
        # max() below raises ValueError on an empty sequence
        print("❌ No results to summarize")
        return

    # Find best performing models
    best_overall = max(all_results, key=lambda x: x['overall_accuracy'])
    best_slang = max(all_results, key=lambda x: x['slang_accuracy'])
    best_nonslang = max(all_results, key=lambda x: x['nonslang_accuracy'])
    best_balanced = max(all_results, key=lambda x: x['balance_score'])

    print(f"🏆 Best Overall Accuracy: {best_overall['model_name']} ({best_overall['overall_accuracy']:.4f})")
    print(f"🔥 Best Slang Detection: {best_slang['model_name']} ({best_slang['slang_accuracy']:.4f})")
    print(f"❄️  Best Non-Slang Detection: {best_nonslang['model_name']} ({best_nonslang['nonslang_accuracy']:.4f})")
    print(f"⚖️  Best Balanced Performance: {best_balanced['model_name']} ({best_balanced['balance_score']:.4f})")

    # Confidence analysis
    print(f"\n📊 CONFIDENCE LEVEL ANALYSIS:")
    for conf_level in [1, 2, 3]:
        # Only models that actually had test sentences at this level take part.
        # A model without any carries a placeholder accuracy of 0.0 and must
        # not be averaged in or reported as "best".
        scored = [r for r in all_results
                  if r['confidence_metrics'][conf_level]['total_sentences'] > 0]
        if not scored:
            continue

        accuracies = [r['confidence_metrics'][conf_level]['accuracy'] for r in scored]
        avg_acc = np.mean(accuracies)
        best_conf_model = max(scored,
                              key=lambda r: r['confidence_metrics'][conf_level]['accuracy'])
        print(f"   Confidence {conf_level}: Avg={avg_acc:.4f}, "
              f"Best={best_conf_model['model_name']} ({best_conf_model['confidence_metrics'][conf_level]['accuracy']:.4f})")

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Run the whole pipeline: load data, evaluate every model, print and save a comparison.

    Each model is processed independently; a failure is reported and the loop
    moves on to the next model. If at least one model succeeded, the summary
    tables are printed and one CSV row per model is written to
    ``/content/representation_based_slang_results.csv``.
    """
    print("🚀 Starting Representation-Based Slang Classification Analysis")
    print("=" * 70)

    sentences, true_labels, confidence_levels = load_and_prepare_data()

    # Friendly display name -> checkpoint identifier
    models = {
        "BART-large-MNLI": "facebook/bart-large-mnli",
        "BERT-base-MNLI": "textattack/bert-base-uncased-MNLI",
        "RoBERTa-large-MNLI": "roberta-large-mnli",
        "DeBERTa-v3-large-MNLI": "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
    }

    banner = '=' * 70
    all_results = []

    for friendly_name, checkpoint in models.items():
        try:
            print(f"\n{banner}")
            print(f"🔍 PROCESSING: {friendly_name}")
            print(f"{banner}")

            # Second-to-last hidden layer, then a logistic-regression probe on top
            vectors = extract_representations(checkpoint, sentences, layer_index=-2)
            outcome = evaluate_model_representations(
                checkpoint, vectors, true_labels, confidence_levels
            )
            outcome['model_name'] = friendly_name  # Use friendly name
            all_results.append(outcome)
        except Exception as e:
            print(f"❌ Error evaluating {friendly_name}: {str(e)}")

    if not all_results:
        print("❌ No models were successfully evaluated")
        return

    create_summary_table(all_results)
    create_detailed_summary(all_results)

    print(f"\n💾 Saving detailed results...")

    # Flatten every result dict into one CSV row; insertion order below fixes
    # the column order of the output file.
    rows = []
    for outcome in all_results:
        row = {
            'model': outcome['model_name'],
            'overall_accuracy': outcome['overall_accuracy'],
            'slang_accuracy': outcome['slang_accuracy'],
            'nonslang_accuracy': outcome['nonslang_accuracy'],
            'balance_score': outcome['balance_score'],
        }

        for level in (1, 2, 3):
            level_stats = outcome['confidence_metrics'][level]
            row[f'confidence_{level}_accuracy'] = level_stats['accuracy']
            row[f'confidence_{level}_total'] = level_stats['total_sentences']
            row[f'confidence_{level}_correct'] = level_stats['correct_predictions']

        report = outcome['classification_report']
        for prefix, class_key in (('slang', 'Slang'), ('nonslang', 'Non-Slang')):
            for suffix, metric in (('precision', 'precision'),
                                   ('recall', 'recall'),
                                   ('f1', 'f1-score')):
                row[f'{prefix}_{suffix}'] = report[class_key][metric]

        rows.append(row)

    results_df = pd.DataFrame(rows)
    results_df.to_csv('/content/representation_based_slang_results.csv', index=False)
    print(f"✅ Results saved to '/content/representation_based_slang_results.csv'")

    print(f"\n🎉 Analysis completed successfully!")
    print(f"📊 Evaluated {len(all_results)} models on {len(sentences)} sentences")
    print(f"💡 All models used internal representations + Logistic Regression")
    print(f"🔑 This approach bypasses the problematic final layer and leverages")
    print(f"   the internal slang understanding that TCAV revealed!")

# Run the analysis only when this code is executed as the main program
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m118.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

🔧 Model loaded on: cuda
🔧 Extracting representations from layer -2


Extracting representations:   0%|          | 0/3617 [00:00<?, ?it/s]


🔍 Evaluating: facebook/bart-large-mnli
🔧 Training Logistic Regression on internal representations...

📈 RESULTS FOR: facebook/bart-large-mnli
--------------------------------------------------
🎯 Overall Accuracy: 0.7348
🔥 Slang Accuracy: 0.6147
❄️  Non-Slang Accuracy: 0.8154
⚖️  Balance Score: 0.7150

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.5425 (166/306 sentences)
   Confidence 2: 0.7736 (82/106 sentences)
   Confidence 3: 0.8333 (20/24 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang   530     120
       Slang       168     268

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.759   0.815    0.786     650.0
Slang            0.691   0.615    0.650     436.0
Macro Avg        0.725   0.715    0.718     1086.0
Weighted Avg     0.732   0.735    0.732     1086.0

🔍 PROCESSING: BERT-base-MNLI
🔧 Loading model for representation extraction: textattack

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

🔧 Model loaded on: cuda
🔧 Extracting representations from layer -2


Extracting representations:   0%|          | 0/3617 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]


🔍 Evaluating: textattack/bert-base-uncased-MNLI
🔧 Training Logistic Regression on internal representations...

📈 RESULTS FOR: textattack/bert-base-uncased-MNLI
--------------------------------------------------
🎯 Overall Accuracy: 0.7219
🔥 Slang Accuracy: 0.5899
❄️  Non-Slang Accuracy: 0.8098
⚖️  Balance Score: 0.6998

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.5552 (171/308 sentences)
   Confidence 2: 0.6509 (69/106 sentences)
   Confidence 3: 0.8000 (16/20 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang   528     124
       Slang       178     256

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.748   0.810    0.778     652.0
Slang            0.674   0.590    0.629     434.0
Macro Avg        0.711   0.700    0.703     1086.0
Weighted Avg     0.718   0.722    0.718     1086.0

🔍 PROCESSING: RoBERTa-large-MNLI
🔧 Loading model for representation 

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at roberta-large-mnli and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.

🔧 Model loaded on: cuda
🔧 Extracting representations from layer -2


Extracting representations:   0%|          | 0/3617 [00:00<?, ?it/s]


🔍 Evaluating: roberta-large-mnli
🔧 Training Logistic Regression on internal representations...

📈 RESULTS FOR: roberta-large-mnli
--------------------------------------------------
🎯 Overall Accuracy: 0.6031
🔥 Slang Accuracy: 0.4005
❄️  Non-Slang Accuracy: 0.7319
⚖️  Balance Score: 0.5662

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.3910 (113/289 sentences)
   Confidence 2: 0.4299 (46/107 sentences)
   Confidence 3: 0.3846 (10/26 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang   486     178
       Slang       253     169

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.658   0.732    0.693     664.0
Slang            0.487   0.400    0.440     422.0
Macro Avg        0.572   0.566    0.566     1086.0
Weighted Avg     0.591   0.603    0.594     1086.0

🔍 PROCESSING: DeBERTa-v3-large-MNLI
🔧 Loading model for representation extraction: MoritzLaurer/De

tokenizer_config.json:   0%|          | 0.00/395 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

You are using a model of type deberta-v2 to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/870M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer

🔧 Model loaded on: cuda
🔧 Extracting representations from layer -2


Extracting representations:   0%|          | 0/3617 [00:00<?, ?it/s]

❌ Error evaluating DeBERTa-v3-large-MNLI: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


📊 SUMMARY COMPARISON - REPRESENTATION-BASED CLASSIFICATION
Model                     Overall  Slang    Non-Slang  Balance  Conf-1   Conf-2   Conf-3  
--------------------------------------------------------------------------------
BART-large-MNLI           0.7348   0.6147   0.8154     0.7150   0.5425   0.7736   0.8333  
BERT-base-MNLI            0.7219   0.5899   0.8098     0.6998   0.5552   0.6509   0.8000  
RoBERTa-large-MNLI        0.6031   0.4005   0.7319     0.5662   0.3910   0.4299   0.3846  

🔍 DETAILED ANALYSIS
🏆 Best Overall Accuracy: BART-large-MNLI (0.7348)
🔥 Best Slang Detection: BART-large-MNLI (0.6147)
❄️  Best Non-Slang Detection: BART-large-MNLI (0.

In [None]:
# =============================================================================
# STANDALONE DEBERTA SLANG CLASSIFICATION NOTEBOOK
# =============================================================================
# This notebook evaluates DeBERTa-v3-large-MNLI for slang classification
# using internal representations + Logistic Regression
# =============================================================================

# Install required packages
!pip install -q transformers scikit-learn pandas torch tqdm

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, DebertaV2Model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from tqdm.auto import tqdm
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# DATA LOADING AND PREPARATION
# =============================================================================

def load_and_prepare_data(slang_path="/content/slang_OpenSub_filtered.tsv",
                          nonslang_path="/content/slang_OpenSub_negatives_filtered.tsv"):
    """Load the slang and non-slang TSV files and flatten them into parallel lists.

    Args:
        slang_path: Tab-separated file of slang sentences. Must contain the
            columns ``SENTENCE`` and ``ANNOTATOR_CONFIDENCE``.
        nonslang_path: Tab-separated file of non-slang sentences. Must contain
            the column ``SENTENCE``.

    Returns:
        A tuple ``(sentences, true_labels, confidence_levels)`` of equally long
        lists. Slang rows come first and carry label 1 and their annotator
        confidence as an int; non-slang rows carry label 0 and confidence 0.

    Raises:
        ValueError: If an ``ANNOTATOR_CONFIDENCE`` value cannot be converted to int.
    """
    print("📂 Loading data...")

    def _read_sentence_table(path):
        # dtype=str keeps every column textual, but empty cells still load as NaN.
        frame = pd.read_csv(path, sep="\t", dtype=str)
        missing = frame["SENTENCE"].isna()
        if missing.any():
            # A NaN sentence is a float rather than text, so it cannot be
            # tokenized later; drop such rows here instead of failing downstream.
            print(f"⚠️  Dropping {int(missing.sum())} rows without a sentence from {path}")
            frame = frame.loc[~missing].reset_index(drop=True)
        return frame

    # Load both datasets
    df_slang = _read_sentence_table(slang_path)
    df_nonslang = _read_sentence_table(nonslang_path)

    # Add labels and confidence
    df_slang["label"] = 1  # slang
    df_slang["confidence"] = df_slang["ANNOTATOR_CONFIDENCE"].astype(int)

    df_nonslang["label"] = 0  # non-slang
    df_nonslang["confidence"] = 0  # non-slang doesn't have confidence ratings

    # Combine datasets (slang rows first, then non-slang rows)
    df_all = pd.concat([df_slang, df_nonslang], ignore_index=True)

    # Extract sentences and labels
    sentences = df_all["SENTENCE"].tolist()
    true_labels = df_all["label"].tolist()
    confidence_levels = df_all["confidence"].tolist()

    print(f"✅ Data loaded successfully!")
    print(f"📊 Total sentences: {len(sentences)}")
    print(f"📊 Slang sentences: {sum(true_labels)}")
    print(f"📊 Non-slang sentences: {len(true_labels) - sum(true_labels)}")

    # Print confidence distribution for slang sentences
    slang_confidence = df_slang["confidence"].value_counts().sort_index()
    print(f"📊 Slang confidence distribution:")
    for conf, count in slang_confidence.items():
        print(f"   Confidence {conf}: {count} sentences")

    return sentences, true_labels, confidence_levels

# =============================================================================
# DEBERTA REPRESENTATION EXTRACTION
# =============================================================================

def extract_deberta_representations(sentences, layer_index=-2):
    """Extract one mean-pooled hidden-state vector per sentence from DeBERTa.

    Sentences are encoded in batches of 16 with padding to the longest
    sentence in the batch. Pooling averages only over real tokens (those with
    ``attention_mask == 1``), so a sentence's vector does not depend on how
    much padding its batch neighbours caused.

    Args:
        sentences: Sliceable sequence of sentences (sliced into batches).
        layer_index: Index into the model's ``hidden_states`` tuple
            (default -2, the second-to-last entry).

    Returns:
        ``np.array`` with one pooled vector per sentence, in input order.
    """
    print(f"🔧 Loading DeBERTa model...")

    model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = DebertaV2Model.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()

    print(f"🔧 Model loaded on: {device}")
    print(f"🔧 Extracting representations from layer {layer_index}")

    representations = []
    batch_size = 16  # Adjust based on GPU memory

    for i in tqdm(range(0, len(sentences), batch_size), desc="Extracting representations"):
        batch_sentences = sentences[i:i+batch_size]

        # Tokenize
        inputs = tokenizer(
            batch_sentences,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
            hidden_states = outputs.hidden_states[layer_index]

            # Masked mean pooling: zero out padding positions and divide by the
            # number of real tokens instead of the padded sequence length.
            token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            summed = (hidden_states * token_mask).sum(dim=1)
            token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            batch_representations = (summed / token_counts).cpu().numpy()
            representations.extend(batch_representations)

    return np.array(representations)

# =============================================================================
# EVALUATION FUNCTIONS
# =============================================================================

def evaluate_deberta_classification(representations, true_labels, confidence_levels,
                                    random_state=42):
    """Fit a logistic-regression probe on the embeddings and score it.

    Rows are shuffled and split 70/30 into train and test parts. A
    ``LogisticRegression`` is fit on the train part and evaluated on the
    test part.

    Args:
        representations: 2-D array of sentence vectors, one row per sentence.
        true_labels: Sequence of 0 (non-slang) / 1 (slang) labels aligned
            with the rows of ``representations``.
        confidence_levels: Sequence of confidence values aligned with the
            rows; accuracy is reported separately for slang rows with
            confidence 1, 2 and 3.
        random_state: Seed for the train/test shuffle, so repeated runs use
            the same split. Pass ``None`` for a fresh split on every call.

    Returns:
        Dict with keys ``model_name``, ``overall_accuracy``,
        ``slang_accuracy``, ``nonslang_accuracy``, ``confidence_metrics``
        (per-level ``accuracy`` / ``total_sentences`` /
        ``correct_predictions``), ``confusion_matrix`` (always 2x2, rows and
        columns ordered non-slang, slang), ``classification_report`` (dict
        form), ``predictions`` and ``balance_score`` (mean of the two
        per-class accuracies).
    """
    print(f"\n{'='*60}")
    print(f"🔍 Evaluating DeBERTa-v3-large-MNLI")
    print(f"{'='*60}")

    labels = np.asarray(true_labels)
    confidences = np.asarray(confidence_levels)

    # Split data for training classifier. A dedicated seeded generator keeps
    # the split reproducible, matching the seeded classifier below.
    n_samples = len(representations)
    n_train = int(0.7 * n_samples)
    indices = np.random.default_rng(random_state).permutation(n_samples)

    train_indices = indices[:n_train]
    test_indices = indices[n_train:]

    X_train = representations[train_indices]
    y_train = labels[train_indices]
    X_test = representations[test_indices]
    y_test = labels[test_indices]
    test_confidence = confidences[test_indices]

    # Train logistic regression
    print("🔧 Training Logistic Regression on internal representations...")
    clf = LogisticRegression(random_state=42, max_iter=1000)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Calculate overall metrics
    overall_accuracy = accuracy_score(y_test, predictions)

    # Calculate per-class metrics (accuracy restricted to one true class)
    slang_mask = y_test == 1
    nonslang_mask = y_test == 0

    slang_accuracy = accuracy_score(y_test[slang_mask], predictions[slang_mask]) if slang_mask.any() else 0
    nonslang_accuracy = accuracy_score(y_test[nonslang_mask], predictions[nonslang_mask]) if nonslang_mask.any() else 0

    # Calculate confidence-based metrics for slang sentences
    confidence_metrics = {}
    for conf_level in [1, 2, 3]:
        conf_mask = slang_mask & (test_confidence == conf_level)
        if conf_mask.any():
            hits = predictions[conf_mask] == y_test[conf_mask]
            confidence_metrics[conf_level] = {
                'accuracy': accuracy_score(y_test[conf_mask], predictions[conf_mask]),
                # Plain ints rather than NumPy scalars, for clean printing
                # and serialization downstream.
                'total_sentences': int(conf_mask.sum()),
                'correct_predictions': int(hits.sum())
            }
        else:
            confidence_metrics[conf_level] = {
                'accuracy': 0.0,
                'total_sentences': 0,
                'correct_predictions': 0
            }

    # Generate confusion matrix and classification report. The label set is
    # pinned to [0, 1] so the matrix stays 2x2 and the target names stay
    # aligned even when a class is missing from the test split.
    class_labels = [0, 1]
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    class_report = classification_report(
        y_test,
        predictions,
        labels=class_labels,
        target_names=['Non-Slang', 'Slang'],
        output_dict=True,
        zero_division=0
    )

    results = {
        'model_name': 'DeBERTa-v3-large-MNLI',
        'overall_accuracy': overall_accuracy,
        'slang_accuracy': slang_accuracy,
        'nonslang_accuracy': nonslang_accuracy,
        'confidence_metrics': confidence_metrics,
        'confusion_matrix': cm,
        'classification_report': class_report,
        'predictions': predictions,
        'balance_score': (slang_accuracy + nonslang_accuracy) / 2
    }

    return results

def print_results(results):
    """Print a human-readable summary of an evaluation results dict.

    Args:
        results: Dict providing ``model_name``, ``overall_accuracy``,
            ``slang_accuracy``, ``nonslang_accuracy``, ``balance_score``,
            ``confidence_metrics`` (keyed by 1, 2, 3, each with
            ``accuracy``, ``total_sentences``, ``correct_predictions``),
            ``confusion_matrix`` (indexable as ``cm[row, col]`` with integer
            entries) and ``classification_report`` (dict with
            ``Non-Slang``, ``Slang``, ``macro avg`` and ``weighted avg``
            rows).
    """
    print(f"\n📈 RESULTS FOR: {results['model_name']}")
    print("-" * 50)

    print(f"🎯 Overall Accuracy: {results['overall_accuracy']:.4f}")
    print(f"🔥 Slang Accuracy: {results['slang_accuracy']:.4f}")
    print(f"❄️  Non-Slang Accuracy: {results['nonslang_accuracy']:.4f}")
    print(f"⚖️  Balance Score: {results['balance_score']:.4f}")

    print(f"\n📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):")
    for conf_level in [1, 2, 3]:
        metrics = results['confidence_metrics'][conf_level]
        if metrics['total_sentences'] > 0:
            print(f"   Confidence {conf_level}: {metrics['accuracy']:.4f} "
                  f"({metrics['correct_predictions']}/{metrics['total_sentences']} sentences)")
        else:
            print(f"   Confidence {conf_level}: No sentences found")

    print(f"\n📋 CONFUSION MATRIX:")
    cm = results['confusion_matrix']
    print(f"                Predicted")
    print(f"              Non-Slang  Slang")
    print(f"Actual Non-Slang  {cm[0,0]:4d}    {cm[0,1]:4d}")
    print(f"       Slang      {cm[1,0]:4d}    {cm[1,1]:4d}")

    report = results['classification_report']
    print(f"\n📋 DETAILED CLASSIFICATION REPORT:")
    print(f"              Precision  Recall  F1-Score  Support")
    # (display label, key in the report dict); labels are left-padded to a
    # fixed width so the numeric columns line up.
    report_rows = (
        ("Non-Slang", 'Non-Slang'),
        ("Slang", 'Slang'),
        ("Macro Avg", 'macro avg'),
        ("Weighted Avg", 'weighted avg'),
    )
    for row_label, report_key in report_rows:
        row = report[report_key]
        # Support may arrive as a float (e.g. 668.0); show it as a whole
        # number since it is a sample count.
        print(f"{row_label:<17}{row['precision']:.3f}   {row['recall']:.3f}    "
              f"{row['f1-score']:.3f}     {row['support']:.0f}")

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Run the whole pipeline: load, embed, classify, report, save.

    The headline metrics are written as a single-row CSV to
    ``/content/deberta_slang_results.csv``.

    Returns:
        The results dict from ``evaluate_deberta_classification``, or
        ``None`` if any step raised (the traceback is printed instead of
        propagating).
    """
    print("🚀 Starting DeBERTa Slang Classification Analysis")
    print("=" * 70)

    try:
        # Data -> embeddings -> probe evaluation -> console report
        sentences, true_labels, confidence_levels = load_and_prepare_data()

        representations = extract_deberta_representations(sentences, layer_index=-2)
        print(f"✅ Extracted representations shape: {representations.shape}")

        results = evaluate_deberta_classification(representations, true_labels, confidence_levels)
        print_results(results)

        # Flatten the nested results into one CSV row; insertion order
        # below determines the column order.
        print(f"\n💾 Saving results...")
        summary_row = {
            'model': results['model_name'],
            'overall_accuracy': results['overall_accuracy'],
            'slang_accuracy': results['slang_accuracy'],
            'nonslang_accuracy': results['nonslang_accuracy'],
            'balance_score': results['balance_score'],
        }

        per_level_fields = (
            ('accuracy', 'accuracy'),
            ('total', 'total_sentences'),
            ('correct', 'correct_predictions'),
        )
        for level in (1, 2, 3):
            level_metrics = results['confidence_metrics'][level]
            for suffix, metric_key in per_level_fields:
                summary_row[f'confidence_{level}_{suffix}'] = level_metrics[metric_key]

        per_class_fields = (
            ('precision', 'precision'),
            ('recall', 'recall'),
            ('f1', 'f1-score'),
        )
        for prefix, class_name in (('slang', 'Slang'), ('nonslang', 'Non-Slang')):
            class_row = results['classification_report'][class_name]
            for suffix, metric_key in per_class_fields:
                summary_row[f'{prefix}_{suffix}'] = class_row[metric_key]

        results_df = pd.DataFrame([summary_row])
        results_df.to_csv('/content/deberta_slang_results.csv', index=False)
        print(f"✅ Results saved to '/content/deberta_slang_results.csv'")

        print(f"\n🎉 DeBERTa analysis completed successfully!")
        print(f"📊 Evaluated on {len(sentences)} sentences")
        print(f"💡 Used internal representations + Logistic Regression approach")

        return results

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None
        import traceback
        print(f"❌ Error in analysis: {e!s}")
        traceback.print_exc()
        return None

# Entry point: run the pipeline and echo the headline scores once more
if __name__ == "__main__":
    results = main()

    if not results:
        # main() returns None when any pipeline step raised
        print(f"❌ Analysis failed - check error messages above")
    else:
        print(f"\n🏆 FINAL SUMMARY:")
        summary_fields = (
            ("📊 Overall Accuracy", 'overall_accuracy'),
            ("🔥 Slang Detection", 'slang_accuracy'),
            ("❄️  Non-Slang Detection", 'nonslang_accuracy'),
            ("⚖️  Balanced Performance", 'balance_score'),
        )
        print("\n".join(
            f"{caption}: {results[key]:.4f}" for caption, key in summary_fields
        ))

# Usage banner, printed every time this cell/module is executed
print("\n".join((
    "🔥 DeBERTa Slang Classification Notebook Ready!",
    "📋 Make sure to upload your data files:",
    "   - slang_OpenSub_filtered.tsv",
    "   - slang_OpenSub_negatives_filtered.tsv",
    "🚀 Run the cells above to start the analysis!",
)))

🚀 Starting DeBERTa Slang Classification Analysis
📂 Loading data...
✅ Data loaded successfully!
📊 Total sentences: 3617
📊 Slang sentences: 1424
📊 Non-slang sentences: 2193
📊 Slang confidence distribution:
   Confidence 1: 989 sentences
   Confidence 2: 361 sentences
   Confidence 3: 74 sentences
🔧 Loading DeBERTa model...
🔧 Model loaded on: cuda
🔧 Extracting representations from layer -2


Extracting representations:   0%|          | 0/227 [00:00<?, ?it/s]

✅ Extracted representations shape: (3617, 1024)

🔍 Evaluating DeBERTa-v3-large-MNLI
🔧 Training Logistic Regression on internal representations...

📈 RESULTS FOR: DeBERTa-v3-large-MNLI
--------------------------------------------------
🎯 Overall Accuracy: 0.7366
🔥 Slang Accuracy: 0.6124
❄️  Non-Slang Accuracy: 0.8144
⚖️  Balance Score: 0.7134

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.5661 (167/295 sentences)
   Confidence 2: 0.7404 (77/104 sentences)
   Confidence 3: 0.6316 (12/19 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang   544     124
       Slang       162     256

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.771   0.814    0.792     668.0
Slang            0.674   0.612    0.642     418.0
Macro Avg        0.722   0.713    0.717     1086.0
Weighted Avg     0.733   0.737    0.734     1086.0

💾 Saving results...
✅ Results saved to '/cont